Merge lp:~stub/launchpad/librarian-gc into lp:launchpad
- librarian-gc
- Merge into devel
Status: Merged
Approved by: Aaron Bentley
Approved revision: no longer in the source branch.
Merged at revision: not available
Proposed branch: lp:~stub/launchpad/librarian-gc
Merge into: lp:launchpad
Diff against target: 600 lines
To merge this branch: bzr merge lp:~stub/launchpad/librarian-gc
Related bugs:

Reviewer | Review Type | Date Requested | Status
---|---|---|---
Aaron Bentley (community) | | | Approve
Canonical Launchpad Engineering | | | Pending

Review via email: mp+8451@code.launchpad.net
Commit message
Description of the change
Stuart Bishop (stub) wrote:
Aaron Bentley (abentley) wrote:
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
review needs-fixing
Stuart Bishop wrote:
> Stuart Bishop has proposed merging lp:~stub/launchpad/librarian-gc into lp:~launchpad-pqm/launchpad/devel.
>
> Requested reviews:
> Canonical Launchpad Engineering (launchpad)
>
> Address Bug #290837.
>
> The Librarian garbage collector currently drives the system into swap because it is doing its set logic in RAM. We should stop doing this.
>
> === modified file 'lib/canonical/librarian/ftests/test_gc.py'
> --- lib/canonical/librarian/ftests/test_gc.py
> +++ lib/canonical/librarian/ftests/test_gc.py
> @@ -1,4 +1,4 @@
> -# Copyright 2004-2007 Canonical Ltd. All rights reserved.
> +# Copyright 2004-2009 Canonical Ltd. All rights reserved.
> """Librarian garbage collection tests"""
>
> __metaclass__ = type
> @@ -119,6 +119,8 @@
> past = past.replace(
> f1.last_accessed = past
> f2.last_accessed = past
> + f1.date_created = past
> + f2.date_created = past
>
> del f1, f2
>
> @@ -159,7 +161,7 @@
> # This should have committed
> self.ztm.begin()
>
> - # Confirm that the LibaryFileContents are still there
> + # Confirm that the LibaryFileContents are still there.
> c1 = LibraryFileContent.get(c1_id)
> c2 = LibraryFileContent.get(c2_id)
>
>
> === modified file 'lib/canonical/librarian/librariangc.py'
> --- lib/canonical/librarian/librariangc.py
> +++ lib/canonical/librarian/librariangc.py
> @@ -1,4 +1,4 @@
> -# Copyright 2004-2008 Canonical Ltd. All rights reserved.
> +# Copyright 2004-2009 Canonical Ltd. All rights reserved.
> """Librarian garbage collection routines"""
>
> __metaclass__ = type
> @@ -8,13 +8,15 @@
> from time import time
> import os
>
> +from zope.interface import implements
> +
> from canonical.config import config
> +from canonical.
> +from canonical.
> from canonical.
> from canonical.
> from canonical.
>
> -BATCH_SIZE = 1
> -
> log = None
> debug = False
>
> @@ -176,8 +178,11 @@
> con.commit()
>
>
> -def delete_unreferenced_aliases(con):
> - """Delete unreferenced LibraryFileAliases and their LibraryFileContent
> +class UnreferencedLibraryFileContentPruner:
> + """Delete unreferenced LibraryFileAliases.
> +
> + The LibraryFileContent records are left untouched for the code that
> + knows how to delete them and the corresponding files on disk.
>
> This is the second step in a full garbage collection sweep. We determine
> which LibraryFileContent entries are not being referenced by other objects
> @@ -190,92 +195,124 @@
> must be unreferenced for them to be deleted - a single reference will keep
> the whole set alive.
> """
> - log.info("Deleting unreferenced LibraryFileAliases")
> -
> - # Generate ...
Stuart Bishop (stub) wrote:
On Mon, Jul 13, 2009 at 10:04 PM, Aaron Bentley<email address hidden> wrote:
> review needs-fixing
>
> Stuart Bishop wrote:
>> Stuart Bishop has proposed merging lp:~stub/launchpad/librarian-gc into lp:~launchpad-pqm/launchpad/devel.
>>
>> Requested reviews:
>> Canonical Launchpad Engineering (launchpad)
>>
>> Address Bug #290837.
>>
>> The Librarian garbage collector currently drives the system into swap because it is doing its set logic in RAM. We should stop doing this.
>> === modified file 'lib/canonical/librarian/librariangc.py'
>> + implements(ITunableLoop)
>> +
>> + index = 1 # Index of current batch of rows to remove.
>> +
>> + total_deleted = 0 # Running total
>> +
>> + con = None # Database connection to use
>
> "index", "total_deleted" and "con" are instance variables. It would not
> make sense to check the value of
> UnreferencedLibraryFileContentPruner.index, for example; you would
> only check it on an instance of UnreferencedLibraryFileContentPruner.
> Therefore, they should be initialized in
> UnreferencedLibraryFileContentPruner.__init__.
Ok. These are all being initialized in __init__ and the class attributes removed here and in the other two tunable loops.
>> + # Start with a list of all unexpired and recently accessed content
>> + # - we don't remove them even if they are unlinked.
>> + # We currently don't remove stuff until it has been expired for
>> + # more than one week, but we will change this if disk space becomes
>> + # short and it actually will make a noticeable difference.
>> + # Note that ReferencedLibraryFileContent will contain duplicates -
>> + # duplicates are the exception so we are better off filtering them
>
> Perhaps "duplicates are unusual", "duplicates are exceptional", or else
> "duplicates are the exception, not the rule".
Fixed. "duplicates are unusual".
>> + # once at the end rather than when we load the data into the temporary
>> + # file.
>> + cur.execute("DROP TABLE IF EXISTS ReferencedLibraryFileContent")
>> + cur.execute("""
>> + SELECT LibraryFileAlias.content
>> + INTO TEMPORARY TABLE ReferencedLibraryFileContent
>> FROM LibraryFileAlias
>> - WHERE content in (%(in_content_ids)s)
>> - AND (
>> - expires + '1 week'::interval
>> - > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
>> - OR last_accessed + '1 week'::interval
>> - > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
>> - )
>> - """ % vars())
>> - assert cur.fetchone()[0] == 0, "Logic error - sanity check failed"
>> + WHERE
>> + expires >
>> + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
>> + OR last_accessed >
>> + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
>> + OR date_created >
>> + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
>> + """...
1 | === modified file 'lib/canonical/librarian/librariangc.py' |
2 | --- lib/canonical/librarian/librariangc.py 2009-07-09 12:40:21 +0000 |
3 | +++ lib/canonical/librarian/librariangc.py 2009-07-14 08:09:48 +0000 |
4 | @@ -4,6 +4,7 @@ |
5 | __metaclass__ = type |
6 | |
7 | from datetime import datetime, timedelta |
8 | +import errno |
9 | import sys |
10 | from time import time |
11 | import os |
12 | @@ -11,6 +12,7 @@ |
13 | from zope.interface import implements |
14 | |
15 | from canonical.config import config |
16 | +from canonical.database.postgresql import quoteIdentifier |
17 | from canonical.launchpad.interfaces.looptuner import ITunableLoop |
18 | from canonical.launchpad.utilities.looptuner import DBLoopTuner |
19 | from canonical.librarian.storage import _relFileLocation as relative_file_path |
20 | @@ -197,28 +199,24 @@ |
21 | """ |
22 | implements(ITunableLoop) |
23 | |
24 | - index = 1 # Index of current batch of rows to remove. |
25 | - |
26 | - total_deleted = 0 # Running total |
27 | - |
28 | - con = None # Database connection to use |
29 | - |
30 | def __init__(self, con): |
31 | - self.con = con |
32 | + self.con = con # Database connection to use |
33 | + self.total_deleted = 0 # Running total |
34 | + self.index = 1 |
35 | |
36 | log.info("Deleting unreferenced LibraryFileAliases") |
37 | |
38 | cur = con.cursor() |
39 | |
40 | - # Start with a list of all unexpired and recently accessed content |
41 | - # - we don't remove them even if they are unlinked. |
42 | - # We currently don't remove stuff until it has been expired for |
43 | - # more than one week, but we will change this if disk space becomes |
44 | - # short and it actually will make a noticeable difference. |
45 | - # Note that ReferencedLibraryFileContent will contain duplicates - |
46 | - # duplicates are the exception so we are better off filtering them |
47 | - # once at the end rather than when we load the data into the temporary |
48 | - # file. |
49 | + # Start with a list of all unexpired and recently accessed |
50 | + # content - we don't remove them even if they are unlinked. We |
51 | + # currently don't remove stuff until it has been expired for |
52 | + # more than one week, but we will change this if disk space |
53 | + # becomes short and it actually will make a noticeable |
54 | + # difference. Note that ReferencedLibraryFileContent will |
55 | + # contain duplicates - duplicates are unusual so we are better |
56 | + # off filtering them once at the end rather than when we load |
57 | + # the data into the temporary file. |
58 | cur.execute("DROP TABLE IF EXISTS ReferencedLibraryFileContent") |
59 | cur.execute(""" |
60 | SELECT LibraryFileAlias.content |
61 | @@ -255,7 +253,9 @@ |
62 | SELECT LibraryFileAlias.content |
63 | FROM LibraryFileAlias, %(table)s |
64 | WHERE LibraryFileAlias.id = %(table)s.%(column)s |
65 | - """ % {'table': table, 'column': column}) |
66 | + """ % { |
67 | + 'table': quoteIdentifier(table), |
68 | + 'column': quoteIdentifier(column)}) |
69 | con.commit() |
70 | |
71 | log.debug("Calculating expired unreferenced LibraryFileContent set.") |
72 | @@ -321,12 +321,11 @@ |
73 | what their expires flag says. |
74 | """ |
75 | implements(ITunableLoop) |
76 | - index = 0 |
77 | - con = None |
78 | - total_deleted = 0 |
79 | |
80 | def __init__(self, con): |
81 | self.con = con |
82 | + self.index = 1 |
83 | + self.total_deleted = 0 |
84 | cur = con.cursor() |
85 | cur.execute("DROP TABLE IF EXISTS UnreferencedLibraryFileContent") |
86 | cur.execute(""" |
87 | @@ -365,10 +364,10 @@ |
88 | DELETE FROM LibraryFileContent |
89 | USING ( |
90 | SELECT content FROM UnreferencedLibraryFileContent |
91 | - WHERE id BETWEEN %d AND %d) AS UnreferencedLibraryFileContent |
92 | + WHERE id BETWEEN %s AND %s) AS UnreferencedLibraryFileContent |
93 | WHERE |
94 | LibraryFileContent.id = UnreferencedLibraryFileContent.content |
95 | - """ % (self.index, self.index + chunksize - 1)) |
96 | + """, (self.index, self.index + chunksize - 1)) |
97 | rows_deleted = cur.rowcount |
98 | self.total_deleted += rows_deleted |
99 | self.con.commit() |
100 | @@ -378,19 +377,23 @@ |
101 | # on disk but not in the DB. |
102 | cur.execute(""" |
103 | SELECT content FROM UnreferencedLibraryFileContent |
104 | - WHERE id BETWEEN %d AND %d |
105 | - """ % (self.index, self.index + chunksize - 1)) |
106 | + WHERE id BETWEEN %s AND %s |
107 | + """, (self.index, self.index + chunksize - 1)) |
108 | for content_id in (row[0] for row in cur.fetchall()): |
109 | # Remove the file from disk, if it hasn't already been |
110 | path = get_file_path(content_id) |
111 | - if os.path.exists(path): |
112 | - log.debug("Deleting %s", path) |
113 | + try: |
114 | os.unlink(path) |
115 | - elif config.librarian_server.upstream_host is None: |
116 | - # It is normal to have files in the database that are |
117 | - # not on disk if the Librarian has an upstream |
118 | - # Librarian, such as on staging. |
119 | - log.info("%s already deleted", path) |
120 | + log.debug("Deleted %s", path) |
121 | + except OSError, e: |
122 | + if e.errno != errno.ENOENT: |
123 | + raise |
124 | + if config.librarian_server.upstream_host is None: |
125 | + # It is normal to have files in the database that |
126 | + # are not on disk if the Librarian has an upstream |
127 | + # Librarian, such as on staging. Don't annoy the |
128 | + # operator with noise in this case. |
129 | + log.info("%s already deleted", path) |
130 | |
131 | self.index += chunksize |
132 | |
133 | @@ -409,14 +412,10 @@ |
134 | """ |
135 | implements(ITunableLoop) |
136 | |
137 | - index = 1 |
138 | - |
139 | - con = None # Database connection to use |
140 | - |
141 | - total_flagged = 0 |
142 | - |
143 | def __init__(self, con): |
144 | self.con = con |
145 | + self.index = 1 |
146 | + self.total_flagged = 0 |
147 | cur = con.cursor() |
148 | |
149 | log.debug("Creating set of expired LibraryFileContent.") |
150 | @@ -514,33 +513,44 @@ |
151 | and content_id > next_wanted_content_id): |
152 | next_wanted_content_id = get_next_wanted_content_id() |
153 | |
154 | - if (next_wanted_content_id is not None |
155 | - and next_wanted_content_id == content_id): |
156 | - file_wanted = True |
157 | - else: |
158 | - file_wanted = False |
159 | + file_wanted = ( |
160 | + next_wanted_content_id is not None |
161 | + and next_wanted_content_id == content_id) |
162 | |
163 | path = get_file_path(content_id) |
164 | - path_exists = os.path.exists(path) |
165 | |
166 | - if file_wanted and not path_exists: |
167 | - if config.librarian_server.upstream_host is None: |
168 | + if file_wanted: |
169 | + if (config.librarian_server.upstream_host is None |
170 | + and not os.path.exists(path)): |
171 | # It is normal to have files in the database that are |
172 | # not on disk if the Librarian has an upstream |
173 | - # Librarian, such as on staging. |
174 | + # Librarian, such as on staging. Don't spam in this |
175 | + # case. |
176 | log.error( |
177 | "LibraryFileContent %d exists in the db but not at %s" |
178 | % (content_id, path)) |
179 | - elif path_exists and not file_wanted: |
180 | - one_day = 24 * 60 * 60 |
181 | - if time() - os.path.getctime(path) < one_day: |
182 | - log.debug( |
183 | - "File %d not removed - created too recently" % content_id) |
184 | - else: |
185 | - # File uploaded a while ago but no longer wanted - remove it |
186 | - log.debug("Deleting %s" % path) |
187 | - os.unlink(path) |
188 | - count += 1 |
189 | + |
190 | + else: |
191 | + try: |
192 | + one_day = 24 * 60 * 60 |
193 | + if time() - os.path.getctime(path) < one_day: |
194 | + log.debug( |
195 | + "File %d not removed - created too recently" |
196 | + % content_id) |
197 | + else: |
198 | + # File uploaded a while ago but no longer wanted. |
199 | + os.unlink(path) |
200 | + log.debug("Deleted %s" % path) |
201 | + count += 1 |
202 | + except OSError, e: |
203 | + if e.errno != errno.ENOENT: |
204 | + raise |
205 | + if config.librarian_server.upstream_host is None: |
206 | + # It is normal to have files in the database that |
207 | + # are not on disk if the Librarian has an upstream |
208 | + # Librarian, such as on staging. Don't annoy the |
209 | + # operator with noise in this case. |
210 | + log.info("%s already deleted", path) |
211 | |
212 | log.info( |
213 | "Deleted %d files from disk that where no longer referenced " |
Aaron Bentley (abentley) wrote:
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
status approved
Please add a comment to the global log variable, as noted below. The
rest is chatter.
Stuart Bishop wrote:
>>> + log.debug("Getting references from %s.%s." % (table, column))
>>> + cur.execute("""
>>> + INSERT INTO ReferencedLibraryFileContent
>>> + SELECT LibraryFileAlias.content
>>> + FROM LibraryFileAlias, %(table)s
>>> + WHERE LibraryFileAlias.id = %(table)s.%(column)s
>>> + """ % {'table': table, 'column': column})
>> Wouldn't it be better practise to use SQL substitution here, like
>> ...WHERE LibraryFileAlias.id = ?.?", table, column)
>
> We can't use bound variables here because SQL identifiers use different quoting rules to SQL strings (" vs ').
>
>> I doubt any of our table names are SQL keywords, but some of our columns
>> might be named "from", for example.
>
> Yes - it is fragile. I've fixed this by doing the quoting myself using canonical.database.postgresql.quoteIdentifier.
Cool.
>> Is it actually performant to generate a table of *referenced*
>> LibraryFileContent? I would have thought you'd want to generate a table
>> of unreferenced content directly.
>
> I don't really know which approach would perform better. I don't think it is worth benchmarking unless performance turns out to be an issue. I suspect this approach is faster, as if we started with a table containing all LFC and deleted rows when we find references, we end up working with a table that is mostly holes.
Oh, I meant that it seemed like putting it all into one query that only
returned the rows we wanted to play with would allow the DB engine to do
less work, ideally scaling with the number of rows returned instead of
the total number of rows. As you say, if it's not a performance issue,
let's not worry about it.
>>> + cur.execute("DROP TABLE ReferencedLibraryFileContent")
>>> + cur.execute(
>>> + "SELECT COALESCE(max(id),0) FROM UnreferencedLibraryFileContent")
>>> + self.max_id = cur.fetchone()[0]
>>> + log.debug("%d unferenced LibraryFileContent to remove." % self.max_id)
>>> con.commit()
>> Is there any risk that some LibraryFileContent might become referenced
>> after you commit?
>
> There is a risk. I mitigate it by not deleting anything younger than a day - if it hasn't been linked by then it probably isn't going to be linked. I could try catching the exception, aborting the database transaction and possibly retry the batch without the failed entry but I doubt it will ever actually happen and I'd rather not write the tests for this extra complexity :) At the moment, if this condition is actually triggered the foreign key constraints will cause an exception to be raised. I vote we worry about that if it happens more than once.
I guess I'm really wondering why not do it all in one transaction to
avoid that race?
>>> + def __call__(self, chunksize):
>>> + chunksize = int(chunksize)
>> It seems odd that this might not be an int.
>
> The loop tuner is probably too generic. You actually get a float in case your chunksize represe...
Preview Diff
1 | === modified file 'lib/canonical/librarian/ftests/test_gc.py' |
2 | --- lib/canonical/librarian/ftests/test_gc.py 2009-01-06 03:40:57 +0000 |
3 | +++ lib/canonical/librarian/ftests/test_gc.py 2009-07-16 02:56:16 +0000 |
4 | @@ -1,4 +1,4 @@ |
5 | -# Copyright 2004-2007 Canonical Ltd. All rights reserved. |
6 | +# Copyright 2004-2009 Canonical Ltd. All rights reserved. |
7 | """Librarian garbage collection tests""" |
8 | |
9 | __metaclass__ = type |
10 | @@ -119,6 +119,8 @@ |
11 | past = past.replace(tzinfo=utc) |
12 | f1.last_accessed = past |
13 | f2.last_accessed = past |
14 | + f1.date_created = past |
15 | + f2.date_created = past |
16 | |
17 | del f1, f2 |
18 | |
19 | @@ -159,7 +161,7 @@ |
20 | # This should have committed |
21 | self.ztm.begin() |
22 | |
23 | - # Confirm that the LibaryFileContents are still there |
24 | + # Confirm that the LibaryFileContents are still there. |
25 | c1 = LibraryFileContent.get(c1_id) |
26 | c2 = LibraryFileContent.get(c2_id) |
27 | |
28 | |
29 | === modified file 'lib/canonical/librarian/librariangc.py' |
30 | --- lib/canonical/librarian/librariangc.py 2009-01-20 08:15:41 +0000 |
31 | +++ lib/canonical/librarian/librariangc.py 2009-07-16 02:56:16 +0000 |
32 | @@ -1,20 +1,24 @@ |
33 | -# Copyright 2004-2008 Canonical Ltd. All rights reserved. |
34 | +# Copyright 2004-2009 Canonical Ltd. All rights reserved. |
35 | """Librarian garbage collection routines""" |
36 | |
37 | __metaclass__ = type |
38 | |
39 | from datetime import datetime, timedelta |
40 | +import errno |
41 | import sys |
42 | from time import time |
43 | import os |
44 | |
45 | +from zope.interface import implements |
46 | + |
47 | from canonical.config import config |
48 | +from canonical.database.postgresql import quoteIdentifier |
49 | +from canonical.launchpad.interfaces.looptuner import ITunableLoop |
50 | +from canonical.launchpad.utilities.looptuner import DBLoopTuner |
51 | from canonical.librarian.storage import _relFileLocation as relative_file_path |
52 | from canonical.librarian.storage import _sameFile |
53 | from canonical.database.postgresql import listReferences |
54 | |
55 | -BATCH_SIZE = 1 |
56 | - |
57 | log = None |
58 | debug = False |
59 | |
60 | @@ -176,8 +180,11 @@ |
61 | con.commit() |
62 | |
63 | |
64 | -def delete_unreferenced_aliases(con): |
65 | - """Delete unreferenced LibraryFileAliases and their LibraryFileContent |
66 | +class UnreferencedLibraryFileContentPruner: |
67 | + """Delete unreferenced LibraryFileAliases. |
68 | + |
69 | + The LibraryFileContent records are left untouched for the code that |
70 | + knows how to delete them and the corresponding files on disk. |
71 | |
72 | This is the second step in a full garbage collection sweep. We determine |
73 | which LibraryFileContent entries are not being referenced by other objects |
74 | @@ -190,92 +197,122 @@ |
75 | must be unreferenced for them to be deleted - a single reference will keep |
76 | the whole set alive. |
77 | """ |
78 | - log.info("Deleting unreferenced LibraryFileAliases") |
79 | - |
80 | - # Generate a set of all our LibraryFileContent ids, except for ones |
81 | - # with expiry dates not yet reached (these lurk in the database until |
82 | - # expired) and those that have been accessed in the last week (currently |
83 | - # in use, so leave them lurking a while longer) |
84 | - cur = con.cursor() |
85 | - cur.execute(""" |
86 | - SELECT c.id |
87 | - FROM LibraryFileContent AS c, LibraryFileAlias AS a |
88 | - WHERE c.id = a.content |
89 | - GROUP BY c.id |
90 | - HAVING (max(expires) IS NULL OR max(expires) |
91 | - < CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '1 week'::interval |
92 | - ) |
93 | - AND (max(last_accessed) IS NULL OR max(last_accessed) |
94 | - < CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '1 week'::interval |
95 | - ) |
96 | - """) |
97 | - content_ids = set(row[0] for row in cur.fetchall()) |
98 | - log.info( |
99 | - "Found %d LibraryFileContent entries possibly unreferenced", |
100 | - len(content_ids) |
101 | - ) |
102 | - |
103 | - # Determine what columns link to LibraryFileAlias |
104 | - # references = [(table, column), ...] |
105 | - references = [ |
106 | - tuple(ref[:2]) |
107 | - for ref in listReferences(cur, 'libraryfilealias', 'id') |
108 | - if ref[0] != 'libraryfiledownloadcount' |
109 | - ] |
110 | - assert len(references) > 10, 'Database introspection returned nonsense' |
111 | - log.info("Found %d columns referencing LibraryFileAlias", len(references)) |
112 | - |
113 | - # Remove all referenced LibraryFileContent ids from content_ids |
114 | - for table, column in references: |
115 | - cur.execute(""" |
116 | - SELECT DISTINCT LibraryFileContent.id |
117 | - FROM LibraryFileContent, LibraryFileAlias, %(table)s |
118 | - WHERE LibraryFileContent.id = LibraryFileAlias.content |
119 | - AND LibraryFileAlias.id = %(table)s.%(column)s |
120 | - """ % vars()) |
121 | - referenced_ids = set(row[0] for row in cur.fetchall()) |
122 | - log.info( |
123 | - "Found %d distinct LibraryFileAlias references in %s(%s)", |
124 | - len(referenced_ids), table, column |
125 | - ) |
126 | - content_ids.difference_update(referenced_ids) |
127 | - log.debug( |
128 | - "Now only %d LibraryFileContents possibly unreferenced", |
129 | - len(content_ids) |
130 | - ) |
131 | - |
132 | - # Delete unreferenced LibraryFileAliases. Note that this will raise a |
133 | - # database exception if we screwed up and attempt to delete an alias that |
134 | - # is still referenced. |
135 | - content_ids = list(content_ids) |
136 | - for i in range(0, len(content_ids), BATCH_SIZE): |
137 | - in_content_ids = ','.join( |
138 | - (str(content_id) for content_id in content_ids[i:i+BATCH_SIZE])) |
139 | - # First a sanity check to ensure we aren't removing anything we |
140 | - # shouldn't be. |
141 | - cur.execute(""" |
142 | - SELECT COUNT(*) |
143 | + implements(ITunableLoop) |
144 | + |
145 | + def __init__(self, con): |
146 | + self.con = con # Database connection to use |
147 | + self.total_deleted = 0 # Running total |
148 | + self.index = 1 |
149 | + |
150 | + log.info("Deleting unreferenced LibraryFileAliases") |
151 | + |
152 | + cur = con.cursor() |
153 | + |
154 | + # Start with a list of all unexpired and recently accessed |
155 | + # content - we don't remove them even if they are unlinked. We |
156 | + # currently don't remove stuff until it has been expired for |
157 | + # more than one week, but we will change this if disk space |
158 | + # becomes short and it actually will make a noticeable |
159 | + # difference. Note that ReferencedLibraryFileContent will |
160 | + # contain duplicates - duplicates are unusual so we are better |
161 | + # off filtering them once at the end rather than when we load |
162 | + # the data into the temporary file. |
163 | + cur.execute("DROP TABLE IF EXISTS ReferencedLibraryFileContent") |
164 | + cur.execute(""" |
165 | + SELECT LibraryFileAlias.content |
166 | + INTO TEMPORARY TABLE ReferencedLibraryFileContent |
167 | FROM LibraryFileAlias |
168 | - WHERE content in (%(in_content_ids)s) |
169 | - AND ( |
170 | - expires + '1 week'::interval |
171 | - > CURRENT_TIMESTAMP AT TIME ZONE 'UTC' |
172 | - OR last_accessed + '1 week'::interval |
173 | - > CURRENT_TIMESTAMP AT TIME ZONE 'UTC' |
174 | - ) |
175 | - """ % vars()) |
176 | - assert cur.fetchone()[0] == 0, "Logic error - sanity check failed" |
177 | + WHERE |
178 | + expires > |
179 | + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week' |
180 | + OR last_accessed > |
181 | + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week' |
182 | + OR date_created > |
183 | + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week' |
184 | + """) |
185 | + con.commit() |
186 | + |
187 | + # Determine what columns link to LibraryFileAlias |
188 | + # references = [(table, column), ...] |
189 | + references = [ |
190 | + tuple(ref[:2]) |
191 | + for ref in listReferences(cur, 'libraryfilealias', 'id') |
192 | + if ref[0] != 'libraryfiledownloadcount' |
193 | + ] |
194 | + assert len(references) > 10, ( |
195 | + 'Database introspection returned nonsense') |
196 | log.debug( |
197 | - "Deleting all LibraryFileAlias references to " |
198 | - "LibraryFileContents %s", in_content_ids |
199 | - ) |
200 | - cur.execute(""" |
201 | - DELETE FROM LibraryFileAlias WHERE content IN (%(in_content_ids)s) |
202 | - """ % vars()) |
203 | + "Found %d columns referencing LibraryFileAlias", len(references)) |
204 | + |
205 | + # Find all relevant LibraryFileAlias references and fill in |
206 | + # ReferencedLibraryFileContent |
207 | + for table, column in references: |
208 | + log.debug("Getting references from %s.%s." % (table, column)) |
209 | + cur.execute(""" |
210 | + INSERT INTO ReferencedLibraryFileContent |
211 | + SELECT LibraryFileAlias.content |
212 | + FROM LibraryFileAlias, %(table)s |
213 | + WHERE LibraryFileAlias.id = %(table)s.%(column)s |
214 | + """ % { |
215 | + 'table': quoteIdentifier(table), |
216 | + 'column': quoteIdentifier(column)}) |
217 | + con.commit() |
218 | + |
219 | + log.debug("Calculating expired unreferenced LibraryFileContent set.") |
220 | + cur.execute("DROP TABLE IF EXISTS UnreferencedLibraryFileContent") |
221 | + cur.execute(""" |
222 | + CREATE TEMPORARY TABLE UnreferencedLibraryFileContent ( |
223 | + id serial PRIMARY KEY, |
224 | + content integer UNIQUE) |
225 | + """) |
226 | + cur.execute(""" |
227 | + INSERT INTO UnreferencedLibraryFileContent (content) |
228 | + SELECT id AS content FROM LibraryFileContent |
229 | + EXCEPT |
230 | + SELECT content FROM ReferencedLibraryFileContent |
231 | + """) |
232 | + cur.execute("DROP TABLE ReferencedLibraryFileContent") |
233 | + cur.execute( |
234 | + "SELECT COALESCE(max(id),0) FROM UnreferencedLibraryFileContent") |
235 | + self.max_id = cur.fetchone()[0] |
236 | + log.debug("%d unferenced LibraryFileContent to remove." % self.max_id) |
237 | con.commit() |
238 | |
239 | - |
240 | -def delete_unreferenced_content(con): |
241 | + def isDone(self): |
242 | + if self.index > self.max_id: |
243 | + log.info( |
244 | + "Deleted %d LibraryFileAlias records." % self.total_deleted) |
245 | + return True |
246 | + else: |
247 | + return False |
248 | + |
249 | + def __call__(self, chunksize): |
250 | + chunksize = int(chunksize) |
251 | + cur = self.con.cursor() |
252 | + cur.execute(""" |
253 | + DELETE FROM LibraryFileAlias |
254 | + USING ( |
255 | + SELECT content FROM UnreferencedLibraryFileContent |
256 | + WHERE id BETWEEN %s AND %s |
257 | + ) AS UnreferencedLibraryFileContent |
258 | + WHERE LibraryFileAlias.content |
259 | + = UnreferencedLibraryFileContent.content |
260 | + """, (self.index, self.index + chunksize - 1)) |
261 | + deleted_rows = cur.rowcount |
262 | + self.total_deleted += deleted_rows |
263 | + log.debug("Deleted %d LibraryFileAlias records." % deleted_rows) |
264 | + self.con.commit() |
265 | + self.index += chunksize |
266 | + |
267 | + |
268 | +def delete_unreferenced_aliases(con): |
269 | + "Run the UnreferencedLibraryFileContentPruner." |
270 | + loop_tuner = DBLoopTuner( |
271 | + UnreferencedLibraryFileContentPruner(con), 5, log=log) |
272 | + loop_tuner.run() |
273 | + |
274 | + |
275 | +class UnreferencedContentPruner: |
276 | """Delete LibraryFileContent entries and their disk files that are |
277 | not referenced by any LibraryFileAlias entries. |
278 | |
279 | @@ -283,87 +320,159 @@ |
280 | LibraryFileAlias, so all entries in this state are garbage no matter |
281 | what their expires flag says. |
282 | """ |
283 | - cur = con.cursor() |
284 | - cur.execute(""" |
285 | - SELECT LibraryFileContent.id |
286 | - FROM LibraryFileContent |
287 | - LEFT OUTER JOIN LibraryFileAlias |
288 | - ON LibraryFileContent.id = LibraryFileAlias.content |
289 | - WHERE LibraryFileAlias.content IS NULL |
290 | + implements(ITunableLoop) |
291 | + |
292 | + def __init__(self, con): |
293 | + self.con = con |
294 | + self.index = 1 |
295 | + self.total_deleted = 0 |
296 | + cur = con.cursor() |
297 | + cur.execute("DROP TABLE IF EXISTS UnreferencedLibraryFileContent") |
298 | + cur.execute(""" |
299 | + CREATE TEMPORARY TABLE UnreferencedLibraryFileContent ( |
300 | + id serial PRIMARY KEY, |
301 | + content integer UNIQUE) |
302 | + """) |
303 | + cur.execute(""" |
304 | + INSERT INTO UnreferencedLibraryFileContent (content) |
305 | + SELECT DISTINCT LibraryFileContent.id |
306 | + FROM LibraryFileContent |
307 | + LEFT OUTER JOIN LibraryFileAlias |
308 | + ON LibraryFileContent.id = LibraryFileAlias.content |
309 | + WHERE LibraryFileAlias.content IS NULL |
310 | """) |
311 | - garbage_ids = [row[0] for row in cur.fetchall()] |
312 | - |
313 | - for i in range(0, len(garbage_ids), BATCH_SIZE): |
314 | - in_garbage_ids = ','.join( |
315 | - (str(garbage_id) for garbage_id in garbage_ids[i:i+BATCH_SIZE]) |
316 | - ) |
317 | - |
318 | - # Delete old LibraryFileContent entries. Note that this will fail |
319 | - # if we screwed up and still have LibraryFileAlias entries referencing |
320 | - # it. |
321 | - log.debug("Deleting LibraryFileContents %s", in_garbage_ids) |
322 | - cur.execute(""" |
323 | - DELETE FROM LibraryFileContent WHERE id in (%s) |
324 | - """ % in_garbage_ids) |
325 | - |
326 | - for garbage_id in garbage_ids[i:i+BATCH_SIZE]: |
327 | + cur.execute(""" |
328 | + SELECT COALESCE(max(id), 0) FROM UnreferencedLibraryFileContent |
329 | + """) |
330 | + self.max_id = cur.fetchone()[0] |
331 | + log.debug("%d unreferenced LibraryFileContent rows to remove.") |
332 | + |
333 | + def isDone(self): |
334 | + if self.index > self.max_id: |
335 | + log.info("Deleted %d unreferenced files." % self.total_deleted) |
336 | + return True |
337 | + else: |
338 | + return False |
339 | + |
340 | + def __call__(self, chunksize): |
341 | + chunksize = int(chunksize) |
342 | + |
343 | + cur = self.con.cursor() |
344 | + |
345 | + # Delete unreferenced LibraryFileContent entries. |
346 | + cur.execute(""" |
347 | + DELETE FROM LibraryFileContent |
348 | + USING ( |
349 | + SELECT content FROM UnreferencedLibraryFileContent |
350 | + WHERE id BETWEEN %s AND %s) AS UnreferencedLibraryFileContent |
351 | + WHERE |
352 | + LibraryFileContent.id = UnreferencedLibraryFileContent.content |
353 | + """, (self.index, self.index + chunksize - 1)) |
354 | + rows_deleted = cur.rowcount |
355 | + self.total_deleted += rows_deleted |
356 | + self.con.commit() |
357 | + |
358 | + # Remove files from disk. We do this outside the transaction, |
359 | + # as the garbage collector happily deals with files that exist |
360 | + # on disk but not in the DB. |
361 | + cur.execute(""" |
362 | + SELECT content FROM UnreferencedLibraryFileContent |
363 | + WHERE id BETWEEN %s AND %s |
364 | + """, (self.index, self.index + chunksize - 1)) |
365 | + for content_id in (row[0] for row in cur.fetchall()): |
366 | # Remove the file from disk, if it hasn't already been |
367 | - path = get_file_path(garbage_id) |
368 | - if os.path.exists(path): |
369 | - log.debug("Deleting %s", path) |
370 | + path = get_file_path(content_id) |
371 | + try: |
372 | os.unlink(path) |
373 | - else: |
374 | - log.info("%s already deleted", path) |
375 | - |
376 | - # And commit the database changes. It may be possible for this to |
377 | - # fail in rare cases, leaving a record in the DB with no corresponding |
378 | - # file on disk, but that is OK as it will all be tidied up next run, |
379 | - # and the file is unreachable anyway so nothing will attempt to |
380 | - # access it between now and the next garbage collection run. |
381 | - con.commit() |
382 | - |
383 | - |
384 | -def flag_expired_files(connection): |
385 | + log.debug("Deleted %s", path) |
386 | + except OSError, e: |
387 | + if e.errno != errno.ENOENT: |
388 | + raise |
389 | + if config.librarian_server.upstream_host is None: |
390 | + # It is normal to have files in the database that |
391 | + # are not on disk if the Librarian has an upstream |
392 | + # Librarian, such as on staging. Don't annoy the |
393 | + # operator with noise in this case. |
394 | + log.info("%s already deleted", path) |
395 | + |
396 | + self.index += chunksize |
397 | + |
398 | + |
399 | +def delete_unreferenced_content(con): |
400 | + """Invoke UnreferencedContentPruner.""" |
401 | + loop_tuner = DBLoopTuner(UnreferencedContentPruner(con), 5, log=log) |
402 | + loop_tuner.run() |
403 | + |
404 | + |
405 | +class FlagExpiredFiles: |
406 | """Flag files past their expiry date as 'deleted' in the database. |
407 | |
408 | Actual removal from disk is not performed here - that is deferred to |
409 | delete_unwanted_files(). |
410 | """ |
411 | - cur = connection.cursor() |
412 | - |
413 | - # First get the list of all LibraryFileContent. |
414 | - cur.execute("SELECT id FROM LibraryFileContent WHERE deleted IS FALSE") |
415 | - all_ids = set(row[0] for row in cur.fetchall()) |
416 | - |
417 | - # Now the list of unexpired content. May contain some ids not in the |
418 | - # all_ids set if uploads are currently in progress. |
419 | - cur.execute(""" |
420 | - SELECT DISTINCT content |
421 | - FROM LibraryFileAlias |
422 | - WHERE expires IS NULL |
423 | - OR expires >= CURRENT_TIMESTAMP AT TIME ZONE 'UTC' |
424 | - """) |
425 | - unexpired_ids = set(row[0] for row in cur.fetchall()) |
426 | - |
427 | - # Destroy our all_ids set to create the set of expired ids. |
428 | - # We do it this way, as we are dealing with large sets and need to |
429 | - # be careful of RAM usage on the production server. |
430 | - all_ids -= unexpired_ids |
431 | - expired_ids = all_ids |
432 | - del all_ids |
433 | - del unexpired_ids |
434 | - |
435 | - for commit_counter, content_id in enumerate(expired_ids): |
436 | - log.debug("%d is expired." % content_id) |
437 | - cur = connection.cursor() |
438 | + implements(ITunableLoop) |
439 | + |
440 | + def __init__(self, con): |
441 | + self.con = con |
442 | + self.index = 1 |
443 | + self.total_flagged = 0 |
444 | + cur = con.cursor() |
445 | + |
446 | + log.debug("Creating set of expired LibraryFileContent.") |
447 | + cur.execute("DROP TABLE IF EXISTS ExpiredLibraryFileContent") |
448 | + cur.execute(""" |
449 | + CREATE TEMPORARY TABLE ExpiredLibraryFileContent |
450 | + (id serial PRIMARY KEY, content integer UNIQUE) |
451 | + """) |
452 | + cur.execute(""" |
453 | + INSERT INTO ExpiredLibraryFileContent (content) |
454 | + SELECT id FROM LibraryFileContent WHERE deleted IS FALSE |
455 | + EXCEPT ALL |
456 | + SELECT DISTINCT content |
457 | + FROM LibraryFileAlias |
458 | + WHERE expires IS NULL |
459 | + OR expires >= CURRENT_TIMESTAMP AT TIME ZONE 'UTC' |
460 | + """) |
461 | + cur.execute( |
462 | + "SELECT COALESCE(max(id),0) FROM ExpiredLibraryFileContent") |
463 | + self.max_id = cur.fetchone()[0] |
464 | + log.debug( |
465 | + "%d expired LibraryFileContent to flag for removal." |
466 | + % self.max_id) |
467 | + |
468 | + def isDone(self): |
469 | + if self.index > self.max_id: |
470 | + log.info( |
471 | + "Flagged %d expired files for removal." |
472 | + % self.total_flagged) |
473 | + return True |
474 | + else: |
475 | + return False |
476 | + |
477 | + def __call__(self, chunksize): |
478 | + chunksize = int(chunksize) |
479 | + cur = self.con.cursor() |
480 | cur.execute(""" |
481 | UPDATE LibraryFileContent SET deleted=TRUE |
482 | - WHERE id = %d |
483 | - """ % content_id) |
484 | - if commit_counter % 100 == 0: |
485 | - connection.commit() |
486 | - connection.commit() |
487 | - log.info("Flagged %d expired files for removal." % len(expired_ids)) |
488 | + FROM ( |
489 | + SELECT content FROM ExpiredLibraryFileContent |
490 | + WHERE id BETWEEN %s AND %s |
491 | + ) AS ExpiredLibraryFileContent |
492 | + WHERE LibraryFileContent.id = ExpiredLibraryFileContent.content |
493 | + """, (self.index, self.index + chunksize - 1)) |
494 | + flagged_rows = cur.rowcount |
495 | + log.debug( |
496 | + "Flagged %d expired LibraryFileContent for removal." |
497 | + % flagged_rows) |
498 | + self.total_flagged += flagged_rows |
499 | + self.index += chunksize |
500 | + self.con.commit() |
501 | + |
502 | + |
503 | +def flag_expired_files(connection): |
504 | + """Invoke FlagExpiredFiles.""" |
505 | + loop_tuner = DBLoopTuner(FlagExpiredFiles(connection), 5, log=log) |
506 | + loop_tuner.run() |
507 | |
508 | |
509 | def delete_unwanted_files(con): |
510 | @@ -380,38 +489,72 @@ |
511 | cur.execute("SELECT max(id) from LibraryFileContent") |
512 | max_id = cur.fetchone()[0] |
513 | |
514 | - # Build a set containing all stored LibraryFileContent ids |
515 | - # that we want to keep. |
516 | + # Calculate all stored LibraryFileContent ids that we want to keep. |
517 | + # Results are ordered so we don't have to suck them all in at once. |
518 | cur.execute(""" |
519 | SELECT id FROM LibraryFileContent |
520 | WHERE deleted IS FALSE OR datecreated |
521 | > CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '1 day'::interval |
522 | + ORDER BY id |
523 | """) |
524 | - all_ids = set(row[0] for row in cur.fetchall()) |
525 | + |
526 | + def get_next_wanted_content_id(): |
527 | + result = cur.fetchone() |
528 | + if result is None: |
529 | + return None |
530 | + else: |
531 | + return result[0] |
532 | |
533 | count = 0 |
534 | + next_wanted_content_id = get_next_wanted_content_id() |
535 | + |
536 | for content_id in range(1, max_id+1): |
537 | - if content_id in all_ids: |
538 | - continue # Linked in the db - do nothing |
539 | + while (next_wanted_content_id is not None |
540 | + and content_id > next_wanted_content_id): |
541 | + next_wanted_content_id = get_next_wanted_content_id() |
542 | + |
543 | + file_wanted = ( |
544 | + next_wanted_content_id is not None |
545 | + and next_wanted_content_id == content_id) |
546 | + |
547 | path = get_file_path(content_id) |
548 | |
549 | - if not os.path.exists(path): |
550 | - continue # Exists neither on disk nor in the database - do nothing |
551 | - |
552 | - one_day = 24 * 60 * 60 |
553 | - if time() - os.path.getctime(path) < one_day: |
554 | - log.debug( |
555 | - "File %d not removed - created too recently" % content_id) |
556 | - continue # File created too recently - do nothing |
557 | - |
558 | - # File uploaded a while ago but not in the database - remove it |
559 | - log.debug("Deleting %s" % path) |
560 | - os.remove(path) |
561 | - count += 1 |
562 | + if file_wanted: |
563 | + if (config.librarian_server.upstream_host is None |
564 | + and not os.path.exists(path)): |
565 | + # It is normal to have files in the database that are |
566 | + # not on disk if the Librarian has an upstream |
567 | + # Librarian, such as on staging. Don't spam in this |
568 | + # case. |
569 | + log.error( |
570 | + "LibraryFileContent %d exists in the db but not at %s" |
571 | + % (content_id, path)) |
572 | + |
573 | + else: |
574 | + try: |
575 | + one_day = 24 * 60 * 60 |
576 | + if time() - os.path.getctime(path) < one_day: |
577 | + log.debug( |
578 | + "File %d not removed - created too recently" |
579 | + % content_id) |
580 | + else: |
581 | + # File uploaded a while ago but no longer wanted. |
582 | + os.unlink(path) |
583 | + log.debug("Deleted %s" % path) |
584 | + count += 1 |
585 | + except OSError, e: |
586 | + if e.errno != errno.ENOENT: |
587 | + raise |
588 | + if config.librarian_server.upstream_host is None: |
589 | + # It is normal to have files in the database that |
590 | + # are not on disk if the Librarian has an upstream |
591 | + # Librarian, such as on staging. Don't annoy the |
592 | + # operator with noise in this case. |
593 | + log.info("%s already deleted", path) |
594 | |
595 | log.info( |
596 | - "Removed %d from disk that where no longer referenced in the db" |
597 | - % count |
598 | +        "Deleted %d files from disk that were no longer referenced "
599 | + "in the db" % count |
600 | ) |
601 | |
602 |
Address Bug #290837.
The Librarian garbage collector currently drives the system into swap because it is doing its set logic in RAM. We should stop doing this.