Merge lp:~stub/launchpad/librarian-gc into lp:launchpad

Proposed by Stuart Bishop
Status: Merged
Approved by: Aaron Bentley
Approved revision: no longer in the source branch.
Merged at revision: not available
Proposed branch: lp:~stub/launchpad/librarian-gc
Merge into: lp:launchpad
Diff against target: 600 lines
To merge this branch: bzr merge lp:~stub/launchpad/librarian-gc
Reviewer Review Type Date Requested Status
Aaron Bentley (community) Approve
Canonical Launchpad Engineering Pending
Review via email: mp+8451@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote :

Address Bug #290837.

The Librarian garbage collector currently drives the system into swap because it is doing its set logic in RAM. We should stop doing this.

Revision history for this message
Aaron Bentley (abentley) wrote :
Download full text (27.3 KiB)

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

 review needs-fixing

Stuart Bishop wrote:
> Stuart Bishop has proposed merging lp:~stub/launchpad/librarian-gc into lp:~launchpad-pqm/launchpad/devel.
>
> Requested reviews:
> Canonical Launchpad Engineering (launchpad)
>
> Address Bug #290837.
>
> The Librarian garbage collector currently drives the system into swap because it is doing its set logic in RAM. We should stop doing this.
>
> === modified file 'lib/canonical/librarian/ftests/test_gc.py'
> --- lib/canonical/librarian/ftests/test_gc.py 2009-01-06 03:40:57 +0000
> +++ lib/canonical/librarian/ftests/test_gc.py 2009-07-09 12:41:44 +0000
> @@ -1,4 +1,4 @@
> -# Copyright 2004-2007 Canonical Ltd. All rights reserved.
> +# Copyright 2004-2009 Canonical Ltd. All rights reserved.
> """Librarian garbage collection tests"""
>
> __metaclass__ = type
> @@ -119,6 +119,8 @@
> past = past.replace(tzinfo=utc)
> f1.last_accessed = past
> f2.last_accessed = past
> + f1.date_created = past
> + f2.date_created = past
>
> del f1, f2
>
> @@ -159,7 +161,7 @@
> # This should have committed
> self.ztm.begin()
>
> - # Confirm that the LibaryFileContents are still there
> + # Confirm that the LibaryFileContents are still there.
> c1 = LibraryFileContent.get(c1_id)
> c2 = LibraryFileContent.get(c2_id)
>
>
> === modified file 'lib/canonical/librarian/librariangc.py'
> --- lib/canonical/librarian/librariangc.py 2009-01-20 08:15:41 +0000
> +++ lib/canonical/librarian/librariangc.py 2009-07-09 12:40:21 +0000
> @@ -1,4 +1,4 @@
> -# Copyright 2004-2008 Canonical Ltd. All rights reserved.
> +# Copyright 2004-2009 Canonical Ltd. All rights reserved.
> """Librarian garbage collection routines"""
>
> __metaclass__ = type
> @@ -8,13 +8,15 @@
> from time import time
> import os
>
> +from zope.interface import implements
> +
> from canonical.config import config
> +from canonical.launchpad.interfaces.looptuner import ITunableLoop
> +from canonical.launchpad.utilities.looptuner import DBLoopTuner
> from canonical.librarian.storage import _relFileLocation as relative_file_path
> from canonical.librarian.storage import _sameFile
> from canonical.database.postgresql import listReferences
>
> -BATCH_SIZE = 1
> -
> log = None
> debug = False
>
> @@ -176,8 +178,11 @@
> con.commit()
>
>
> -def delete_unreferenced_aliases(con):
> - """Delete unreferenced LibraryFileAliases and their LibraryFileContent
> +class UnreferencedLibraryFileContentPruner:
> + """Delete unreferenced LibraryFileAliases.
> +
> + The LibraryFileContent records are left untouched for the code that
> + knows how to delete them and the corresponding files on disk.
>
> This is the second step in a full garbage collection sweep. We determine
> which LibraryFileContent entries are not being referenced by other objects
> @@ -190,92 +195,124 @@
> must be unreferenced for them to be deleted - a single reference will keep
> the whole set alive.
> """
> - log.info("Deleting unreferenced LibraryFileAliases")
> -
> - # Generate ...

review: Needs Fixing
Revision history for this message
Stuart Bishop (stub) wrote :
Download full text (15.6 KiB)

On Mon, Jul 13, 2009 at 10:04 PM, Aaron Bentley<email address hidden> wrote:

>  review needs-fixing
>
> Stuart Bishop wrote:
>> Stuart Bishop has proposed merging lp:~stub/launchpad/librarian-gc into lp:~launchpad-pqm/launchpad/devel.
>>
>> Requested reviews:
>>     Canonical Launchpad Engineering (launchpad)
>>
>> Address Bug #290837.
>>
>> The Librarian garbage collector currently drives the system into swap because it is doing its set logic in RAM. We should stop doing this.

>> === modified file 'lib/canonical/librarian/librariangc.py'
>> +    implements(ITunableLoop)
>> +
>> +    index = 1 # Index of current batch of rows to remove.
>> +
>> +    total_deleted = 0 # Running total
>> +
>> +    con = None # Database connection to use
>
> "index", "total_deleted" and "con" are instance variables.  It would not
> make sense to check the value of
> UnreferencedLibraryFileContentPruner.total_deleted, ever.  You would
> only check it on an instance of UnreferencedLibraryFileContentPruner.
> Therefore, they should be initialized in
> UnreferencedLibraryFileContentPruner.__init__, instead.

Ok. These are all being initialized in __init__ and the class attributes removed here and in the other two tunable loops.

>> +        # Start with a list of all unexpired and recently accessed content
>> +        # - we don't remove them even if they are unlinked.
>> +        # We currently don't remove stuff until it has been expired for
>> +        # more than one week, but we will change this if disk space becomes
>> +        # short and it actually will make a noticeable difference.
>> +        # Note that ReferencedLibraryFileContent will contain duplicates -
>> +        # duplicates are the exception so we are better off filtering them
>
> Perhaps "duplicates are unusual", "duplicates are exceptional", or else
> "duplicates are the exception, not the rule".

Fixed. "duplicates are unusual".

>> +        # once at the end rather than when we load the data into the temporary
>> +        # file.
>> +        cur.execute("DROP TABLE IF EXISTS ReferencedLibraryFileContent")
>> +        cur.execute("""
>> +            SELECT LibraryFileAlias.content
>> +            INTO TEMPORARY TABLE ReferencedLibraryFileContent
>>              FROM LibraryFileAlias
>> -            WHERE content in (%(in_content_ids)s)
>> -                AND (
>> -                    expires + '1 week'::interval
>> -                        > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
>> -                    OR last_accessed + '1 week'::interval
>> -                        > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
>> -                    )
>> -            """ % vars())
>> -        assert cur.fetchone()[0] == 0, "Logic error - sanity check failed"
>> +            WHERE
>> +                expires >
>> +                    CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
>> +                OR last_accessed >
>> +                    CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
>> +                OR date_created >
>> +                    CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
>> +            """...

1=== modified file 'lib/canonical/librarian/librariangc.py'
2--- lib/canonical/librarian/librariangc.py 2009-07-09 12:40:21 +0000
3+++ lib/canonical/librarian/librariangc.py 2009-07-14 08:09:48 +0000
4@@ -4,6 +4,7 @@
5 __metaclass__ = type
6
7 from datetime import datetime, timedelta
8+import errno
9 import sys
10 from time import time
11 import os
12@@ -11,6 +12,7 @@
13 from zope.interface import implements
14
15 from canonical.config import config
16+from canonical.database.postgresql import quoteIdentifier
17 from canonical.launchpad.interfaces.looptuner import ITunableLoop
18 from canonical.launchpad.utilities.looptuner import DBLoopTuner
19 from canonical.librarian.storage import _relFileLocation as relative_file_path
20@@ -197,28 +199,24 @@
21 """
22 implements(ITunableLoop)
23
24- index = 1 # Index of current batch of rows to remove.
25-
26- total_deleted = 0 # Running total
27-
28- con = None # Database connection to use
29-
30 def __init__(self, con):
31- self.con = con
32+ self.con = con # Database connection to use
33+ self.total_deleted = 0 # Running total
34+ self.index = 1
35
36 log.info("Deleting unreferenced LibraryFileAliases")
37
38 cur = con.cursor()
39
40- # Start with a list of all unexpired and recently accessed content
41- # - we don't remove them even if they are unlinked.
42- # We currently don't remove stuff until it has been expired for
43- # more than one week, but we will change this if disk space becomes
44- # short and it actually will make a noticeable difference.
45- # Note that ReferencedLibraryFileContent will contain duplicates -
46- # duplicates are the exception so we are better off filtering them
47- # once at the end rather than when we load the data into the temporary
48- # file.
49+ # Start with a list of all unexpired and recently accessed
50+ # content - we don't remove them even if they are unlinked. We
51+ # currently don't remove stuff until it has been expired for
52+ # more than one week, but we will change this if disk space
53+ # becomes short and it actually will make a noticeable
54+ # difference. Note that ReferencedLibraryFileContent will
55+ # contain duplicates - duplicates are unusual so we are better
56+ # off filtering them once at the end rather than when we load
57+ # the data into the temporary file.
58 cur.execute("DROP TABLE IF EXISTS ReferencedLibraryFileContent")
59 cur.execute("""
60 SELECT LibraryFileAlias.content
61@@ -255,7 +253,9 @@
62 SELECT LibraryFileAlias.content
63 FROM LibraryFileAlias, %(table)s
64 WHERE LibraryFileAlias.id = %(table)s.%(column)s
65- """ % {'table': table, 'column': column})
66+ """ % {
67+ 'table': quoteIdentifier(table),
68+ 'column': quoteIdentifier(column)})
69 con.commit()
70
71 log.debug("Calculating expired unreferenced LibraryFileContent set.")
72@@ -321,12 +321,11 @@
73 what their expires flag says.
74 """
75 implements(ITunableLoop)
76- index = 0
77- con = None
78- total_deleted = 0
79
80 def __init__(self, con):
81 self.con = con
82+ self.index = 1
83+ self.total_deleted = 0
84 cur = con.cursor()
85 cur.execute("DROP TABLE IF EXISTS UnreferencedLibraryFileContent")
86 cur.execute("""
87@@ -365,10 +364,10 @@
88 DELETE FROM LibraryFileContent
89 USING (
90 SELECT content FROM UnreferencedLibraryFileContent
91- WHERE id BETWEEN %d AND %d) AS UnreferencedLibraryFileContent
92+ WHERE id BETWEEN %s AND %s) AS UnreferencedLibraryFileContent
93 WHERE
94 LibraryFileContent.id = UnreferencedLibraryFileContent.content
95- """ % (self.index, self.index + chunksize - 1))
96+ """, (self.index, self.index + chunksize - 1))
97 rows_deleted = cur.rowcount
98 self.total_deleted += rows_deleted
99 self.con.commit()
100@@ -378,19 +377,23 @@
101 # on disk but not in the DB.
102 cur.execute("""
103 SELECT content FROM UnreferencedLibraryFileContent
104- WHERE id BETWEEN %d AND %d
105- """ % (self.index, self.index + chunksize - 1))
106+ WHERE id BETWEEN %s AND %s
107+ """, (self.index, self.index + chunksize - 1))
108 for content_id in (row[0] for row in cur.fetchall()):
109 # Remove the file from disk, if it hasn't already been
110 path = get_file_path(content_id)
111- if os.path.exists(path):
112- log.debug("Deleting %s", path)
113+ try:
114 os.unlink(path)
115- elif config.librarian_server.upstream_host is None:
116- # It is normal to have files in the database that are
117- # not on disk if the Librarian has an upstream
118- # Librarian, such as on staging.
119- log.info("%s already deleted", path)
120+ log.debug("Deleted %s", path)
121+ except OSError, e:
122+ if e.errno != errno.ENOENT:
123+ raise
124+ if config.librarian_server.upstream_host is None:
125+ # It is normal to have files in the database that
126+ # are not on disk if the Librarian has an upstream
127+ # Librarian, such as on staging. Don't annoy the
128+ # operator with noise in this case.
129+ log.info("%s already deleted", path)
130
131 self.index += chunksize
132
133@@ -409,14 +412,10 @@
134 """
135 implements(ITunableLoop)
136
137- index = 1
138-
139- con = None # Database connection to use
140-
141- total_flagged = 0
142-
143 def __init__(self, con):
144 self.con = con
145+ self.index = 1
146+ self.total_flagged = 0
147 cur = con.cursor()
148
149 log.debug("Creating set of expired LibraryFileContent.")
150@@ -514,33 +513,44 @@
151 and content_id > next_wanted_content_id):
152 next_wanted_content_id = get_next_wanted_content_id()
153
154- if (next_wanted_content_id is not None
155- and next_wanted_content_id == content_id):
156- file_wanted = True
157- else:
158- file_wanted = False
159+ file_wanted = (
160+ next_wanted_content_id is not None
161+ and next_wanted_content_id == content_id)
162
163 path = get_file_path(content_id)
164- path_exists = os.path.exists(path)
165
166- if file_wanted and not path_exists:
167- if config.librarian_server.upstream_host is None:
168+ if file_wanted:
169+ if (config.librarian_server.upstream_host is None
170+ and not os.path.exists(path)):
171 # It is normal to have files in the database that are
172 # not on disk if the Librarian has an upstream
173- # Librarian, such as on staging.
174+ # Librarian, such as on staging. Don't spam in this
175+ # case.
176 log.error(
177 "LibraryFileContent %d exists in the db but not at %s"
178 % (content_id, path))
179- elif path_exists and not file_wanted:
180- one_day = 24 * 60 * 60
181- if time() - os.path.getctime(path) < one_day:
182- log.debug(
183- "File %d not removed - created too recently" % content_id)
184- else:
185- # File uploaded a while ago but no longer wanted - remove it
186- log.debug("Deleting %s" % path)
187- os.unlink(path)
188- count += 1
189+
190+ else:
191+ try:
192+ one_day = 24 * 60 * 60
193+ if time() - os.path.getctime(path) < one_day:
194+ log.debug(
195+ "File %d not removed - created too recently"
196+ % content_id)
197+ else:
198+ # File uploaded a while ago but no longer wanted.
199+ os.unlink(path)
200+ log.debug("Deleted %s" % path)
201+ count += 1
202+ except OSError, e:
203+ if e.errno != errno.ENOENT:
204+ raise
205+ if config.librarian_server.upstream_host is None:
206+ # It is normal to have files in the database that
207+ # are not on disk if the Librarian has an upstream
208+ # Librarian, such as on staging. Don't annoy the
209+ # operator with noise in this case.
210+ log.info("%s already deleted", path)
211
212 log.info(
213 "Deleted %d files from disk that where no longer referenced "
Revision history for this message
Aaron Bentley (abentley) wrote :
Download full text (8.5 KiB)

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

 status approved

Please add a comment to the global log variable, as noted below. The
rest is chatter.

Stuart Bishop wrote:
>>> + log.debug("Getting references from %s.%s." % (table, column))
>>> + cur.execute("""
>>> + INSERT INTO ReferencedLibraryFileContent
>>> + SELECT LibraryFileAlias.content
>>> + FROM LibraryFileAlias, %(table)s
>>> + WHERE LibraryFileAlias.id = %(table)s.%(column)s
>>> + """ % {'table': table, 'column': column})
>> Wouldn't it be better practise to use SQL substitution here, like
>> ...WHERE LibraryFileAlias.id = ?.?", table, column)
>
> We can't use bound variables here because SQL identifiers use different quoting rules to SQL strings (" vs ').
>
>> I doubt any of our table names are SQL keywords, but some of our columns
>> might be named "from", for example.
>
> Yes - it is fragile. I've fixed this by doing the quoting myself using canonical.database.postgresql.quoteIdentifier()

Cool.

>> Is it actually performant to generate a table of *referenced*
>> LibraryFileContent? I would have thought you'd want to generate a table
>> of unreferenced content directly.
>
> I don't really know which approach would perform better. I don't think it is worth benchmarking unless performance turns out to be an issue. I suspect this approach is faster, as if we started with a table containing all LFC and deleted rows when we find references, we end up working with a table that is mostly holes.

Oh, I meant that it seemed like putting it all into one query that only
returned the rows we wanted to play with would allow the DB engine to do
less work, ideally scaling with the number of rows returned instead of
the total number of rows. As you say, if it's not a performance issue,
let's not worry about it.

>>> + cur.execute("DROP TABLE ReferencedLibraryFileContent")
>>> + cur.execute(
>>> + "SELECT COALESCE(max(id),0) FROM UnreferencedLibraryFileContent")
>>> + self.max_id = cur.fetchone()[0]
>>> + log.debug("%d unferenced LibraryFileContent to remove." % self.max_id)
>>> con.commit()
>> Is there any risk that some LibraryFileContent might become referenced
>> after you commit?
>
> There is a risk. I mitigate it by not deleting anything younger than a day - if it hasn't been linked by then it probably isn't going to be linked. I could try catching the exception, aborting the database transaction and possibly retry the batch without the failed entry but I doubt it will ever actually happen and I'd rather not write the tests for this extra complexity :) At the moment, if this condition is actually triggered the foreign key constraints will cause an exception to be raised. I vote we worry about that if it happens more than once.

I guess I'm really wondering why not do it all in one transaction to
avoid that race?

>>> + def __call__(self, chunksize):
>>> + chunksize = int(chunksize)
>> It seems odd that this might not be an int.
>
> The loop tuner is probably too generic. You actually get a float in case your chunksize represe...

Read more...

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/canonical/librarian/ftests/test_gc.py'
2--- lib/canonical/librarian/ftests/test_gc.py 2009-01-06 03:40:57 +0000
3+++ lib/canonical/librarian/ftests/test_gc.py 2009-07-16 02:56:16 +0000
4@@ -1,4 +1,4 @@
5-# Copyright 2004-2007 Canonical Ltd. All rights reserved.
6+# Copyright 2004-2009 Canonical Ltd. All rights reserved.
7 """Librarian garbage collection tests"""
8
9 __metaclass__ = type
10@@ -119,6 +119,8 @@
11 past = past.replace(tzinfo=utc)
12 f1.last_accessed = past
13 f2.last_accessed = past
14+ f1.date_created = past
15+ f2.date_created = past
16
17 del f1, f2
18
19@@ -159,7 +161,7 @@
20 # This should have committed
21 self.ztm.begin()
22
23- # Confirm that the LibaryFileContents are still there
24+ # Confirm that the LibaryFileContents are still there.
25 c1 = LibraryFileContent.get(c1_id)
26 c2 = LibraryFileContent.get(c2_id)
27
28
29=== modified file 'lib/canonical/librarian/librariangc.py'
30--- lib/canonical/librarian/librariangc.py 2009-01-20 08:15:41 +0000
31+++ lib/canonical/librarian/librariangc.py 2009-07-16 02:56:16 +0000
32@@ -1,20 +1,24 @@
33-# Copyright 2004-2008 Canonical Ltd. All rights reserved.
34+# Copyright 2004-2009 Canonical Ltd. All rights reserved.
35 """Librarian garbage collection routines"""
36
37 __metaclass__ = type
38
39 from datetime import datetime, timedelta
40+import errno
41 import sys
42 from time import time
43 import os
44
45+from zope.interface import implements
46+
47 from canonical.config import config
48+from canonical.database.postgresql import quoteIdentifier
49+from canonical.launchpad.interfaces.looptuner import ITunableLoop
50+from canonical.launchpad.utilities.looptuner import DBLoopTuner
51 from canonical.librarian.storage import _relFileLocation as relative_file_path
52 from canonical.librarian.storage import _sameFile
53 from canonical.database.postgresql import listReferences
54
55-BATCH_SIZE = 1
56-
57 log = None
58 debug = False
59
60@@ -176,8 +180,11 @@
61 con.commit()
62
63
64-def delete_unreferenced_aliases(con):
65- """Delete unreferenced LibraryFileAliases and their LibraryFileContent
66+class UnreferencedLibraryFileContentPruner:
67+ """Delete unreferenced LibraryFileAliases.
68+
69+ The LibraryFileContent records are left untouched for the code that
70+ knows how to delete them and the corresponding files on disk.
71
72 This is the second step in a full garbage collection sweep. We determine
73 which LibraryFileContent entries are not being referenced by other objects
74@@ -190,92 +197,122 @@
75 must be unreferenced for them to be deleted - a single reference will keep
76 the whole set alive.
77 """
78- log.info("Deleting unreferenced LibraryFileAliases")
79-
80- # Generate a set of all our LibraryFileContent ids, except for ones
81- # with expiry dates not yet reached (these lurk in the database until
82- # expired) and those that have been accessed in the last week (currently
83- # in use, so leave them lurking a while longer)
84- cur = con.cursor()
85- cur.execute("""
86- SELECT c.id
87- FROM LibraryFileContent AS c, LibraryFileAlias AS a
88- WHERE c.id = a.content
89- GROUP BY c.id
90- HAVING (max(expires) IS NULL OR max(expires)
91- < CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '1 week'::interval
92- )
93- AND (max(last_accessed) IS NULL OR max(last_accessed)
94- < CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '1 week'::interval
95- )
96- """)
97- content_ids = set(row[0] for row in cur.fetchall())
98- log.info(
99- "Found %d LibraryFileContent entries possibly unreferenced",
100- len(content_ids)
101- )
102-
103- # Determine what columns link to LibraryFileAlias
104- # references = [(table, column), ...]
105- references = [
106- tuple(ref[:2])
107- for ref in listReferences(cur, 'libraryfilealias', 'id')
108- if ref[0] != 'libraryfiledownloadcount'
109- ]
110- assert len(references) > 10, 'Database introspection returned nonsense'
111- log.info("Found %d columns referencing LibraryFileAlias", len(references))
112-
113- # Remove all referenced LibraryFileContent ids from content_ids
114- for table, column in references:
115- cur.execute("""
116- SELECT DISTINCT LibraryFileContent.id
117- FROM LibraryFileContent, LibraryFileAlias, %(table)s
118- WHERE LibraryFileContent.id = LibraryFileAlias.content
119- AND LibraryFileAlias.id = %(table)s.%(column)s
120- """ % vars())
121- referenced_ids = set(row[0] for row in cur.fetchall())
122- log.info(
123- "Found %d distinct LibraryFileAlias references in %s(%s)",
124- len(referenced_ids), table, column
125- )
126- content_ids.difference_update(referenced_ids)
127- log.debug(
128- "Now only %d LibraryFileContents possibly unreferenced",
129- len(content_ids)
130- )
131-
132- # Delete unreferenced LibraryFileAliases. Note that this will raise a
133- # database exception if we screwed up and attempt to delete an alias that
134- # is still referenced.
135- content_ids = list(content_ids)
136- for i in range(0, len(content_ids), BATCH_SIZE):
137- in_content_ids = ','.join(
138- (str(content_id) for content_id in content_ids[i:i+BATCH_SIZE]))
139- # First a sanity check to ensure we aren't removing anything we
140- # shouldn't be.
141- cur.execute("""
142- SELECT COUNT(*)
143+ implements(ITunableLoop)
144+
145+ def __init__(self, con):
146+ self.con = con # Database connection to use
147+ self.total_deleted = 0 # Running total
148+ self.index = 1
149+
150+ log.info("Deleting unreferenced LibraryFileAliases")
151+
152+ cur = con.cursor()
153+
154+ # Start with a list of all unexpired and recently accessed
155+ # content - we don't remove them even if they are unlinked. We
156+ # currently don't remove stuff until it has been expired for
157+ # more than one week, but we will change this if disk space
158+ # becomes short and it actually will make a noticeable
159+ # difference. Note that ReferencedLibraryFileContent will
160+ # contain duplicates - duplicates are unusual so we are better
161+ # off filtering them once at the end rather than when we load
162+ # the data into the temporary file.
163+ cur.execute("DROP TABLE IF EXISTS ReferencedLibraryFileContent")
164+ cur.execute("""
165+ SELECT LibraryFileAlias.content
166+ INTO TEMPORARY TABLE ReferencedLibraryFileContent
167 FROM LibraryFileAlias
168- WHERE content in (%(in_content_ids)s)
169- AND (
170- expires + '1 week'::interval
171- > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
172- OR last_accessed + '1 week'::interval
173- > CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
174- )
175- """ % vars())
176- assert cur.fetchone()[0] == 0, "Logic error - sanity check failed"
177+ WHERE
178+ expires >
179+ CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
180+ OR last_accessed >
181+ CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
182+ OR date_created >
183+ CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - interval '1 week'
184+ """)
185+ con.commit()
186+
187+ # Determine what columns link to LibraryFileAlias
188+ # references = [(table, column), ...]
189+ references = [
190+ tuple(ref[:2])
191+ for ref in listReferences(cur, 'libraryfilealias', 'id')
192+ if ref[0] != 'libraryfiledownloadcount'
193+ ]
194+ assert len(references) > 10, (
195+ 'Database introspection returned nonsense')
196 log.debug(
197- "Deleting all LibraryFileAlias references to "
198- "LibraryFileContents %s", in_content_ids
199- )
200- cur.execute("""
201- DELETE FROM LibraryFileAlias WHERE content IN (%(in_content_ids)s)
202- """ % vars())
203+ "Found %d columns referencing LibraryFileAlias", len(references))
204+
205+ # Find all relevant LibraryFileAlias references and fill in
206+ # ReferencedLibraryFileContent
207+ for table, column in references:
208+ log.debug("Getting references from %s.%s." % (table, column))
209+ cur.execute("""
210+ INSERT INTO ReferencedLibraryFileContent
211+ SELECT LibraryFileAlias.content
212+ FROM LibraryFileAlias, %(table)s
213+ WHERE LibraryFileAlias.id = %(table)s.%(column)s
214+ """ % {
215+ 'table': quoteIdentifier(table),
216+ 'column': quoteIdentifier(column)})
217+ con.commit()
218+
219+ log.debug("Calculating expired unreferenced LibraryFileContent set.")
220+ cur.execute("DROP TABLE IF EXISTS UnreferencedLibraryFileContent")
221+ cur.execute("""
222+ CREATE TEMPORARY TABLE UnreferencedLibraryFileContent (
223+ id serial PRIMARY KEY,
224+ content integer UNIQUE)
225+ """)
226+ cur.execute("""
227+ INSERT INTO UnreferencedLibraryFileContent (content)
228+ SELECT id AS content FROM LibraryFileContent
229+ EXCEPT
230+ SELECT content FROM ReferencedLibraryFileContent
231+ """)
232+ cur.execute("DROP TABLE ReferencedLibraryFileContent")
233+ cur.execute(
234+ "SELECT COALESCE(max(id),0) FROM UnreferencedLibraryFileContent")
235+ self.max_id = cur.fetchone()[0]
236+ log.debug("%d unferenced LibraryFileContent to remove." % self.max_id)
237 con.commit()
238
239-
240-def delete_unreferenced_content(con):
241+ def isDone(self):
242+ if self.index > self.max_id:
243+ log.info(
244+ "Deleted %d LibraryFileAlias records." % self.total_deleted)
245+ return True
246+ else:
247+ return False
248+
249+ def __call__(self, chunksize):
250+ chunksize = int(chunksize)
251+ cur = self.con.cursor()
252+ cur.execute("""
253+ DELETE FROM LibraryFileAlias
254+ USING (
255+ SELECT content FROM UnreferencedLibraryFileContent
256+ WHERE id BETWEEN %s AND %s
257+ ) AS UnreferencedLibraryFileContent
258+ WHERE LibraryFileAlias.content
259+ = UnreferencedLibraryFileContent.content
260+ """, (self.index, self.index + chunksize - 1))
261+ deleted_rows = cur.rowcount
262+ self.total_deleted += deleted_rows
263+ log.debug("Deleted %d LibraryFileAlias records." % deleted_rows)
264+ self.con.commit()
265+ self.index += chunksize
266+
267+
268+def delete_unreferenced_aliases(con):
269+ "Run the UnreferencedLibraryFileContentPruner."
270+ loop_tuner = DBLoopTuner(
271+ UnreferencedLibraryFileContentPruner(con), 5, log=log)
272+ loop_tuner.run()
273+
274+
275+class UnreferencedContentPruner:
276 """Delete LibraryFileContent entries and their disk files that are
277 not referenced by any LibraryFileAlias entries.
278
279@@ -283,87 +320,159 @@
280 LibraryFileAlias, so all entries in this state are garbage no matter
281 what their expires flag says.
282 """
283- cur = con.cursor()
284- cur.execute("""
285- SELECT LibraryFileContent.id
286- FROM LibraryFileContent
287- LEFT OUTER JOIN LibraryFileAlias
288- ON LibraryFileContent.id = LibraryFileAlias.content
289- WHERE LibraryFileAlias.content IS NULL
290+ implements(ITunableLoop)
291+
292+ def __init__(self, con):
293+ self.con = con
294+ self.index = 1
295+ self.total_deleted = 0
296+ cur = con.cursor()
297+ cur.execute("DROP TABLE IF EXISTS UnreferencedLibraryFileContent")
298+ cur.execute("""
299+ CREATE TEMPORARY TABLE UnreferencedLibraryFileContent (
300+ id serial PRIMARY KEY,
301+ content integer UNIQUE)
302+ """)
303+ cur.execute("""
304+ INSERT INTO UnreferencedLibraryFileContent (content)
305+ SELECT DISTINCT LibraryFileContent.id
306+ FROM LibraryFileContent
307+ LEFT OUTER JOIN LibraryFileAlias
308+ ON LibraryFileContent.id = LibraryFileAlias.content
309+ WHERE LibraryFileAlias.content IS NULL
310 """)
311- garbage_ids = [row[0] for row in cur.fetchall()]
312-
313- for i in range(0, len(garbage_ids), BATCH_SIZE):
314- in_garbage_ids = ','.join(
315- (str(garbage_id) for garbage_id in garbage_ids[i:i+BATCH_SIZE])
316- )
317-
318- # Delete old LibraryFileContent entries. Note that this will fail
319- # if we screwed up and still have LibraryFileAlias entries referencing
320- # it.
321- log.debug("Deleting LibraryFileContents %s", in_garbage_ids)
322- cur.execute("""
323- DELETE FROM LibraryFileContent WHERE id in (%s)
324- """ % in_garbage_ids)
325-
326- for garbage_id in garbage_ids[i:i+BATCH_SIZE]:
327+ cur.execute("""
328+ SELECT COALESCE(max(id), 0) FROM UnreferencedLibraryFileContent
329+ """)
330+ self.max_id = cur.fetchone()[0]
331+ log.debug("%d unreferenced LibraryFileContent rows to remove.")
332+
333+ def isDone(self):
334+ if self.index > self.max_id:
335+ log.info("Deleted %d unreferenced files." % self.total_deleted)
336+ return True
337+ else:
338+ return False
339+
340+ def __call__(self, chunksize):
341+ chunksize = int(chunksize)
342+
343+ cur = self.con.cursor()
344+
345+ # Delete unreferenced LibraryFileContent entries.
346+ cur.execute("""
347+ DELETE FROM LibraryFileContent
348+ USING (
349+ SELECT content FROM UnreferencedLibraryFileContent
350+ WHERE id BETWEEN %s AND %s) AS UnreferencedLibraryFileContent
351+ WHERE
352+ LibraryFileContent.id = UnreferencedLibraryFileContent.content
353+ """, (self.index, self.index + chunksize - 1))
354+ rows_deleted = cur.rowcount
355+ self.total_deleted += rows_deleted
356+ self.con.commit()
357+
358+ # Remove files from disk. We do this outside the transaction,
359+ # as the garbage collector happily deals with files that exist
360+ # on disk but not in the DB.
361+ cur.execute("""
362+ SELECT content FROM UnreferencedLibraryFileContent
363+ WHERE id BETWEEN %s AND %s
364+ """, (self.index, self.index + chunksize - 1))
365+ for content_id in (row[0] for row in cur.fetchall()):
366 # Remove the file from disk, if it hasn't already been
367- path = get_file_path(garbage_id)
368- if os.path.exists(path):
369- log.debug("Deleting %s", path)
370+ path = get_file_path(content_id)
371+ try:
372 os.unlink(path)
373- else:
374- log.info("%s already deleted", path)
375-
376- # And commit the database changes. It may be possible for this to
377- # fail in rare cases, leaving a record in the DB with no corresponding
378- # file on disk, but that is OK as it will all be tidied up next run,
379- # and the file is unreachable anyway so nothing will attempt to
380- # access it between now and the next garbage collection run.
381- con.commit()
382-
383-
384-def flag_expired_files(connection):
385+ log.debug("Deleted %s", path)
386+ except OSError, e:
387+ if e.errno != errno.ENOENT:
388+ raise
389+ if config.librarian_server.upstream_host is None:
390+ # It is normal to have files in the database that
391+ # are not on disk if the Librarian has an upstream
392+ # Librarian, such as on staging. Don't annoy the
393+ # operator with noise in this case.
394+ log.info("%s already deleted", path)
395+
396+ self.index += chunksize
397+
398+
399+def delete_unreferenced_content(con):
400+ """Invoke UnreferencedContentPruner."""
401+ loop_tuner = DBLoopTuner(UnreferencedContentPruner(con), 5, log=log)
402+ loop_tuner.run()
403+
404+
405+class FlagExpiredFiles:
406 """Flag files past their expiry date as 'deleted' in the database.
407
408 Actual removal from disk is not performed here - that is deferred to
409 delete_unwanted_files().
410 """
411- cur = connection.cursor()
412-
413- # First get the list of all LibraryFileContent.
414- cur.execute("SELECT id FROM LibraryFileContent WHERE deleted IS FALSE")
415- all_ids = set(row[0] for row in cur.fetchall())
416-
417- # Now the list of unexpired content. May contain some ids not in the
418- # all_ids set if uploads are currently in progress.
419- cur.execute("""
420- SELECT DISTINCT content
421- FROM LibraryFileAlias
422- WHERE expires IS NULL
423- OR expires >= CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
424- """)
425- unexpired_ids = set(row[0] for row in cur.fetchall())
426-
427- # Destroy our all_ids set to create the set of expired ids.
428- # We do it this way, as we are dealing with large sets and need to
429- # be careful of RAM usage on the production server.
430- all_ids -= unexpired_ids
431- expired_ids = all_ids
432- del all_ids
433- del unexpired_ids
434-
435- for commit_counter, content_id in enumerate(expired_ids):
436- log.debug("%d is expired." % content_id)
437- cur = connection.cursor()
438+ implements(ITunableLoop)
439+
440+ def __init__(self, con):
441+ self.con = con
442+ self.index = 1
443+ self.total_flagged = 0
444+ cur = con.cursor()
445+
446+ log.debug("Creating set of expired LibraryFileContent.")
447+ cur.execute("DROP TABLE IF EXISTS ExpiredLibraryFileContent")
448+ cur.execute("""
449+ CREATE TEMPORARY TABLE ExpiredLibraryFileContent
450+ (id serial PRIMARY KEY, content integer UNIQUE)
451+ """)
452+ cur.execute("""
453+ INSERT INTO ExpiredLibraryFileContent (content)
454+ SELECT id FROM LibraryFileContent WHERE deleted IS FALSE
455+ EXCEPT ALL
456+ SELECT DISTINCT content
457+ FROM LibraryFileAlias
458+ WHERE expires IS NULL
459+ OR expires >= CURRENT_TIMESTAMP AT TIME ZONE 'UTC'
460+ """)
461+ cur.execute(
462+ "SELECT COALESCE(max(id),0) FROM ExpiredLibraryFileContent")
463+ self.max_id = cur.fetchone()[0]
464+ log.debug(
465+ "%d expired LibraryFileContent to flag for removal."
466+ % self.max_id)
467+
468+ def isDone(self):
469+ if self.index > self.max_id:
470+ log.info(
471+ "Flagged %d expired files for removal."
472+ % self.total_flagged)
473+ return True
474+ else:
475+ return False
476+
477+ def __call__(self, chunksize):
478+ chunksize = int(chunksize)
479+ cur = self.con.cursor()
480 cur.execute("""
481 UPDATE LibraryFileContent SET deleted=TRUE
482- WHERE id = %d
483- """ % content_id)
484- if commit_counter % 100 == 0:
485- connection.commit()
486- connection.commit()
487- log.info("Flagged %d expired files for removal." % len(expired_ids))
488+ FROM (
489+ SELECT content FROM ExpiredLibraryFileContent
490+ WHERE id BETWEEN %s AND %s
491+ ) AS ExpiredLibraryFileContent
492+ WHERE LibraryFileContent.id = ExpiredLibraryFileContent.content
493+ """, (self.index, self.index + chunksize - 1))
494+ flagged_rows = cur.rowcount
495+ log.debug(
496+ "Flagged %d expired LibraryFileContent for removal."
497+ % flagged_rows)
498+ self.total_flagged += flagged_rows
499+ self.index += chunksize
500+ self.con.commit()
501+
502+
503+def flag_expired_files(connection):
504+ """Invoke FlagExpiredFiles."""
505+ loop_tuner = DBLoopTuner(FlagExpiredFiles(connection), 5, log=log)
506+ loop_tuner.run()
507
508
509 def delete_unwanted_files(con):
510@@ -380,38 +489,72 @@
511 cur.execute("SELECT max(id) from LibraryFileContent")
512 max_id = cur.fetchone()[0]
513
514- # Build a set containing all stored LibraryFileContent ids
515- # that we want to keep.
516+ # Calculate all stored LibraryFileContent ids that we want to keep.
517+ # Results are ordered so we don't have to suck them all in at once.
518 cur.execute("""
519 SELECT id FROM LibraryFileContent
520 WHERE deleted IS FALSE OR datecreated
521 > CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - '1 day'::interval
522+ ORDER BY id
523 """)
524- all_ids = set(row[0] for row in cur.fetchall())
525+
526+ def get_next_wanted_content_id():
527+ result = cur.fetchone()
528+ if result is None:
529+ return None
530+ else:
531+ return result[0]
532
533 count = 0
534+ next_wanted_content_id = get_next_wanted_content_id()
535+
536 for content_id in range(1, max_id+1):
537- if content_id in all_ids:
538- continue # Linked in the db - do nothing
539+ while (next_wanted_content_id is not None
540+ and content_id > next_wanted_content_id):
541+ next_wanted_content_id = get_next_wanted_content_id()
542+
543+ file_wanted = (
544+ next_wanted_content_id is not None
545+ and next_wanted_content_id == content_id)
546+
547 path = get_file_path(content_id)
548
549- if not os.path.exists(path):
550- continue # Exists neither on disk nor in the database - do nothing
551-
552- one_day = 24 * 60 * 60
553- if time() - os.path.getctime(path) < one_day:
554- log.debug(
555- "File %d not removed - created too recently" % content_id)
556- continue # File created too recently - do nothing
557-
558- # File uploaded a while ago but not in the database - remove it
559- log.debug("Deleting %s" % path)
560- os.remove(path)
561- count += 1
562+ if file_wanted:
563+ if (config.librarian_server.upstream_host is None
564+ and not os.path.exists(path)):
565+ # It is normal to have files in the database that are
566+ # not on disk if the Librarian has an upstream
567+ # Librarian, such as on staging. Don't spam in this
568+ # case.
569+ log.error(
570+ "LibraryFileContent %d exists in the db but not at %s"
571+ % (content_id, path))
572+
573+ else:
574+ try:
575+ one_day = 24 * 60 * 60
576+ if time() - os.path.getctime(path) < one_day:
577+ log.debug(
578+ "File %d not removed - created too recently"
579+ % content_id)
580+ else:
581+ # File uploaded a while ago but no longer wanted.
582+ os.unlink(path)
583+ log.debug("Deleted %s" % path)
584+ count += 1
585+ except OSError, e:
586+ if e.errno != errno.ENOENT:
587+ raise
588+ if config.librarian_server.upstream_host is None:
589+ # It is normal to have files in the database that
590+ # are not on disk if the Librarian has an upstream
591+ # Librarian, such as on staging. Don't annoy the
592+ # operator with noise in this case.
593+ log.info("%s already deleted", path)
594
595 log.info(
596- "Removed %d from disk that where no longer referenced in the db"
597- % count
598+ "Deleted %d files from disk that where no longer referenced "
599+ "in the db" % count
600 )
601
602