Merge lp:~gholt/swift/containerupdater into lp:~hudson-openstack/swift/trunk

Proposed by gholt
Status: Merged
Approved by: Mike Barton
Approved revision: 192
Merged at revision: 198
Proposed branch: lp:~gholt/swift/containerupdater
Merge into: lp:~hudson-openstack/swift/trunk
Diff against target: 184 lines (+69/-19)
4 files modified
doc/source/deployment_guide.rst (+19/-13)
etc/container-server.conf-sample (+2/-0)
swift/container/updater.py (+47/-6)
test/unit/container/test_updater.py (+1/-0)
To merge this branch: bzr merge lp:~gholt/swift/containerupdater
Reviewer | Review Type | Date Requested | Status
Chuck Thier (community) | | | Approve
Mike Barton | | | Pending
Review via email: mp+47098@code.launchpad.net

Description of the change

container-updater: temporary account update suppression on errors

To post a comment you must log in.
Revision history for this message
Chuck Thier (cthier) wrote :

Seems reasonable as a hack until we can rewrite this.

review: Approve
lp:~gholt/swift/containerupdater updated
191. By gholt

Update to load suppressions from both os.wait points

192. By gholt

Merge from trunk

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'doc/source/deployment_guide.rst'
2--- doc/source/deployment_guide.rst 2011-01-24 23:16:48 +0000
3+++ doc/source/deployment_guide.rst 2011-01-25 23:27:14 +0000
4@@ -371,19 +371,25 @@
5
6 [container-updater]
7
8-================== ================= =======================================
9-Option Default Description
10------------------- ----------------- ---------------------------------------
11-log_name container-updater Label used when logging
12-log_facility LOG_LOCAL0 Syslog log facility
13-log_level INFO Logging level
14-interval 300 Minimum time for a pass to take
15-concurrency 4 Number of updater workers to spawn
16-node_timeout 3 Request timeout to external services
17-conn_timeout 0.5 Connection timeout to external services
18-slowdown 0.01 Time in seconds to wait between
19- containers
20-================== ================= =======================================
21+======================== ================= ==================================
22+Option Default Description
23+------------------------ ----------------- ----------------------------------
24+log_name container-updater Label used when logging
25+log_facility LOG_LOCAL0 Syslog log facility
26+log_level INFO Logging level
27+interval 300 Minimum time for a pass to take
28+concurrency 4 Number of updater workers to spawn
29+node_timeout 3 Request timeout to external
30+ services
31+conn_timeout 0.5 Connection timeout to external
32+ services
33+slowdown 0.01 Time in seconds to wait between
34+ containers
35+account_suppression_time 60 Seconds to suppress updating an
36+ account that has generated an
37+ error (timeout, not yet found,
38+ etc.)
39+======================== ================= ==================================
40
41 [container-auditor]
42
43
44=== modified file 'etc/container-server.conf-sample'
45--- etc/container-server.conf-sample 2011-01-23 21:18:28 +0000
46+++ etc/container-server.conf-sample 2011-01-25 23:27:14 +0000
47@@ -50,6 +50,8 @@
48 # conn_timeout = 0.5
49 # slowdown will sleep that amount between containers
50 # slowdown = 0.01
51+# Seconds to suppress updating an account that has generated an error
52+# account_suppression_time = 60
53
54 [container-auditor]
55 # You can override the default log routing for this app here (don't use set!):
56
57=== modified file 'swift/container/updater.py'
58--- swift/container/updater.py 2011-01-14 11:30:17 +0000
59+++ swift/container/updater.py 2011-01-25 23:27:14 +0000
60@@ -19,6 +19,7 @@
61 import sys
62 import time
63 from random import random, shuffle
64+from tempfile import mkstemp
65
66 from eventlet import spawn, patcher, Timeout
67
68@@ -51,6 +52,10 @@
69 self.no_changes = 0
70 self.successes = 0
71 self.failures = 0
72+ self.account_suppressions = {}
73+ self.account_suppression_time = \
74+ float(conf.get('account_suppression_time', 60))
75+ self.new_account_suppressions = None
76
77 def get_account_ring(self):
78 """Get the account ring. Load it if it hasn't been yet."""
79@@ -80,6 +85,19 @@
80 shuffle(paths)
81 return paths
82
83+ def _load_suppressions(self, filename):
84+ try:
85+ with open(filename, 'r') as tmpfile:
86+ for line in tmpfile:
87+ account, until = line.split()
88+ until = float(until)
89+ self.account_suppressions[account] = until
90+ except:
91+ self.logger.exception(
92+ _('ERROR with loading suppressions from %s: ') % filename)
93+ finally:
94+ os.unlink(filename)
95+
96 def run_forever(self): # pragma: no cover
97 """
98 Run the updator continuously.
99@@ -88,21 +106,33 @@
100 while True:
101 self.logger.info(_('Begin container update sweep'))
102 begin = time.time()
103- pids = []
104+ now = time.time()
105+ expired_suppressions = \
106+ [a for a, u in self.account_suppressions.iteritems() if u < now]
107+ for account in expired_suppressions:
108+ del self.account_suppressions[account]
109+ pid2filename = {}
110 # read from account ring to ensure it's fresh
111 self.get_account_ring().get_nodes('')
112 for path in self.get_paths():
113- while len(pids) >= self.concurrency:
114- pids.remove(os.wait()[0])
115+ while len(pid2filename) >= self.concurrency:
116+ pid = os.wait()[0]
117+ try:
118+ self._load_suppressions(pid2filename[pid])
119+ finally:
120+ del pid2filename[pid]
121+ fd, tmpfilename = mkstemp()
122+ os.close(fd)
123 pid = os.fork()
124 if pid:
125- pids.append(pid)
126+ pid2filename[pid] = tmpfilename
127 else:
128 signal.signal(signal.SIGTERM, signal.SIG_DFL)
129 patcher.monkey_patch(all=False, socket=True)
130 self.no_changes = 0
131 self.successes = 0
132 self.failures = 0
133+ self.new_account_suppressions = open(tmpfilename, 'w')
134 forkbegin = time.time()
135 self.container_sweep(path)
136 elapsed = time.time() - forkbegin
137@@ -114,8 +144,12 @@
138 'success': self.successes, 'fail': self.failures,
139 'no_change': self.no_changes})
140 sys.exit()
141- while pids:
142- pids.remove(os.wait()[0])
143+ while pid2filename:
144+ pid = os.wait()[0]
145+ try:
146+ self._load_suppressions(pid2filename[pid])
147+ finally:
148+ del pid2filename[pid]
149 elapsed = time.time() - begin
150 self.logger.info(_('Container update sweep completed: %.02fs'),
151 elapsed)
152@@ -165,6 +199,8 @@
153 # definitely doesn't have up to date statistics.
154 if float(info['put_timestamp']) <= 0:
155 return
156+ if self.account_suppressions.get(info['account'], 0) > time.time():
157+ return
158 if info['put_timestamp'] > info['reported_put_timestamp'] or \
159 info['delete_timestamp'] > info['reported_delete_timestamp'] \
160 or info['object_count'] != info['reported_object_count'] or \
161@@ -195,6 +231,11 @@
162 self.logger.debug(
163 _('Update report failed for %(container)s %(dbfile)s'),
164 {'container': container, 'dbfile': dbfile})
165+ self.account_suppressions[info['account']] = until = \
166+ time.time() + self.account_suppression_time
167+ if self.new_account_suppressions:
168+ print >>self.new_account_suppressions, \
169+ info['account'], until
170 else:
171 self.no_changes += 1
172
173
174=== modified file 'test/unit/container/test_updater.py'
175--- test/unit/container/test_updater.py 2011-01-25 01:12:38 +0000
176+++ test/unit/container/test_updater.py 2011-01-25 23:27:14 +0000
177@@ -78,6 +78,7 @@
178 'interval': '1',
179 'concurrency': '1',
180 'node_timeout': '15',
181+ 'account_suppression_time': 0
182 })
183 cu.run_once()
184 containers_dir = os.path.join(self.sda1, container_server.DATADIR)