Merge lp:~gholt/swift/containerupdater into lp:~hudson-openstack/swift/trunk

Proposed by gholt
Status: Merged
Approved by: Mike Barton
Approved revision: 192
Merged at revision: 198
Proposed branch: lp:~gholt/swift/containerupdater
Merge into: lp:~hudson-openstack/swift/trunk
Diff against target: 184 lines (+69/-19)
4 files modified
doc/source/deployment_guide.rst (+19/-13)
etc/container-server.conf-sample (+2/-0)
swift/container/updater.py (+47/-6)
test/unit/container/test_updater.py (+1/-0)
To merge this branch: bzr merge lp:~gholt/swift/containerupdater
Reviewer Review Type Date Requested Status
Chuck Thier (community) Approve
Mike Barton Pending
Review via email: mp+47098@code.launchpad.net

Description of the change

container-updater: temporary account update suppression on errors

To post a comment you must log in.
Revision history for this message
Chuck Thier (cthier) wrote :

Seems reasonable as a hack until we can rewrite this.

review: Approve
lp:~gholt/swift/containerupdater updated
191. By gholt

Update to load suppressions from both os.wait points

192. By gholt

Merge from trunk

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'doc/source/deployment_guide.rst'
--- doc/source/deployment_guide.rst 2011-01-24 23:16:48 +0000
+++ doc/source/deployment_guide.rst 2011-01-25 23:27:14 +0000
@@ -371,19 +371,25 @@
 
 [container-updater]
 
-==================  =================  =======================================
-Option              Default            Description
-------------------  -----------------  ---------------------------------------
-log_name            container-updater  Label used when logging
-log_facility        LOG_LOCAL0         Syslog log facility
-log_level           INFO               Logging level
-interval            300                Minimum time for a pass to take
-concurrency         4                  Number of updater workers to spawn
-node_timeout        3                  Request timeout to external services
-conn_timeout        0.5                Connection timeout to external services
-slowdown            0.01               Time in seconds to wait between
-                                       containers
-==================  =================  =======================================
+========================  =================  ==================================
+Option                    Default            Description
+------------------------  -----------------  ----------------------------------
+log_name                  container-updater  Label used when logging
+log_facility              LOG_LOCAL0         Syslog log facility
+log_level                 INFO               Logging level
+interval                  300                Minimum time for a pass to take
+concurrency               4                  Number of updater workers to spawn
+node_timeout              3                  Request timeout to external
+                                             services
+conn_timeout              0.5                Connection timeout to external
+                                             services
+slowdown                  0.01               Time in seconds to wait between
+                                             containers
+account_suppression_time  60                 Seconds to suppress updating an
+                                             account that has generated an
+                                             error (timeout, not yet found,
+                                             etc.)
+========================  =================  ==================================
 
 [container-auditor]
 

=== modified file 'etc/container-server.conf-sample'
--- etc/container-server.conf-sample 2011-01-23 21:18:28 +0000
+++ etc/container-server.conf-sample 2011-01-25 23:27:14 +0000
@@ -50,6 +50,8 @@
 # conn_timeout = 0.5
 # slowdown will sleep that amount between containers
 # slowdown = 0.01
+# Seconds to suppress updating an account that has generated an error
+# account_suppression_time = 60
 
 [container-auditor]
 # You can override the default log routing for this app here (don't use set!):

=== modified file 'swift/container/updater.py'
--- swift/container/updater.py 2011-01-14 11:30:17 +0000
+++ swift/container/updater.py 2011-01-25 23:27:14 +0000
@@ -19,6 +19,7 @@
 import sys
 import time
 from random import random, shuffle
+from tempfile import mkstemp
 
 from eventlet import spawn, patcher, Timeout
 
@@ -51,6 +52,10 @@
         self.no_changes = 0
         self.successes = 0
         self.failures = 0
+        self.account_suppressions = {}
+        self.account_suppression_time = \
+            float(conf.get('account_suppression_time', 60))
+        self.new_account_suppressions = None
 
     def get_account_ring(self):
         """Get the account ring. Load it if it hasn't been yet."""
@@ -80,6 +85,19 @@
         shuffle(paths)
         return paths
 
+    def _load_suppressions(self, filename):
+        try:
+            with open(filename, 'r') as tmpfile:
+                for line in tmpfile:
+                    account, until = line.split()
+                    until = float(until)
+                    self.account_suppressions[account] = until
+        except:
+            self.logger.exception(
+                _('ERROR with loading suppressions from %s: ') % filename)
+        finally:
+            os.unlink(filename)
+
     def run_forever(self): # pragma: no cover
         """
         Run the updator continuously.
@@ -88,21 +106,33 @@
         while True:
             self.logger.info(_('Begin container update sweep'))
             begin = time.time()
-            pids = []
+            now = time.time()
+            expired_suppressions = \
+                [a for a, u in self.account_suppressions.iteritems() if u < now]
+            for account in expired_suppressions:
+                del self.account_suppressions[account]
+            pid2filename = {}
             # read from account ring to ensure it's fresh
             self.get_account_ring().get_nodes('')
             for path in self.get_paths():
-                while len(pids) >= self.concurrency:
-                    pids.remove(os.wait()[0])
+                while len(pid2filename) >= self.concurrency:
+                    pid = os.wait()[0]
+                    try:
+                        self._load_suppressions(pid2filename[pid])
+                    finally:
+                        del pid2filename[pid]
+                fd, tmpfilename = mkstemp()
+                os.close(fd)
                 pid = os.fork()
                 if pid:
-                    pids.append(pid)
+                    pid2filename[pid] = tmpfilename
                 else:
                     signal.signal(signal.SIGTERM, signal.SIG_DFL)
                     patcher.monkey_patch(all=False, socket=True)
                     self.no_changes = 0
                     self.successes = 0
                     self.failures = 0
+                    self.new_account_suppressions = open(tmpfilename, 'w')
                     forkbegin = time.time()
                     self.container_sweep(path)
                     elapsed = time.time() - forkbegin
@@ -114,8 +144,12 @@
                          'success': self.successes, 'fail': self.failures,
                          'no_change': self.no_changes})
                     sys.exit()
-            while pids:
-                pids.remove(os.wait()[0])
+            while pid2filename:
+                pid = os.wait()[0]
+                try:
+                    self._load_suppressions(pid2filename[pid])
+                finally:
+                    del pid2filename[pid]
             elapsed = time.time() - begin
             self.logger.info(_('Container update sweep completed: %.02fs'),
                              elapsed)
@@ -165,6 +199,8 @@
         # definitely doesn't have up to date statistics.
         if float(info['put_timestamp']) <= 0:
             return
+        if self.account_suppressions.get(info['account'], 0) > time.time():
+            return
         if info['put_timestamp'] > info['reported_put_timestamp'] or \
                 info['delete_timestamp'] > info['reported_delete_timestamp'] \
                 or info['object_count'] != info['reported_object_count'] or \
@@ -195,6 +231,11 @@
                 self.logger.debug(
                     _('Update report failed for %(container)s %(dbfile)s'),
                     {'container': container, 'dbfile': dbfile})
+                self.account_suppressions[info['account']] = until = \
+                    time.time() + self.account_suppression_time
+                if self.new_account_suppressions:
+                    print >>self.new_account_suppressions, \
+                        info['account'], until
         else:
             self.no_changes += 1
 

=== modified file 'test/unit/container/test_updater.py'
--- test/unit/container/test_updater.py 2011-01-25 01:12:38 +0000
+++ test/unit/container/test_updater.py 2011-01-25 23:27:14 +0000
@@ -78,6 +78,7 @@
             'interval': '1',
             'concurrency': '1',
             'node_timeout': '15',
+            'account_suppression_time': 0
             })
         cu.run_once()
         containers_dir = os.path.join(self.sda1, container_server.DATADIR)