Merge lp:~wgrant/launchpad/rescue-aborted-and-robbed-builders into lp:launchpad

Proposed by William Grant on 2010-03-27
Status: Rejected
Rejected by: Julian Edwards on 2010-08-06
Proposed branch: lp:~wgrant/launchpad/rescue-aborted-and-robbed-builders
Merge into: lp:launchpad
Diff against target: 311 lines (+191/-7) (has conflicts)
4 files modified
lib/lp/buildmaster/buildergroup.py (+86/-0)
lib/lp/buildmaster/model/buildfarmjobbehavior.py (+9/-1)
lib/lp/soyuz/doc/buildd-slavescanner.txt (+74/-6)
lib/lp/soyuz/tests/soyuzbuilddhelpers.py (+22/-0)
Text conflict in lib/lp/buildmaster/buildergroup.py
Text conflict in lib/lp/soyuz/doc/buildd-slavescanner.txt
To merge this branch: bzr merge lp:~wgrant/launchpad/rescue-aborted-and-robbed-builders
Reviewer Review Type Date Requested Status
Canonical Launchpad Engineering 2010-03-27 Pending
Review via email: mp+22289@code.launchpad.net
To post a comment you must log in.
Julian Edwards (julian-edwards) wrote :

Hey William, are you still working on this? It's become rather important as when we kill builds, it leaves the builder stuck after the b-m sees it building something we don't know about.

William Grant (wgrant) wrote :

Sorry, feel free to take this over. It needs a post-Wellington rewrite. It also needs slave work to make an abort kill sbuild properly.

Unmerged revisions

9795. By William Grant on 2010-03-19

Fix buildd-slavescanner.txt.

9794. By William Grant on 2010-03-19

Merge devel.

9793. By William Grant on 2009-10-30

Clean ABORTED slaves if they do not have a BuildQueue to clean up for them.

9792. By William Grant on 2009-10-29

Correct a comment.

9791. By William Grant on 2009-10-29

Rescue builders associated with non-building buildqueues.

9790. By William Grant on 2009-10-29

Verify that builders are rescued if their reported buildqueue is not assigned to them.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/buildmaster/buildergroup.py'
2--- lib/lp/buildmaster/buildergroup.py 2010-03-24 04:51:25 +0000
3+++ lib/lp/buildmaster/buildergroup.py 2010-03-27 12:17:46 +0000
4@@ -49,3 +49,89 @@
5
6 # Commit the updates made to the builders.
7 self.commit()
8+<<<<<<< TREE
9+=======
10+
11+ def updateBuilderStatus(self, builder, arch):
12+ """Update the status for a builder by probing it.
13+
14+ :param builder: A builder object.
15+ :param arch: The expected architecture family of the builder.
16+ """
17+ self.logger.debug('Checking %s' % builder.name)
18+ try:
19+ builder.checkSlaveAlive()
20+ builder.checkCanBuildForDistroArchSeries(arch)
21+ self.rescueBuilderIfLost(builder)
22+ # Catch only known exceptions.
23+ # XXX cprov 2007-06-15 bug=120571: ValueError & TypeError catching is
24+ # disturbing in this context. We should spend some time sanitizing the
25+ # exceptions raised in the Builder API since we already started the
26+ # main refactoring of this area.
27+ except (ValueError, TypeError, xmlrpclib.Fault,
28+ BuildDaemonError), reason:
29+ builder.failBuilder(str(reason))
30+ self.logger.warn(
31+ "%s (%s) marked as failed due to: %s",
32+ builder.name, builder.url, builder.failnotes, exc_info=True)
33+ except socket.error, reason:
34+ error_message = str(reason)
35+ builder.handleTimeout(self.logger, error_message)
36+
37+ def rescueBuilderIfLost(self, builder):
38+ """Reset Builder slave if job information doesn't match with DB.
39+
40+ If builder is BUILDING or WAITING but has an information record
41+ that doesn't match what is stored in the DB, we have to dismiss
42+ its current actions and let the slave free for another job,
43+ assuming the XMLRPC is working properly at this point.
44+
45+ We also clean a builder if it is ABORTED but does not have an
46+ associated BuildQueue to clean up after it.
47+ """
48+ status_sentence = builder.slaveStatusSentence()
49+
50+ # 'ident_position' dict relates the position of the job identifier
51+ # token in the sentence received from status(), according to the
52+ # two statuses we care about. See lib/canonical/buildd/slave.py
53+ # for further information about sentence format.
54+ ident_position = {
55+ 'BuilderStatus.BUILDING': 1,
56+ 'BuilderStatus.WAITING': 2
57+ }
58+
59+ # Isolate the BuilderStatus string, always the first token in
60+ # the sentence; see lib/canonical/buildd/slave.py and
61+ # IBuilder.slaveStatusSentence().
62+ status = status_sentence[0]
63+
64+ if status == 'BuilderStatus.ABORTED' and builder.currentjob is None:
65+ builder.cleanSlave()
66+ self.logger.warn(
67+ "Builder '%s' cleaned up from ABORTED" % builder.name)
68+ return
69+
70+ # If the slave is neither building nor waiting, it's not in need of rescuing.
71+ if status not in ident_position.keys():
72+ return
73+
74+ slave_build_id = status_sentence[ident_position[status]]
75+
76+ try:
77+ builder.verifySlaveBuildID(slave_build_id)
78+ except CorruptBuildID, reason:
79+ if status == 'BuilderStatus.WAITING':
80+ builder.cleanSlave()
81+ else:
82+ builder.requestAbort()
83+ self.logger.warn("Builder '%s' rescued from '%s': '%s'" % (
84+ builder.name, slave_build_id, reason))
85+
86+ def updateBuild(self, queueItem):
87+ """Verify the current build job status.
88+
89+ Perform the required actions for each state.
90+ """
91+ queueItem.builder.updateBuild(queueItem)
92+ self.commit()
93+>>>>>>> MERGE-SOURCE
94
95=== modified file 'lib/lp/buildmaster/model/buildfarmjobbehavior.py'
96--- lib/lp/buildmaster/model/buildfarmjobbehavior.py 2010-03-12 16:10:12 +0000
97+++ lib/lp/buildmaster/model/buildfarmjobbehavior.py 2010-03-27 12:17:46 +0000
98@@ -154,13 +154,21 @@
99 build_id, queue_item_id = slave_build_id.split('-')
100 except ValueError:
101 raise CorruptBuildID('Malformed build ID')
102-
103+
104 build = self.getVerifiedBuild(build_id)
105 queue_item = self.getVerifiedBuildQueue(queue_item_id)
106
107 if build != queue_item.specific_job.build:
108 raise CorruptBuildID('Job build entry mismatch')
109
110+ # Verify that the builder is properly assigned.
111+ if queue_item.builder != self._builder:
112+ raise CorruptBuildID('Job builder mismatch')
113+
114+ # And verify that the job is marked as started.
115+ if queue_item.job.date_started is None:
116+ raise CorruptBuildID('Job not started')
117+
118 def updateBuild(self, queueItem):
119 """See `IBuildFarmJobBehavior`."""
120 logger = logging.getLogger('slave-scanner')
121
122=== modified file 'lib/lp/soyuz/doc/buildd-slavescanner.txt'
123--- lib/lp/soyuz/doc/buildd-slavescanner.txt 2010-03-24 04:51:25 +0000
124+++ lib/lp/soyuz/doc/buildd-slavescanner.txt 2010-03-27 12:17:46 +0000
125@@ -42,7 +42,8 @@
126 ... AbortedSlave, AbortingSlave, BrokenSlave, BuildingSlave,
127 ... InsaneWaitingSlave, LostBuildingBrokenSlave,
128 ... LostBuildingSlave, LostWaitingSlave, MockBuilder, OkSlave,
129- ... SaneBuildingSlave, SaneWaitingSlave, WaitingSlave)
130+ ... RobbedBuildingSlave, RobbedWaitingSlave, SaneBuildingSlave,
131+ ... SaneWaitingSlave, WaitingSlave)
132
133 Let's play with a BuilderGroup method designed to rescue build slaves
134 that are processing unknown jobs. In real conditions, this situation
135@@ -67,16 +68,30 @@
136 Initializing the sane_builder. It was not rescued, since the job
137 identifier is sane (Build.id == 8 and BuildQueue.id == 1 exist):
138
139+<<<<<<< TREE
140 >>> sanebuilding_builder = MockBuilder(
141 ... 'Sane Building Slave', SaneBuildingSlave())
142 >>> sanebuilding_builder.rescueIfLost(buildergroup.logger) is None
143+=======
144+ >>> from lp.buildmaster.interfaces.builder import IBuilderSet
145+ >>> from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
146+ >>> from lp.soyuz.interfaces.build import IBuildSet
147+ >>> bob = getUtility(IBuilderSet)['bob']
148+ >>> bob.setSlaveForTesting(SaneBuildingSlave())
149+ >>> buildergroup.rescueBuilderIfLost(bob) is None
150+>>>>>>> MERGE-SOURCE
151 True
152
153 A sane WAITING slave:
154
155+<<<<<<< TREE
156 >>> sanewaiting_builder = MockBuilder(
157 ... 'Sane Waiting Slave', SaneWaitingSlave())
158 >>> sanewaiting_builder.rescueIfLost(buildergroup.logger) is None
159+=======
160+ >>> bob.setSlaveForTesting(SaneWaitingSlave())
161+ >>> buildergroup.rescueBuilderIfLost(bob) is None
162+>>>>>>> MERGE-SOURCE
163 True
164
165 An insane WAITING slave, with wrong BuildQueue/Build relation:
166@@ -108,6 +123,56 @@
167
168 Both got rescued, as expected.
169
170+A builder can also be lost if it references a valid Build/BuildQueue
171+combination, but the BuildQueue isn't both started and assigned to that builder.
172+This can happen if the BuildQueue is stolen back from the builder, which
173+happens most often when the scanner temporarily loses contact with a builder.
174+
175+ >>> robbedbuilding_builder = MockBuilder(
176+ ... 'Robbed Building Slave', RobbedBuildingSlave())
177+ >>> buildergroup.rescueBuilderIfLost(robbedbuilding_builder)
178+ WARNING:root:Builder 'Robbed Building Slave' rescued from
179+ '8-1': 'Job builder mismatch'
180+
181+ >>> robbedwaiting_builder = MockBuilder(
182+ ... 'Robbed Waiting Slave', RobbedWaitingSlave())
183+ >>> buildergroup.rescueBuilderIfLost(robbedwaiting_builder)
184+ WARNING:root:Builder 'Robbed Waiting Slave' rescued from
185+ '8-1': 'Job builder mismatch'
186+
187+ >>> getUtility(IBuildQueueSet).get(1).setDateStarted(None)
188+
189+ >>> bob.setSlaveForTesting(RobbedBuildingSlave())
190+ >>> buildergroup.rescueBuilderIfLost(bob)
191+ WARNING:root:Builder 'bob' rescued from
192+ '8-1': 'Job not started'
193+
194+ >>> bob.setSlaveForTesting(RobbedWaitingSlave())
195+ >>> buildergroup.rescueBuilderIfLost(bob)
196+ WARNING:root:Builder 'bob' rescued from
197+ '8-1': 'Job not started'
198+
199+If a slave is rescued while it is BUILDING, the current build will be aborted.
200+In this case, a further rescue effort is required later, once the abort has
201+completed.
202+
203+This is needed only if the builder has no assigned buildqueue in the
204+database, as otherwise the associated buildqueue will handle the ABORTED
205+state.
206+
207+ >>> getUtility(IBuildQueueSet).get(1).builder = bob
208+
209+ >>> bob.setSlaveForTesting(AbortedSlave())
210+ >>> buildergroup.rescueBuilderIfLost(bob)
211+
212+ >>> getUtility(IBuildQueueSet).get(1).builder = None
213+
214+ >>> bob.setSlaveForTesting(AbortedSlave())
215+ >>> buildergroup.rescueBuilderIfLost(bob)
216+ WARNING:root:Builder 'bob' cleaned up from ABORTED
217+
218+ >>> getUtility(IBuildQueueSet).get(1).builder = bob
219+
220 A BUILDING or WAITING slave without a build assigned in the DB will also be rescued.
221
222 >>> from lp.buildmaster.model.buildfarmjobbehavior import IdleBuildBehavior
223@@ -130,7 +195,13 @@
224 >>> lostbuilding_builder = MockBuilder(
225 ... 'Lost Building Broken Slave', LostBuildingBrokenSlave())
226
227+<<<<<<< TREE
228 >>> lostbuilding_builder.updateStatus(buildergroup.logger)
229+=======
230+ >>> from lp.registry.interfaces.distribution import IDistributionSet
231+ >>> hoary_i386 = getUtility(IDistributionSet)['ubuntu']['hoary']['i386']
232+ >>> buildergroup.updateBuilderStatus(lostbuilding_builder, hoary_i386)
233+>>>>>>> MERGE-SOURCE
234 WARNING:root:Lost Building Broken Slave (http://fake:0000) marked as failed due to: <Fault 8002: 'Could not abort'>
235 Traceback (most recent call last):
236 ...
237@@ -191,8 +262,6 @@
238 The slavescanner system also performs build-notification for the
239 following states: FAILEDTOBUILD and CHROOTWAIT
240
241- >>> from lp.buildmaster.interfaces.builder import IBuilderSet
242- >>> from lp.soyuz.interfaces.build import IBuildSet
243 >>> import datetime, pytz
244
245 >>> UTC = pytz.timezone('UTC')
246@@ -759,7 +828,6 @@
247
248 Retrieve a known DistroArchSeries
249
250- >>> from canonical.launchpad.interfaces import IDistributionSet
251 >>> hoary_i386 = getUtility(IDistributionSet)['ubuntu']['hoary']['i386']
252 >>> warty_i386 = getUtility(IDistributionSet)['ubuntu']['warty']['i386']
253
254@@ -854,8 +922,8 @@
255 permissive database user.
256
257 >>> from canonical.config import config
258- >>> from canonical.launchpad.interfaces import PackagePublishingStatus
259 >>> from canonical.testing.layers import LaunchpadZopelessLayer
260+ >>> from lp.soyuz.interfaces.publishing import PackagePublishingStatus
261
262 >>> spr = build.sourcepackagerelease
263 >>> pub = removeSecurityProxy(build).current_source_publication
264@@ -1415,7 +1483,7 @@
265 Change the distroseries status for testing. FROZEN allows building in
266 all pockets:
267
268- >>> from canonical.launchpad.interfaces import SeriesStatus
269+ >>> from lp.registry.interfaces.series import SeriesStatus
270 >>> hoary_i386.distroseries.status = SeriesStatus.FROZEN
271
272 Now we can start a build in other pockets, and see what archives are
273
274=== modified file 'lib/lp/soyuz/tests/soyuzbuilddhelpers.py'
275--- lib/lp/soyuz/tests/soyuzbuilddhelpers.py 2010-03-26 01:52:07 +0000
276+++ lib/lp/soyuz/tests/soyuzbuilddhelpers.py 2010-03-27 12:17:46 +0000
277@@ -13,6 +13,8 @@
278 'LostBuildingSlave',
279 'LostWaitingSlave',
280 'LostBuildingBrokenSlave',
281+ 'RobbedBuildingSlave',
282+ 'RobbedWaitingSlave',
283 'BrokenSlave',
284 'OkSlave',
285 'BuildingSlave',
286@@ -154,6 +156,26 @@
287 raise xmlrpclib.Fault(8002, "Could not abort")
288
289
290+class RobbedBuildingSlave:
291+ """A mock slave building a BuildQueue that is not assigned to it."""
292+
293+ def status(self):
294+ return ('BuilderStatus.BUILDING', '8-1')
295+
296+ def abort(self):
297+ pass
298+
299+
300+class RobbedWaitingSlave:
301+ """A mock slave building a BuildQueue that is not assigned to it."""
302+
303+ def status(self):
304+ return ('BuilderStatus.WAITING', 'BuildStatus.OK', '8-1')
305+
306+ def clean(self):
307+ pass
308+
309+
310 class BrokenSlave:
311 """A mock slave that reports that it is broken."""
312