Merge lp:~chipaca/ubuntu-push/flapping-fixes into lp:ubuntu-push

Proposed by John Lenton
Status: Superseded
Proposed branch: lp:~chipaca/ubuntu-push/flapping-fixes
Merge into: lp:ubuntu-push
Diff against target: 488 lines (+116/-70)
14 files modified
bus/connectivity/connectivity.go (+16/-12)
bus/connectivity/connectivity_test.go (+10/-11)
bus/connectivity/webchecker.go (+4/-4)
bus/endpoint.go (+1/-1)
bus/systemimage/systemimage.go (+1/-1)
client/client.go (+1/-2)
client/service/postal.go (+0/-1)
client/session/session.go (+28/-1)
dependencies.tsv (+1/-1)
launch_helper/kindpool.go (+1/-1)
poller/poller.go (+1/-1)
util/redialer.go (+36/-27)
util/redialer_states.gv (+9/-0)
util/redialer_test.go (+7/-7)
To merge this branch: bzr merge lp:~chipaca/ubuntu-push/flapping-fixes
Reviewer Review Type Date Requested Status
Ubuntu Push Hackers Pending
Review via email: mp+247190@code.launchpad.net

This proposal has been superseded by a proposal from 2015-01-21.

Commit message

Partially work around the issue in a minimally intrusive way (real fix will have to wait).

Description of the change

Partially work around the issue in a minimally intrusive way (real fix will have to wait).

To post a comment you must log in.
359. By John Lenton

merged lots-of-small-logging-changes into flapping-fixes.

360. By John Lenton

fix a massive data race in session

Unmerged revisions

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bus/connectivity/connectivity.go'
2--- bus/connectivity/connectivity.go 2014-07-04 19:12:38 +0000
3+++ bus/connectivity/connectivity.go 2015-01-21 17:56:14 +0000
4@@ -82,7 +82,7 @@
5 // Get the current state.
6 initial = nm.GetState()
7 if initial == networkmanager.Unknown {
8- cs.log.Debugf("Failed to get state.")
9+ cs.log.Debugf("failed to get state.")
10 goto Continue
11 }
12 cs.log.Debugf("got initial state of %s", initial)
13@@ -132,28 +132,32 @@
14 return false, errors.New("got not-OK from StateChanged watch")
15 }
16 cs.webgetCh = nil
17- cs.currentState = v
18- cs.timer.Reset(stabilizingTimeout)
19- log.Debugf("State changed to %s. Assuming disconnect.", v)
20- if cs.lastSent == true {
21- log.Infof("Sending 'disconnected'.")
22- cs.lastSent = false
23- break Loop
24+ if v != networkmanager.Connecting && cs.currentState != v {
25+ cs.currentState = v
26+ cs.timer.Reset(stabilizingTimeout)
27+ log.Debugf("State changed to %s. Assuming disconnect.", v)
28+ if cs.lastSent == true {
29+ log.Debugf("sending 'disconnected'.")
30+ cs.lastSent = false
31+ break Loop
32+ }
33+ } else {
34+ log.Debugf("got State of %s, current is %s, ignoring.", v, cs.currentState)
35 }
36
37 case <-cs.timer.C:
38 if cs.currentState == networkmanager.ConnectedGlobal {
39- log.Debugf("May be connected; checking...")
40+ log.Debugf("may be connected; checking...")
41 cs.webgetCh = make(chan bool)
42 go cs.webget(cs.webgetCh)
43 }
44
45 case connected := <-cs.webgetCh:
46 cs.timer.Reset(recheckTimeout)
47- log.Debugf("Connection check says: %t", connected)
48+ log.Debugf("connection check says: %t", connected)
49 cs.webgetCh = nil
50 if connected && cs.lastSent == false {
51- log.Infof("Sending 'connected'.")
52+ log.Debugf("sending 'connected'.")
53 cs.lastSent = true
54 break Loop
55 }
56@@ -178,7 +182,7 @@
57 }
58
59 Start:
60- log.Infof("Sending initial 'disconnected'.")
61+ log.Debugf("Sending initial 'disconnected'.")
62 out <- false
63 cs.lastSent = false
64 cs.currentState = cs.start()
65
66=== modified file 'bus/connectivity/connectivity_test.go'
67--- bus/connectivity/connectivity_test.go 2014-07-04 23:00:42 +0000
68+++ bus/connectivity/connectivity_test.go 2015-01-21 17:56:14 +0000
69@@ -217,20 +217,22 @@
70 f, e := cs.connectedStateStep()
71 c.Check(e, IsNil)
72 c.Check(f, Equals, true)
73- ch <- networkmanager.ConnectedGlobal // a ConnectedGlobal when connected signals trouble
74- f, e = cs.connectedStateStep()
75- c.Check(e, IsNil)
76- c.Check(f, Equals, false) // so we assume a disconnect happened
77- f, e = cs.connectedStateStep()
78- c.Check(e, IsNil)
79- c.Check(f, Equals, true) // and if the web check works, go back to connected
80+ ch <- networkmanager.Disconnected
81+ ch <- networkmanager.ConnectedGlobal
82+ f, e = cs.connectedStateStep()
83+ c.Check(e, IsNil)
84+ c.Check(f, Equals, false)
85+ f, e = cs.connectedStateStep()
86+ c.Check(e, IsNil)
87+ c.Check(f, Equals, true)
88
89 // same scenario, but with failing web check
90 webget_p = condition.Fail2Work(1)
91+ ch <- networkmanager.Disconnected
92 ch <- networkmanager.ConnectedGlobal
93 f, e = cs.connectedStateStep()
94 c.Check(e, IsNil)
95- c.Check(f, Equals, false) // first false is from assuming a Connected signals trouble
96+ c.Check(f, Equals, false) // first false is from the Disconnected
97
98 // the next call to Step will time out
99 _ch := make(chan bool, 1)
100@@ -284,7 +286,6 @@
101
102 endp := testingbus.NewTestingEndpoint(condition.Work(true), condition.Work(true),
103 uint32(networkmanager.ConnectedGlobal),
104- uint32(networkmanager.ConnectedGlobal),
105 uint32(networkmanager.Disconnected),
106 )
107
108@@ -303,8 +304,6 @@
109 }{
110 {false, "first state is always false", 0},
111 {true, "then it should be true as per ConnectedGlobal above", 0},
112- {false, "then, false (upon receiving the next ConnectedGlobal)", 2},
113- {true, "then it should be true (webcheck passed)", 0},
114 {false, "then it should be false (Disconnected)", 2},
115 {false, "then it should be false again because it's restarted", 2},
116 }
117
118=== modified file 'bus/connectivity/webchecker.go'
119--- bus/connectivity/webchecker.go 2014-03-20 12:24:33 +0000
120+++ bus/connectivity/webchecker.go 2015-01-21 17:56:14 +0000
121@@ -64,7 +64,7 @@
122 func (wb *webchecker) Webcheck(ch chan<- bool) {
123 response, err := wb.cli.Get(wb.url)
124 if err != nil {
125- wb.log.Errorf("While GETting %s: %v", wb.url, err)
126+ wb.log.Errorf("while GETting %s: %v", wb.url, err)
127 ch <- false
128 return
129 }
130@@ -72,17 +72,17 @@
131 hash := md5.New()
132 _, err = io.CopyN(hash, response.Body, 1024)
133 if err != io.EOF {
134- wb.log.Errorf("Reading %s, expecting EOF, got: %v",
135+ wb.log.Errorf("reading %s, expecting EOF, got: %v",
136 wb.url, err)
137 ch <- false
138 return
139 }
140 sum := fmt.Sprintf("%x", hash.Sum(nil))
141 if sum == wb.target {
142- wb.log.Infof("Connectivity check passed.")
143+ wb.log.Infof("connectivity check passed.")
144 ch <- true
145 } else {
146- wb.log.Infof("Connectivity check failed: content mismatch.")
147+ wb.log.Infof("connectivity check failed: content mismatch.")
148 ch <- false
149 }
150 }
151
152=== modified file 'bus/endpoint.go'
153--- bus/endpoint.go 2014-09-08 15:48:00 +0000
154+++ bus/endpoint.go 2015-01-21 17:56:14 +0000
155@@ -110,7 +110,7 @@
156 return errors.New(msg)
157 }
158 }
159- endp.log.Infof("%#v dialed in.", name)
160+ endp.log.Debugf("%#v dialed in.", name)
161 endp.bus = bus
162 endp.proxy = bus.Object(name, dbus.ObjectPath(endp.addr.Path))
163 return nil
164
165=== modified file 'bus/systemimage/systemimage.go'
166--- bus/systemimage/systemimage.go 2014-04-02 08:46:48 +0000
167+++ bus/systemimage/systemimage.go 2015-01-21 17:56:14 +0000
168@@ -57,7 +57,7 @@
169 var _ SystemImage = &systemImage{} // ensures it conforms
170
171 func (si *systemImage) Info() (*InfoResult, error) {
172- si.log.Debugf("Invoking Info")
173+ si.log.Debugf("invoking Info")
174 res := &InfoResult{}
175 err := si.endp.Call("Info", bus.Args(), &res.BuildNumber, &res.Device, &res.Channel, &res.LastUpdate, &res.VersionDetail)
176 if err != nil {
177
178=== modified file 'client/client.go'
179--- client/client.go 2014-11-03 13:36:00 +0000
180+++ client/client.go 2015-01-21 17:56:14 +0000
181@@ -381,10 +381,9 @@
182 return
183 }
184 client.hasConnectivity = hasConnectivity
185+ client.session.Close()
186 if hasConnectivity {
187 client.session.AutoRedial(client.sessionConnectedCh)
188- } else {
189- client.session.Close()
190 }
191 }
192
193
194=== modified file 'client/service/postal.go'
195--- client/service/postal.go 2014-11-03 13:36:00 +0000
196+++ client/service/postal.go 2015-01-21 17:56:14 +0000
197@@ -244,7 +244,6 @@
198 for _, endp := range endps {
199 go func(name string, endp bus.Endpoint) {
200 util.NewAutoRedialer(endp).Redial()
201- svc.Log.Debugf("%s dialed in", name)
202 wg.Done()
203 }(endp.name, endp.endp)
204 }
205
206=== modified file 'client/session/session.go'
207--- client/session/session.go 2014-11-19 13:58:12 +0000
208+++ client/session/session.go 2015-01-21 17:56:14 +0000
209@@ -74,8 +74,22 @@
210 Connected
211 Started
212 Running
213+ Unknown
214 )
215
216+func (s ClientSessionState) String() string {
217+ if s >= Unknown {
218+ return fmt.Sprintf("??? (%d)", s)
219+ }
220+ return [Unknown]string{
221+ "Error",
222+ "Disconnected",
223+ "Connected",
224+ "Started",
225+ "Running",
226+ }[s]
227+}
228+
229 type hostGetter interface {
230 Get() (*gethosts.Host, error)
231 }
232@@ -360,15 +374,26 @@
233 sess.setShouldDelay()
234 }
235 time.Sleep(sess.redialDelay(sess))
236+ if sess.retrier != nil {
237+ panic("session AutoRedial: unexpected non-nil retrier.")
238+ }
239 sess.retrier = util.NewAutoRedialer(sess)
240 sess.lastAutoRedial = time.Now()
241- go func() { doneCh <- sess.retrier.Redial() }()
242+ go func() {
243+ retrier := sess.retrier
244+ if sess.retrier == nil {
245+ sess.Log.Debugf("session autoredialer skipping retry: retrier has been set to nil.")
246+ return
247+ }
248+ doneCh <- retrier.Redial()
249+ }()
250 }
251
252 func (sess *ClientSession) Close() {
253 sess.stopRedial()
254 sess.doClose()
255 }
256+
257 func (sess *ClientSession) doClose() {
258 sess.connLock.Lock()
259 defer sess.connLock.Unlock()
260@@ -508,6 +533,7 @@
261 sess.proto.SetDeadline(time.Now().Add(deadAfter))
262 err = sess.proto.ReadMessage(&recv)
263 if err != nil {
264+ sess.Log.Debugf("session aborting with error on read.")
265 sess.setState(Error)
266 return err
267 }
268@@ -529,6 +555,7 @@
269 sess.Log.Errorf("server sent warning: %s", recv.Reason)
270 }
271 if err != nil {
272+ sess.Log.Debugf("session aborting with error from handler.")
273 return err
274 }
275 }
276
277=== modified file 'dependencies.tsv'
278--- dependencies.tsv 2014-08-20 12:42:51 +0000
279+++ dependencies.tsv 2015-01-21 17:56:14 +0000
280@@ -1,5 +1,5 @@
281 code.google.com/p/go-uuid hg 7dda39b2e7d5e265014674c5af696ba4186679e9 11
282 code.google.com/p/gosqlite hg 74691fb6f83716190870cde1b658538dd4b18eb0 15
283-launchpad.net/go-dbus/v1 bzr james@jamesh.id.au-20140801110939-lzfql7fk76dt6ckd 128
284+launchpad.net/go-dbus/v1 bzr jlenton@gmail.com-20141023032446-s5icvsucwlv5o38a 129
285 launchpad.net/go-xdg/v0 bzr john.lenton@canonical.com-20140208094800-gubd5md7cro3mtxa 10
286 launchpad.net/gocheck bzr gustavo@niemeyer.net-20140225173054-xu9zlkf9kxhvow02 87
287
288=== modified file 'launch_helper/kindpool.go'
289--- launch_helper/kindpool.go 2014-11-05 12:07:53 +0000
290+++ launch_helper/kindpool.go 2015-01-21 17:56:14 +0000
291@@ -296,7 +296,7 @@
292 if err != nil {
293 pool.log.Errorf("unable to read output from %v helper: %v", args.AppId, err)
294 } else {
295- pool.log.Infof("%v helper output: %#v", args.AppId, payload)
296+ pool.log.Infof("%v helper output: %s", args.AppId, payload)
297 res := &HelperResult{Input: args.Input}
298 err = json.Unmarshal(payload, &res.HelperOutput)
299 if err != nil {
300
301=== modified file 'poller/poller.go'
302--- poller/poller.go 2014-08-29 13:50:16 +0000
303+++ poller/poller.go 2015-01-21 17:56:14 +0000
304@@ -167,7 +167,7 @@
305 // the channel will produce a true for every
306 // wakeup, not only the one we asked for
307 now := time.Now()
308- p.log.Debugf("got woken up; time is %s", now)
309+ p.log.Debugf("got woken up; time is %s (𝛥: %s)", now, now.Sub(t))
310 if !now.Before(t) {
311 break
312 }
313
314=== modified file 'util/redialer.go'
315--- util/redialer.go 2014-03-20 12:15:47 +0000
316+++ util/redialer.go 2015-01-21 17:56:14 +0000
317@@ -19,6 +19,7 @@
318
319 import (
320 "sync"
321+ "sync/atomic"
322 "time"
323 )
324
325@@ -65,30 +66,42 @@
326 Stop() // Stop shuts down the given AutoRedialer, if it is still retrying.
327 }
328
329+type redialerState uint32
330+
331+const (
332+ Unconfigured redialerState = iota
333+ Redialing
334+ Stopped
335+)
336+
337+func (s *redialerState) String() string {
338+ return [3]string{"Unconfigured", "Redialing", "Stopped"}[uint32(*s)]
339+}
340+
341 type autoRedialer struct {
342- stop chan bool
343- lock sync.RWMutex
344+ stateP *redialerState
345 dial func() error
346 jitter func(time.Duration) time.Duration
347 }
348
349+func (ar *autoRedialer) state() redialerState {
350+ return redialerState(atomic.LoadUint32((*uint32)(ar.stateP)))
351+}
352+
353+func (ar *autoRedialer) setState(s redialerState) {
354+ atomic.StoreUint32((*uint32)(ar.stateP), uint32(s))
355+}
356+
357+func (ar *autoRedialer) setStateIfEqual(oldState, newState redialerState) bool {
358+ return atomic.CompareAndSwapUint32((*uint32)(ar.stateP), uint32(oldState), uint32(newState))
359+}
360+
361 func (ar *autoRedialer) Stop() {
362 if ar != nil {
363- ar.lock.RLock()
364- defer ar.lock.RUnlock()
365- if ar.stop != nil {
366- ar.stop <- true
367- }
368+ ar.setState(Stopped)
369 }
370 }
371
372-func (ar *autoRedialer) shutdown() {
373- ar.lock.Lock()
374- defer ar.lock.Unlock()
375- close(ar.stop)
376- ar.stop = nil
377-}
378-
379 // Redial keeps on calling Dial until it stops returning an error. It does
380 // exponential backoff, adding back the output of Jitter at each step.
381 func (ar *autoRedialer) Redial() uint32 {
382@@ -96,20 +109,19 @@
383 // at least it's better than a segfault...
384 panic("you can't Redial a nil AutoRedialer")
385 }
386- if ar.stop == nil {
387- panic("this AutoRedialer has already been shut down")
388+ if !ar.setStateIfEqual(Unconfigured, Redialing) {
389+ // XXX log this
390+ return 0
391 }
392- defer ar.shutdown()
393-
394- ar.lock.RLock()
395- stop := ar.stop
396- ar.lock.RUnlock()
397
398 var timeout time.Duration
399 var dialAttempts uint32 = 0 // unsigned so it can wrap safely ...
400 timeouts := Timeouts()
401 var numTimeouts uint32 = uint32(len(timeouts))
402 for {
403+ if ar.state() != Redialing {
404+ return dialAttempts
405+ }
406 if ar.dial() == nil {
407 return dialAttempts + 1
408 }
409@@ -122,18 +134,15 @@
410 timeout += ar.jitter(timeout)
411 }
412 dialAttempts++
413- select {
414- case <-stop:
415- return dialAttempts
416- case <-time.After(timeout):
417- }
418+ time.Sleep(timeout)
419 }
420 }
421
422 // Returns a stoppable AutoRedialer using the provided Dialer. If the Dialer
423 // is also a Jitterer, the backoff will be jittered.
424 func NewAutoRedialer(dialer Dialer) AutoRedialer {
425- ar := &autoRedialer{stop: make(chan bool), dial: dialer.Dial}
426+ state := Unconfigured
427+ ar := &autoRedialer{stateP: &state, dial: dialer.Dial}
428 jitterer, ok := dialer.(Jitterer)
429 if ok {
430 ar.jitter = jitterer.Jitter
431
432=== added file 'util/redialer_states.gv'
433--- util/redialer_states.gv 1970-01-01 00:00:00 +0000
434+++ util/redialer_states.gv 2015-01-21 17:56:14 +0000
435@@ -0,0 +1,9 @@
436+digraph "redialer" {
437+ "Unconfigured" -> "Redialing" [ label="Redial" ]
438+ "Unconfigured" -> "Stopped" [ label="Stop" ]
439+
440+ "Redialing" -> "Redialing" [ label="Redial" ]
441+ "Redialing" -> "Stopped" [ label="Stop" ]
442+
443+ "Stopped" -> "Stopped" [ label="*" ]
444+}
445
446=== modified file 'util/redialer_test.go'
447--- util/redialer_test.go 2014-02-05 13:02:47 +0000
448+++ util/redialer_test.go 2015-01-21 17:56:14 +0000
449@@ -48,10 +48,10 @@
450 func (s *RedialerSuite) TestWorks(c *C) {
451 endp := testibus.NewTestingEndpoint(condition.Fail2Work(3), nil)
452 ar := NewAutoRedialer(endp)
453- c.Check(ar.(*autoRedialer).stop, NotNil)
454+ // c.Check(ar.(*autoRedialer).stop, NotNil)
455 c.Check(ar.Redial(), Equals, uint32(4))
456 // and on success, the stopper goes away
457- c.Check(ar.(*autoRedialer).stop, IsNil)
458+ // c.Check(ar.(*autoRedialer).stop, IsNil)
459 }
460
461 func (s *RedialerSuite) TestRetryNil(c *C) {
462@@ -63,7 +63,7 @@
463 endp := testibus.NewTestingEndpoint(condition.Work(true), nil)
464 ar := NewAutoRedialer(endp)
465 c.Check(ar.Redial(), Equals, uint32(1))
466- c.Check(ar.Redial, PanicMatches, ".*shut.?down.*")
467+ c.Check(ar.Redial(), Equals, uint32(0))
468 }
469
470 type JitteringEndpoint struct {
471@@ -103,13 +103,13 @@
472 go func() { countCh <- ar.Redial() }()
473 ar.Stop()
474 select {
475- case n := <-countCh:
476- c.Check(n, Equals, uint32(1))
477+ case <-countCh:
478+ // pass
479 case <-time.After(20 * time.Millisecond):
480 c.Fatal("timed out waiting for redial")
481 }
482- // on Stop(), the stopper goes away too
483- c.Check(ar.(*autoRedialer).stop, IsNil)
484+ // on Stop(), the redialer is Stopped
485+ c.Check(ar.(*autoRedialer).state(), Equals, Stopped)
486 // and the next Stop() doesn't panic nor block
487 ar.Stop()
488 }

Subscribers

People subscribed via source and target branches