Code review comment for lp:~thumper/juju-core/fix-intermittent-failure

Revision history for this message
Tim Penhey (thumper) wrote :

Reviewers: mp+186690_code.launchpad.net,

Message:
Please take a look.

Description:
Fix race condition in SSHStorage test

I found an intermittent failure in the synchronisation
test for the SSHStorage. It happened only once and did not
recur, but I felt it was worth fixing anyway.

The race was over whether the flock subprocess had actually
started before the following lines in the test ran. Those lines
expected the flock to be taken, but since the flock was
managed by an executed command, there was a race where it
might not have started yet.

I broke the synchronisation test into three as it was
really testing three distinct things.

The flock helper method now waits for the flock to be taken
by incrementally reading from stdout until the initial echo,
which is written out prior to the sleep, has been received.

The flock cleanup is also now handled by a cleanup method.

By breaking the test up, we no longer need to manually kill the
process as part of the test.

https://code.launchpad.net/~thumper/juju-core/fix-intermittent-failure/+merge/186690

(do not edit description out of merge proposal)

Please review this at https://codereview.appspot.com/13799043/

Affected files (+33, -19 lines):
   A [revision details]
   M environs/sshstorage/storage_test.go

Index: [revision details]
=== added file '[revision details]'
--- [revision details] 2012-01-01 00:00:00 +0000
+++ [revision details] 2012-01-01 00:00:00 +0000
@@ -0,0 +1,2 @@
+Old revision: tarmac-20130919221201-urd9lbpjtto8a7pk
+New revision: <email address hidden>

Index: environs/sshstorage/storage_test.go
=== modified file 'environs/sshstorage/storage_test.go'
--- environs/sshstorage/storage_test.go 2013-09-18 22:54:32 +0000
+++ environs/sshstorage/storage_test.go 2013-09-19 23:29:41 +0000
@@ -244,33 +244,42 @@
   c.Assert(stor.DefaultConsistencyStrategy(), gc.Equals,
utils.AttemptStrategy{})
  }

-// flock is a test helper that flocks a file,
-// executes "sleep" with the specified duration,
-// and returns the *Cmd so it can be early terminated.
-func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string,
duration time.Duration) *os.Process {
- sleepcmd := fmt.Sprintf("sleep %vs", duration.Seconds())
+const defaultFlockTimeout = 5 * time.Second
+
+// flock is a test helper that flocks a file, executes "sleep" with the
+// specified duration, the command is terminated in the test tear down.
+func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string) {
+ sleepcmd := fmt.Sprintf("echo started && sleep %vs",
defaultFlockTimeout.Seconds())
   cmd := exec.Command(flockBin, "--nonblock", "--close", string(mode),
lockfile, "-c", sleepcmd)
+ stdout, err := cmd.StdoutPipe()
+ c.Assert(err, gc.IsNil)
   c.Assert(cmd.Start(), gc.IsNil)
- return cmd.Process
+ // Make sure the flock has been taken before returning by reading stdout
waiting for "started"
+ for count := len("started"); count > 0; {
+ result := make([]byte, count)
+ bytesRead, err := stdout.Read(result)
+ c.Assert(err, gc.IsNil)
+ count -= bytesRead
+ }
+ s.AddCleanup(func(*gc.C) {
+ cmd.Process.Kill()
+ cmd.Process.Wait()
+ })
  }

-const defaultFlockTimeout = 5 * time.Second
-
-func (s *storageSuite) TestSynchronisation(c *gc.C) {
+func (s *storageSuite) TestCreateFailsIfFlockNotAvailable(c *gc.C) {
   storageDir := c.MkDir()
- proc := s.flock(c, flockShared, storageDir, defaultFlockTimeout)
- defer proc.Wait()
- defer proc.Kill()
-
+ s.flock(c, flockShared, storageDir)
   // Creating storage requires an exclusive lock initially.
   //
   // flock exits with exit code 1 if it can't acquire the
   // lock immediately in non-blocking mode (which the tests force).
   _, err := NewSSHStorage("example.com", storageDir)
   c.Assert(err, gc.ErrorMatches, "exit code 1")
+}

- proc.Kill()
- proc.Wait()
+func (s *storageSuite) TestWithSharedLocks(c *gc.C) {
+ storageDir := c.MkDir()
   stor, err := NewSSHStorage("example.com", storageDir)
   c.Assert(err, gc.IsNil)

@@ -279,7 +288,7 @@
   data := []byte("abc\000def")
   c.Assert(ioutil.WriteFile(filepath.Join(storageDir, contentdir, "a"),
data, 0644), gc.IsNil)

- proc = s.flock(c, flockShared, storageDir, defaultFlockTimeout)
+ s.flock(c, flockShared, storageDir)
   _, err = storage.Get(stor, "a")
   c.Assert(err, gc.IsNil)
   _, err = storage.List(stor, "")
@@ -287,12 +296,15 @@
   c.Assert(stor.Put("a", bytes.NewBuffer(nil), 0), gc.NotNil)
   c.Assert(stor.Remove("a"), gc.NotNil)
   c.Assert(stor.RemoveAll(), gc.NotNil)
- proc.Kill()
- proc.Wait()
+}

+func (s *storageSuite) TestWithExclusiveLocks(c *gc.C) {
+ storageDir := c.MkDir()
+ stor, err := NewSSHStorage("example.com", storageDir)
+ c.Assert(err, gc.IsNil)
   // None of the methods (apart from URL) should be able to do anything
   // while an exclusive lock is held.
- proc = s.flock(c, flockExclusive, storageDir, defaultFlockTimeout)
+ s.flock(c, flockExclusive, storageDir)
   _, err = stor.URL("a")
   c.Assert(err, gc.IsNil)
   c.Assert(stor.Put("a", bytes.NewBuffer(nil), 0), gc.NotNil)

« Back to merge proposal