Merge into trunk : fix-intermittent-failure : Code : juju-core

Status:	Merged
Approved by:	Tim Penhey on 2013-09-20
Approved revision:	no longer in the source branch.
Merged at revision:	1847
Proposed branch:	lp:~thumper/juju-core/fix-intermittent-failure
Merge into:	lp:~go-bot/juju-core/trunk
Diff against target:	93 lines (+28/-19) 1 file modified environs/sshstorage/storage_test.go (+28/-19)
To merge this branch:	bzr merge lp:~thumper/juju-core/fix-intermittent-failure
Related bugs:	Link a bug report

Reviewer	Review Type	Date Requested	Status
Juju Engineering		2013-09-19	Pending
Review via email: mp+186690@code.launchpad.net

Commit message

Fix race condition in SSHStorage test

I found an intermittent failure in the synchronization
test for the SSHStorage. It only happened once, and not
again, but I felt it was worth fixing anyway.

The race was that the flock subprocess might not have
started before the following lines in the test ran. Those
lines expected the flock to be taken, but since the flock is
managed by an executed command, it may not have been
acquired yet.

I broke the synchronisation test into three as it was
really testing three distinct things.

The flock helper method now waits for the flock to be taken:
it reads incrementally from stdout until the initial echo,
which is written out prior to the sleep, has been received.

The flock cleanup is also now handled by a cleanup method.

By breaking the test up, we no longer need to manually kill the
process as part of the test.

https://codereview.appspot.com/13799043/

Description of the change

Fix race condition in SSHStorage test

I found an intermittent failure in the synchronization
test for the SSHStorage. It only happened once, and not
again, but I felt it was worth fixing anyway.

The race was that the flock subprocess might not have
started before the following lines in the test ran. Those
lines expected the flock to be taken, but since the flock is
managed by an executed command, it may not have been
acquired yet.

I broke the synchronisation test into three as it was
really testing three distinct things.

The flock helper method now waits for the flock to be taken:
it reads incrementally from stdout until the initial echo,
which is written out prior to the sleep, has been received.

The flock cleanup is also now handled by a cleanup method.

By breaking the test up, we no longer need to manually kill the
process as part of the test.

https://codereview.appspot.com/13799043/

Revision history for this message

Tim Penhey (thumper) wrote on 2013-09-19:

#

Download full text (4.9 KiB)

Reviewers: mp+186690_code.launchpad.net,

Message:
Please take a look.

Description:
Fix race condition in SSHStorage test

I found an intermittent failure in the synchronization
test for the SSHStorage. It only happened once, and not
again, but I felt it was worth fixing anyway.

The race was in the flock subprocess actually starting
before the following lines in the test. The lines following
expected the flock to be taken, but since the flock was
managed by an executed command, there is a race where it
may not have started.

I broke the synchronisation test into three as it was
really testing three distinct things.

The flock helper method now waits for the flock to be taken
by incrementally reading from stdout waiting for the initial
echo to be written out prior to the sleep.

The flock cleanup is also now handled by a cleanup method.

By breaking the test up, we no longer need to manually kill the
process as part of the test.

https://code.launchpad.net/~thumper/juju-core/fix-intermittent-failure/+merge/186690

(do not edit description out of merge proposal)

Please review this at https://codereview.appspot.com/13799043/

Affected files (+33, -19 lines):
A [revision details]
M environs/sshstorage/storage_test.go

Index: [revision details]
=== added file '[revision details]'
--- [revision details] 2012-01-01 00:00:00 +0000
+++ [revision details] 2012-01-01 00:00:00 +0000
@@ -0,0 +1,2 @@
+Old revision: tarmac-20130919221201-urd9lbpjtto8a7pk
+New revision: <email address hidden>

Index: environs/sshstorage/storage_test.go
=== modified file 'environs/sshstorage/storage_test.go'
--- environs/sshstorage/storage_test.go 2013-09-18 22:54:32 +0000
+++ environs/sshstorage/storage_test.go 2013-09-19 23:29:41 +0000
@@ -244,33 +244,42 @@
c.Assert(stor.DefaultConsistencyStrategy(), gc.Equals,
utils.AttemptStrategy{})
}

-// flock is a test helper that flocks a file,
-// executes "sleep" with the specified duration,
-// and returns the *Cmd so it can be early terminated.
-func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string,
duration time.Duration) *os.Process {
- sleepcmd := fmt.Sprintf("sleep %vs", duration.Seconds())
+const defaultFlockTimeout = 5 * time.Second
+
+// flock is a test helper that flocks a file, executes "sleep" with the
+// specified duration, the command is terminated in the test tear down.
+func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string) {
+ sleepcmd := fmt.Sprintf("echo started && sleep %vs",
defaultFlockTimeout.Seconds())
   cmd := exec.Command(flockBin, "--nonblock", "--close", string(mode),
lockfile, "-c", sleepcmd)
+ stdout, err := cmd.StdoutPipe()
+ c.Assert(err, gc.IsNil)
   c.Assert(cmd.Start(), gc.IsNil)
- return cmd.Process
+ // Make sure the flock has been taken before returning by reading stdout
waiting for "started"
+ for count := len("started"); count > 0; {
+ result := make([]byte, count)
+ bytesRead, err := stdout.Read(result)
+ c.Assert(err, gc.IsNil)
+ count -= bytesRead
+ }
+ s.AddCleanup(func(*gc.C) {
+ cmd.Process.Kill()
+ cmd.Process.Wait()
+ })
  }

-const defaultFlockTimeout = 5 * time.Second
-
-func (s *sto...

Reviewers: mp+186690_code.launchpad.net,

Message:
Please take a look.

Description:
Fix race condition in SSHStorage test

I found an intermittent failure in the synchronization
test for the SSHStorage. It only happened once, and not
again, but I felt it was worth fixing anyway.

The race was in the flock subprocess actually starting
before the following lines in the test.  The lines following
expected the flock to be taken, but since the flock was
managed by an executed command, there is a race where it
may not have started.

I broke the synchronisation test into three as it was
really testing three distinct things.

The flock helper method now waits for the flock to be taken
by incrementally reading from stdout waiting for the initial
echo to be written out prior to the sleep.

The flock cleanup is also now handled by a cleanup method.

By breaking the test up, we no longer need to manually kill the
process as part of the test.

https://code.launchpad.net/~thumper/juju-core/fix-intermittent-failure/+merge/186690

(do not edit description out of merge proposal)

Please review this at https://codereview.appspot.com/13799043/

Affected files (+33, -19 lines):
   A [revision details]
   M environs/sshstorage/storage_test.go

Index: [revision details]
=== added file '[revision details]'
--- [revision details]	2012-01-01 00:00:00 +0000
+++ [revision details]	2012-01-01 00:00:00 +0000
@@ -0,0 +1,2 @@
+Old revision: tarmac-20130919221201-urd9lbpjtto8a7pk
+New revision: tim.penhey@canonical.com-20130919232941-czpvvvzj0zy4j4eq

Index: environs/sshstorage/storage_test.go
=== modified file 'environs/sshstorage/storage_test.go'
--- environs/sshstorage/storage_test.go	2013-09-18 22:54:32 +0000
+++ environs/sshstorage/storage_test.go	2013-09-19 23:29:41 +0000
@@ -244,33 +244,42 @@
  	c.Assert(stor.DefaultConsistencyStrategy(), gc.Equals,  
utils.AttemptStrategy{})
  }

-// flock is a test helper that flocks a file,
-// executes "sleep" with the specified duration,
-// and returns the *Cmd so it can be early terminated.
-func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string,  
duration time.Duration) *os.Process {
-	sleepcmd := fmt.Sprintf("sleep %vs", duration.Seconds())
+const defaultFlockTimeout = 5 * time.Second
+
+// flock is a test helper that flocks a file, executes "sleep" with the
+// specified duration, the command is terminated in the test tear down.
+func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string) {
+	sleepcmd := fmt.Sprintf("echo started && sleep %vs",  
defaultFlockTimeout.Seconds())
  	cmd := exec.Command(flockBin, "--nonblock", "--close", string(mode),  
lockfile, "-c", sleepcmd)
+	stdout, err := cmd.StdoutPipe()
+	c.Assert(err, gc.IsNil)
  	c.Assert(cmd.Start(), gc.IsNil)
-	return cmd.Process
+	// Make sure the flock has been taken before returning by reading stdout  
waiting for "started"
+	for count := len("started"); count > 0; {
+		result := make([]byte, count)
+		bytesRead, err := stdout.Read(result)
+		c.Assert(err, gc.IsNil)
+		count -= bytesRead
+	}
+	s.AddCleanup(func(*gc.C) {
+		cmd.Process.Kill()
+		cmd.Process.Wait()
+	})
  }

-const defaultFlockTimeout = 5 * time.Second
-
-func (s *storageSuite) TestSynchronisation(c *gc.C) {
+func (s *storageSuite) TestCreateFailsIfFlockNotAvailable(c *gc.C) {
  	storageDir := c.MkDir()
-	proc := s.flock(c, flockShared, storageDir, defaultFlockTimeout)
-	defer proc.Wait()
-	defer proc.Kill()
-
+	s.flock(c, flockShared, storageDir)
  	// Creating storage requires an exclusive lock initially.
  	//
  	// flock exits with exit code 1 if it can't acquire the
  	// lock immediately in non-blocking mode (which the tests force).
  	_, err := NewSSHStorage("example.com", storageDir)
  	c.Assert(err, gc.ErrorMatches, "exit code 1")
+}

-	proc.Kill()
-	proc.Wait()
+func (s *storageSuite) TestWithSharedLocks(c *gc.C) {
+	storageDir := c.MkDir()
  	stor, err := NewSSHStorage("example.com", storageDir)
  	c.Assert(err, gc.IsNil)

@@ -279,7 +288,7 @@
  	data := []byte("abc\000def")
  	c.Assert(ioutil.WriteFile(filepath.Join(storageDir, contentdir, "a"),  
data, 0644), gc.IsNil)

-	proc = s.flock(c, flockShared, storageDir, defaultFlockTimeout)
+	s.flock(c, flockShared, storageDir)
  	_, err = storage.Get(stor, "a")
  	c.Assert(err, gc.IsNil)
  	_, err = storage.List(stor, "")
@@ -287,12 +296,15 @@
  	c.Assert(stor.Put("a", bytes.NewBuffer(nil), 0), gc.NotNil)
  	c.Assert(stor.Remove("a"), gc.NotNil)
  	c.Assert(stor.RemoveAll(), gc.NotNil)
-	proc.Kill()
-	proc.Wait()
+}

+func (s *storageSuite) TestWithExclusiveLocks(c *gc.C) {
+	storageDir := c.MkDir()
+	stor, err := NewSSHStorage("example.com", storageDir)
+	c.Assert(err, gc.IsNil)
  	// None of the methods (apart from URL) should be able to do anything
  	// while an exclusive lock is held.
-	proc = s.flock(c, flockExclusive, storageDir, defaultFlockTimeout)
+	s.flock(c, flockExclusive, storageDir)
  	_, err = stor.URL("a")
  	c.Assert(err, gc.IsNil)
  	c.Assert(stor.Put("a", bytes.NewBuffer(nil), 0), gc.NotNil)

Revision history for this message

Andrew Wilkins (axwalk) wrote on 2013-09-20:

#

LGTM, thanks for fixing my crap.

https://codereview.appspot.com/13799043/diff/1/environs/sshstorage/storage_test.go
File environs/sshstorage/storage_test.go (right):

https://codereview.appspot.com/13799043/diff/1/environs/sshstorage/storage_test.go#newcode258
environs/sshstorage/storage_test.go:258: for count := len("started");
count > 0; {
I'd probably just use
_, err = io.ReadFull(stdout, make([]byte, len("started")))
c.Assert(err, gc.IsNil)

https://codereview.appspot.com/13799043/

juju-core

Merge lp:~thumper/juju-core/fix-intermittent-failure into lp:~go-bot/juju-core/trunk

Commit message

Description of the change

Preview Diff

Subscribers

 === modified file 'environs/sshstorage/storage_test.go'
 --- environs/sshstorage/storage_test.go	2013-09-18 22:54:32 +0000
 +++ environs/sshstorage/storage_test.go	2013-09-20 01:34:23 +0000
@@ -6,6 +6,7 @@
  import (
  	"bytes"
  	"fmt"
++	"io"
  	"io/ioutil"
  	"os"
  	"os/exec"
@@ -244,33 +245,38 @@
  	c.Assert(stor.DefaultConsistencyStrategy(), gc.Equals, utils.AttemptStrategy{})
+ }
--// flock is a test helper that flocks a file,
--// executes "sleep" with the specified duration,
--// and returns the *Cmd so it can be early terminated.
--func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string, duration time.Duration) *os.Process {
--	sleepcmd := fmt.Sprintf("sleep %vs", duration.Seconds())
++const defaultFlockTimeout = 5 * time.Second
++
++// flock is a test helper that flocks a file, executes "sleep" with the
++// specified duration, the command is terminated in the test tear down.
++func (s *storageSuite) flock(c *gc.C, mode flockmode, lockfile string) {
++	sleepcmd := fmt.Sprintf("echo started && sleep %vs", defaultFlockTimeout.Seconds())
  	cmd := exec.Command(flockBin, "--nonblock", "--close", string(mode), lockfile, "-c", sleepcmd)
++	stdout, err := cmd.StdoutPipe()
++	c.Assert(err, gc.IsNil)
  	c.Assert(cmd.Start(), gc.IsNil)
--	return cmd.Process
++	// Make sure the flock has been taken before returning by reading stdout waiting for "started"
++	_, err = io.ReadFull(stdout, make([]byte, len("started")))
++	c.Assert(err, gc.IsNil)
++	s.AddCleanup(func(*gc.C) {
++		cmd.Process.Kill()
++		cmd.Process.Wait()
++	})
+ }
--const defaultFlockTimeout = 5 * time.Second
--
--func (s *storageSuite) TestSynchronisation(c *gc.C) {
++func (s *storageSuite) TestCreateFailsIfFlockNotAvailable(c *gc.C) {
  	storageDir := c.MkDir()
--	proc := s.flock(c, flockShared, storageDir, defaultFlockTimeout)
--	defer proc.Wait()
--	defer proc.Kill()
--
++	s.flock(c, flockShared, storageDir)
  	// Creating storage requires an exclusive lock initially.
  	//
  	// flock exits with exit code 1 if it can't acquire the
  	// lock immediately in non-blocking mode (which the tests force).
  	_, err := NewSSHStorage("example.com", storageDir)
  	c.Assert(err, gc.ErrorMatches, "exit code 1")
++}
--	proc.Kill()
--	proc.Wait()
++func (s *storageSuite) TestWithSharedLocks(c *gc.C) {
++	storageDir := c.MkDir()
  	stor, err := NewSSHStorage("example.com", storageDir)
  	c.Assert(err, gc.IsNil)
@@ -279,7 +285,7 @@
  	data := []byte("abc\000def")
  	c.Assert(ioutil.WriteFile(filepath.Join(storageDir, contentdir, "a"), data, 0644), gc.IsNil)
--	proc = s.flock(c, flockShared, storageDir, defaultFlockTimeout)
++	s.flock(c, flockShared, storageDir)
  	_, err = storage.Get(stor, "a")
  	c.Assert(err, gc.IsNil)
  	_, err = storage.List(stor, "")
@@ -287,12 +293,15 @@
  	c.Assert(stor.Put("a", bytes.NewBuffer(nil), 0), gc.NotNil)
  	c.Assert(stor.Remove("a"), gc.NotNil)
  	c.Assert(stor.RemoveAll(), gc.NotNil)
--	proc.Kill()
--	proc.Wait()
++}
++func (s *storageSuite) TestWithExclusiveLocks(c *gc.C) {
++	storageDir := c.MkDir()
++	stor, err := NewSSHStorage("example.com", storageDir)
++	c.Assert(err, gc.IsNil)
  	// None of the methods (apart from URL) should be able to do anything
  	// while an exclusive lock is held.
--	proc = s.flock(c, flockExclusive, storageDir, defaultFlockTimeout)
++	s.flock(c, flockExclusive, storageDir)
  	_, err = stor.URL("a")
  	c.Assert(err, gc.IsNil)
  	c.Assert(stor.Put("a", bytes.NewBuffer(nil), 0), gc.NotNil)