Hi Stuart, Generally good, but I have a few things that look worth fixing. === modified file 'lib/lp/codehosting/tests/test_rewrite.py' --- lib/lp/codehosting/tests/test_rewrite.py 2011-08-12 11:37:08 +0000 +++ lib/lp/codehosting/tests/test_rewrite.py 2011-08-29 20:15:59 +0000 @@ -177,7 +181,8 @@ transaction.commit() rewriter.rewriteLine('/' + branch.unique_name + '/.bzr/README') rewriter.rewriteLine('/' + branch.unique_name + '/.bzr/README') - logging_output_lines = self.getLoggerOutput(rewriter).strip().split('\n') + logging_output_lines = self.getLoggerOutput( + rewriter).strip().split('\n') Cleaning out lint, I see. Thanks. @@ -274,3 +280,62 @@ # The script produces logging output, but not to stderr. self.assertEqual('', err) self.assertEqual(expected_lines, output_lines) + + +class TestBranchRewriterScriptHandlesDisconnects(TestCaseWithFactory): + """Ensure branch-rewrite.py survives fastdowntime deploys.""" + layer = LaunchpadScriptLayer + + def setUp(self): + super(TestBranchRewriterScriptHandlesDisconnects, self).setUp() + self.pgbouncer = PGBouncerFixture() + self.addCleanup(self.pgbouncer.cleanUp) + self.pgbouncer.setUp() Couldn't those last three lines be replaced with a simple self.pgbouncer = self.useFixture(PGBouncerFixture()) ? In fact I'm not even sure it's worth a setUp with the super() dance, and carrying self.pgbouncer from setUp to the tests. + def spawn(self): + script_file = os.path.join( + config.root, 'scripts', 'branch-rewrite.py') + + self.rewriter_proc = subprocess.Popen( + [script_file], stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, bufsize=0) Again, is it worth keeping self.rewriter_proc on the test class? It just hides state. Why not make it a return value? Moreover, is this process guaranteed to clean itself up? If not, then please ensure that it is. For instance, you could add something like "self.addCleanup(kill, rewriter_proc)" to spawn(). + def request(self, query): + self.rewriter_proc.stdin.write(query + '\n') + return self.rewriter_proc.stdout.readline().rstrip('\n') Do we know that the readline() will never hang under reasonable circumstances? + def test_disconnect(self): + self.spawn() Sure "disconnect" is something that happens in this test, but is that one word really a good description of what you're testing here? Would it help to say something with "reconnects"? + # Everything should be working, and we get valid output. + out = self.request('foo') + assert out.endswith('/foo'), out Why use assert here? Normally you'd say self.assertEndsWith(out, '/foo'). + self.pgbouncer.stop() + + # Now with pgbouncer down, we should get NULL messages and + # stderr spam, and this keeps happening. We test more than + # once to ensure that we will keep trying to reconnect even + # after several failures. + for count in range(5): + out = self.request('foo') Is this race-free? I don't know if pgbouncer is a separate process or not; if it is, does stop() block until it's taken effect? + assert out == 'NULL', out Here too: why assert instead of self.assertEqual? + self.pgbouncer.start() Here too: is this race-free? + # Everything should be working, and we get valid output. + out = self.request('foo') + assert out.endswith('/foo'), out Here too: why assert instead of self.assertEndsWith? + def test_starts_with_db_down(self): + self.pgbouncer.stop() + self.spawn() + + for count in range(5): + out = self.request('foo') + assert out == 'NULL', out Another assert. === modified file 'scripts/branch-rewrite.py' --- scripts/branch-rewrite.py 2010-11-06 12:50:22 +0000 +++ scripts/branch-rewrite.py 2011-08-29 20:15:59 +0000 @@ -60,9 +62,18 @@ return except KeyboardInterrupt: sys.exit() - except: + except Exception: Sensible. This means you no longer try to do useful things when things go really bad. self.logger.exception('Exception occurred:') print "NULL" + # The exception might have been a DisconnectionError or + # similar. Cleanup such as database reconnection will + # not happen until the transaction is rolled back. We + # are explicitly rolling back the store here instead of + # using transaction.abort() due to Bug #819282. This looks like it should properly be an XXX. Could you format it as one? XXX StuartBishop 2011-08-31 bug=819282: etc. + try: + ISlaveStore(Branch).rollback() + except Exception: + self.logger.exception('Exception occurred in rollback:') Just to make sure: is the exception swallowed deliberately? Or was it supposed to be re-raised, or sys.exit() called with an error code or whatever? Jeroen