Code review comment for lp:~rvb/maas/tweak-timeouts

Raphaël Badin (rvb) wrote:

You're right, and that's why I used a conservative number (100). This
is meant to be temporary until we come up with a more intelligent thread
pool.
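
The mechanics are nothing clever: the branch just raises the cap on the
reactor's default pool. A minimal sketch of the same call on a stock
Twisted reactor (MAX_THREADS is only the value this branch picks;
everything else is plain Twisted API):

    from twisted.internet import reactor

    MAX_THREADS = 100  # conservative cap, chosen as a stop-gap

    def configure_pool_size():
        # Only the maximum really changes behaviour under load: workers
        # beyond the minimum are created on demand up to the cap, so an
        # idle region does not pay for 100 threads up front.
        threadpool = reactor.getThreadPool()
        threadpool.adjustPoolsize(minthreads=10, maxthreads=MAX_THREADS)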

There is indeed a deadlock: it's the DNS consolidator (see my comments
on the related bug).
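
For anyone not following the bug: the failure mode is the usual one where
code already running in a reactor pool thread blocks on work that itself
needs a pool thread. A tiny, self-contained illustration (hypothetical
names, not the DNS consolidator code; it only shows the shape of the
starvation), with the pool deliberately capped at one thread:

    import threading
    from twisted.internet import reactor, threads

    def inner():
        return "inner ran"

    def outer():
        # Runs in a pool thread and blocks until `inner` has run in
        # *another* pool thread.  With the pool capped at one thread,
        # `inner` can never be scheduled, so this wait never returns.
        done = threading.Event()
        threads.deferToThread(inner).addCallback(lambda _: done.set())
        done.wait()
        print("outer finished")  # unreachable with a one-thread pool

    reactor.suggestThreadPoolSize(1)
    reactor.callWhenRunning(threads.deferToThread, outer)
    reactor.run()

Raising the cap to 100 makes that kind of starvation far less likely but
doesn't remove it, which is why it's a stop-gap rather than a fix.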

On 07/07/2015 04:18 PM, Blake Rouse wrote:
> Remember last time we increased this we hit database connection issues. I
> don't know if increasing the number of threads is a good idea. That is why
> we lowered it back down.
>
> Does Gavin's branch not fix the deadlock issue? Is there another deadlock
> that needs this increase?
> On Jul 7, 2015 6:27 AM, "Raphaël Badin" <email address hidden> wrote:
>
>> Raphaël Badin has proposed merging lp:~rvb/maas/tweak-timeouts into
>> lp:maas.
>>
>> Commit message:
>> Set the maximum thread pool size to 100: this is meant as a workaround for
>> deadlocks such as the one described in the linked bug. It's not meant to be
>> a permanent solution but rather to buy us time to come up with a proper
>> solution (like an "intelligent thread pool manager"). Unify the power query
>> timeout: use a single timeout across the board, with a value that can
>> accommodate SeaMicro's tendency to take 30s to reply to a power query.
>>
>> Requested reviews:
>> MAAS Maintainers (maas-maintainers)
>> Related bugs:
>> Bug #1470013 in MAAS: "Failed power off/status for multiple nodes within
>> a SM15K chassis"
>> https://bugs.launchpad.net/maas/+bug/1470013
>>
>> For more details, see:
>> https://code.launchpad.net/~rvb/maas/tweak-timeouts/+merge/264018
>> --
>> Your team MAAS Maintainers is requested to review the proposed merge of
>> lp:~rvb/maas/tweak-timeouts into lp:maas.
>>
>> === modified file 'src/maasserver/api/nodes.py'
>> --- src/maasserver/api/nodes.py 2015-06-25 10:26:40 +0000
>> +++ src/maasserver/api/nodes.py 2015-07-07 10:22:52 +0000
>> @@ -76,6 +76,7 @@
>> from maasserver.utils import find_nodegroup
>> from maasserver.utils.orm import get_first
>> from piston.utils import rc
>> +from provisioningserver.drivers.power import POWER_QUERY_TIMEOUT
>> from provisioningserver.power.poweraction import (
>> PowerActionFail,
>> UnknownPowerType,
>> @@ -690,9 +691,7 @@
>> power_type=power_info.power_type,
>> context=power_info.power_parameters)
>> try:
>> - # Allow 30 seconds for the power query max as we're holding
>> - # up an appserver thread here.
>> - state = call.wait(30)
>> + state = call.wait(POWER_QUERY_TIMEOUT)
>> except crochet.TimeoutError:
>> maaslog.error(
>> "%s: Timed out waiting for power response in
>> Node.power_state",
>>
>> === modified file 'src/maasserver/plugin.py'
>> --- src/maasserver/plugin.py 2015-05-07 18:14:38 +0000
>> +++ src/maasserver/plugin.py 2015-07-07 10:22:52 +0000
>> @@ -24,6 +24,7 @@
>> from twisted.plugin import IPlugin
>> from twisted.python import usage
>> from zope.interface import implementer
>> +from twisted.internet import reactor
>>
>>
>> def serverFromString(description):
>> @@ -45,6 +46,14 @@
>> ]
>>
>>
>> +# The maximum number of threads used by the default twisted thread pool.
>> +# This value is a trade-off between a small value (such as the default: 10)
>> +# which can create deadlocks (see 1470013) and a huge value which can cause
>> +# MAAS to hit other limitations such as the number of open files or the
>> +# number of concurrent database connections.
>> +MAX_THREADS = 100
>> +
>> +
>> @implementer(IServiceMaker, IPlugin)
>> class RegionServiceMaker:
>> """Create a service for the Twisted plugin."""
>> @@ -76,6 +85,10 @@
>> import crochet
>> crochet.no_setup()
>>
>> + def _configurePoolSize(self):
>> + threadpool = reactor.getThreadPool()
>> + threadpool.adjustPoolsize(10, MAX_THREADS)
>> +
>> def _makeIntrospectionService(self, endpoint):
>> from provisioningserver.utils import introspect
>> introspect_service = (
>> @@ -91,6 +104,7 @@
>> self._configureLogging()
>> self._configureDjango()
>> self._configureCrochet()
>> + self._configurePoolSize()
>>
>> # Populate the region's event-loop with services.
>> from maasserver import eventloop
>>
>> === modified file 'src/maasserver/tests/test_plugin.py'
>> --- src/maasserver/tests/test_plugin.py 2015-05-07 18:14:38 +0000
>> +++ src/maasserver/tests/test_plugin.py 2015-07-07 10:22:52 +0000
>> @@ -17,6 +17,7 @@
>> import crochet
>> from maasserver import eventloop
>> from maasserver.plugin import (
>> + MAX_THREADS,
>> Options,
>> RegionServiceMaker,
>> )
>> @@ -24,11 +25,13 @@
>> from maastesting.testcase import MAASTestCase
>> from provisioningserver import logger
>> from provisioningserver.utils.twisted import asynchronous
>> +from testtools.matchers import GreaterThan
>> from twisted.application.service import MultiService
>> +from twisted.internet import reactor
>>
>>
>> class TestOptions(MAASTestCase):
>> - """Tests for `provisioningserver.plugin.Options`."""
>> + """Tests for `maasserver.plugin.Options`."""
>>
>> def test_defaults(self):
>> options = Options()
>> @@ -42,7 +45,7 @@
>>
>>
>> class TestRegionServiceMaker(MAASTestCase):
>> - """Tests for `provisioningserver.plugin.RegionServiceMaker`."""
>> + """Tests for `maasserver.plugin.RegionServiceMaker`."""
>>
>> def setUp(self):
>> super(TestRegionServiceMaker, self).setUp()
>> @@ -78,3 +81,12 @@
>> "Not all services are named.")
>> self.assertThat(logger.basicConfig, MockCalledOnceWith())
>> self.assertThat(crochet.no_setup, MockCalledOnceWith())
>> +
>> + @asynchronous(timeout=5)
>> + def test__sets_pool_size(self):
>> + service_maker = RegionServiceMaker("Harry", "Hill")
>> + service_maker.makeService(Options())
>> + threadpool = reactor.getThreadPool()
>> + self.assertEqual(MAX_THREADS, threadpool.max)
>> + # Max threads is reasonable.
>> + self.assertThat(threadpool.max, GreaterThan(50))
>>
>> === modified file 'src/maasserver/websockets/handlers/node.py'
>> --- src/maasserver/websockets/handlers/node.py 2015-05-28 15:07:01 +0000
>> +++ src/maasserver/websockets/handlers/node.py 2015-07-07 10:22:52 +0000
>> @@ -48,6 +48,7 @@
>> )
>> from metadataserver.enum import RESULT_TYPE
>> from metadataserver.models import NodeResult
>> +from provisioningserver.drivers.power import POWER_QUERY_TIMEOUT
>> from provisioningserver.logger import get_maas_logger
>> from provisioningserver.power.poweraction import (
>> PowerActionFail,
>> @@ -547,9 +548,7 @@
>> power_type=power_info.power_type,
>> context=power_info.power_parameters)
>> try:
>> - # Allow 15 seconds for the power query max as we're holding
>> - # up a thread waiting.
>> - state = call.wait(15)['state']
>> + state = call.wait(POWER_QUERY_TIMEOUT)['state']
>> except crochet.TimeoutError:
>> maaslog.error(
>> "%s: Timed out waiting for power response in "
>> @@ -569,5 +568,5 @@
>> # sure the change is committed and retried if required. Not pushing
>> # this to another thread, would result in the entire power query being
>> # performed again.
>> - update_power_state(state).wait(15)
>> + update_power_state(state).wait(POWER_QUERY_TIMEOUT)
>> return state
>>
>> === modified file 'src/provisioningserver/drivers/power/__init__.py'
>> --- src/provisioningserver/drivers/power/__init__.py 2015-06-23 16:45:25 +0000
>> +++ src/provisioningserver/drivers/power/__init__.py 2015-07-07 10:22:52 +0000
>> @@ -13,6 +13,7 @@
>>
>> __metaclass__ = type
>> __all__ = [
>> + "POWER_QUERY_TIMEOUT",
>> "PowerActionError",
>> "PowerAuthError",
>> "PowerConnError",
>> @@ -29,6 +30,7 @@
>> abstractmethod,
>> abstractproperty,
>> )
>> +from datetime import timedelta
>>
>> from jsonschema import validate
>> from provisioningserver.drivers import (
>> @@ -52,6 +54,13 @@
>> }
>>
>>
>> +# Timeout for the power query action. We might be holding up a thread for that
>> +# long but some BMCs (notably seamicro) can take a long time to respond to
>> +# a power query request.
>> +# This should be configurable per-BMC.
>> +POWER_QUERY_TIMEOUT = timedelta(seconds=45).total_seconds()
>> +
>> +
>> class PowerError(Exception):
>> """Base error for all power driver failure commands."""
>>
>>
>> === modified file 'src/provisioningserver/rpc/power.py'
>> --- src/provisioningserver/rpc/power.py 2015-06-10 16:45:23 +0000
>> +++ src/provisioningserver/rpc/power.py 2015-07-07 10:22:52 +0000
>> @@ -79,7 +79,8 @@
>>
>>
>> # Timeout for change_power_state(). We set it to 5 minutes by default,
>> -# but it would be lovely if this was configurable.
>> +# but it would be lovely if this was configurable. This is only a backstop
>> +# meant to cope with broken BMCs.
>> CHANGE_POWER_STATE_TIMEOUT = timedelta(minutes=5).total_seconds()
>>
>>
>>
>>
>>
>
