You're right, and that's why I used a conservative number (100). This
is meant to be temporary until we come up with a more intelligent thread
pool.
There is a deadlock indeed, it's the DNS consolidator (see my comments
on the related bug).
On 07/07/2015 04:18 PM, Blake Rouse wrote:
> Remember last time we increased this we hit database connection issues. I
> don't know if increasing the number of threads is a good idea. That is why
> we lowered it back down.
>
> Does Gavin's branch not fix the deadlock issue? Is there another deadlock
> that needs this increase?
> On Jul 7, 2015 6:27 AM, "Raphaël Badin" <email address hidden> wrote:
>
>> Raphaël Badin has proposed merging lp:~rvb/maas/tweak-timeouts into
>> lp:maas.
>>
>> Commit message:
>> Set the maximum thread pool size to 100: this is meant as a workaround for
>> deadlocks such as the one described in the linked bug. It's not meant to be
>> a permanent solution but rather to buy us time to come up with a proper
>> solution (like an "intelligent thread pool manager"). Unify the power query
>> timeout: use a unique timeout across the board and use a value that can
>> accommodate seamicro's tendency to take 30s to reply to a power query.
>>
>> Requested reviews:
>> MAAS Maintainers (maas-maintainers)
>> Related bugs:
>> Bug #1470013 in MAAS: "Failed power off/status for multiple nodes within
>> a SM15K chassis"
>> https://bugs.launchpad.net/maas/+bug/1470013
>>
>> For more details, see:
>> https://code.launchpad.net/~rvb/maas/tweak-timeouts/+merge/264018
>> --
>> Your team MAAS Maintainers is requested to review the proposed merge of
>> lp:~rvb/maas/tweak-timeouts into lp:maas.
>>
>> === modified file 'src/maasserver/api/nodes.py'
>> --- src/maasserver/api/nodes.py 2015-06-25 10:26:40 +0000
>> +++ src/maasserver/api/nodes.py 2015-07-07 10:22:52 +0000
>> @@ -76,6 +76,7 @@
>> from maasserver.utils import find_nodegroup
>> from maasserver.utils.orm import get_first
>> from piston.utils import rc
>> +from provisioningserver.drivers.power import POWER_QUERY_TIMEOUT
>> from provisioningserver.power.poweraction import (
>> PowerActionFail,
>> UnknownPowerType,
>> @@ -690,9 +691,7 @@
>> power_type=power_info.power_type,
>> context=power_info.power_parameters)
>> try:
>> - # Allow 30 seconds for the power query max as we're holding
>> - # up an appserver thread here.
>> - state = call.wait(30)
>> + state = call.wait(POWER_QUERY_TIMEOUT)
>> except crochet.TimeoutError:
>> maaslog.error(
>> "%s: Timed out waiting for power response in
>> Node.power_state",
>>
>> === modified file 'src/maasserver/plugin.py'
>> --- src/maasserver/plugin.py 2015-05-07 18:14:38 +0000
>> +++ src/maasserver/plugin.py 2015-07-07 10:22:52 +0000
>> @@ -24,6 +24,7 @@
>> from twisted.plugin import IPlugin
>> from twisted.python import usage
>> from zope.interface import implementer
>> +from twisted.internet import reactor
>>
>>
>> def serverFromString(description):
>> @@ -45,6 +46,14 @@
>> ]
>>
>>
>> +# The maximum number of threads used by the default twisted thread pool.
>> +# This value is a trade-off between a small value (such as the default:
>> 10)
>> +# which can create deadlocks (see 1470013) and a huge value which can
>> cause
>> +# MAAS to hit other limitations such as the number of open files or the
>> +# number of concurrent database connexions.
>> +MAX_THREADS = 100
>> +
>> +
>> @implementer(IServiceMaker, IPlugin)
>> class RegionServiceMaker:
>> """Create a service for the Twisted plugin."""
>> @@ -76,6 +85,10 @@
>> import crochet
>> crochet.no_setup()
>>
>> + def _configurePoolSize(self):
>> + threadpool = reactor.getThreadPool()
>> + threadpool.adjustPoolsize(10, MAX_THREADS)
>> +
>> def _makeIntrospectionService(self, endpoint):
>> from provisioningserver.utils import introspect
>> introspect_service = (
>> @@ -91,6 +104,7 @@
>> self._configureLogging()
>> self._configureDjango()
>> self._configureCrochet()
>> + self._configurePoolSize()
>>
>> # Populate the region's event-loop with services.
>> from maasserver import eventloop
>>
>> === modified file 'src/maasserver/tests/test_plugin.py'
>> --- src/maasserver/tests/test_plugin.py 2015-05-07 18:14:38 +0000
>> +++ src/maasserver/tests/test_plugin.py 2015-07-07 10:22:52 +0000
>> @@ -17,6 +17,7 @@
>> import crochet
>> from maasserver import eventloop
>> from maasserver.plugin import (
>> + MAX_THREADS,
>> Options,
>> RegionServiceMaker,
>> )
>> @@ -24,11 +25,13 @@
>> from maastesting.testcase import MAASTestCase
>> from provisioningserver import logger
>> from provisioningserver.utils.twisted import asynchronous
>> +from testtools.matchers import GreaterThan
>> from twisted.application.service import MultiService
>> +from twisted.internet import reactor
>>
>>
>> class TestOptions(MAASTestCase):
>> - """Tests for `provisioningserver.plugin.Options`."""
>> + """Tests for `maasserver.plugin.Options`."""
>>
>> def test_defaults(self):
>> options = Options()
>> @@ -42,7 +45,7 @@
>>
>>
>> class TestRegionServiceMaker(MAASTestCase):
>> - """Tests for `provisioningserver.plugin.RegionServiceMaker`."""
>> + """Tests for `maasserver.plugin.RegionServiceMaker`."""
>>
>> def setUp(self):
>> super(TestRegionServiceMaker, self).setUp()
>> @@ -78,3 +81,12 @@
>> "Not all services are named.")
>> self.assertThat(logger.basicConfig, MockCalledOnceWith())
>> self.assertThat(crochet.no_setup, MockCalledOnceWith())
>> +
>> + @asynchronous(timeout=5)
>> + def test__sets_pool_size(self):
>> + service_maker = RegionServiceMaker("Harry", "Hill")
>> + service_maker.makeService(Options())
>> + threadpool = reactor.getThreadPool()
>> + self.assertEqual(MAX_THREADS, threadpool.max)
>> + # Max threads is reasonable.
>> + self.assertThat(threadpool.max, GreaterThan(50))
>>
>> === modified file 'src/maasserver/websockets/handlers/node.py'
>> --- src/maasserver/websockets/handlers/node.py 2015-05-28 15:07:01 +0000
>> +++ src/maasserver/websockets/handlers/node.py 2015-07-07 10:22:52 +0000
>> @@ -48,6 +48,7 @@
>> )
>> from metadataserver.enum import RESULT_TYPE
>> from metadataserver.models import NodeResult
>> +from provisioningserver.drivers.power import POWER_QUERY_TIMEOUT
>> from provisioningserver.logger import get_maas_logger
>> from provisioningserver.power.poweraction import (
>> PowerActionFail,
>> @@ -547,9 +548,7 @@
>> power_type=power_info.power_type,
>> context=power_info.power_parameters)
>> try:
>> - # Allow 15 seconds for the power query max as we're
>> holding
>> - # up a thread waiting.
>> - state = call.wait(15)['state']
>> + state = call.wait(POWER_QUERY_TIMEOUT)['state']
>> except crochet.TimeoutError:
>> maaslog.error(
>> "%s: Timed out waiting for power response in "
>> @@ -569,5 +568,5 @@
>> # sure the change is committed and retried if required. Not
>> pushing
>> # this to another thread, would result in the entire power query
>> being
>> # performed again.
>> - update_power_state(state).wait(15)
>> + update_power_state(state).wait(POWER_QUERY_TIMEOUT)
>> return state
>>
>> === modified file 'src/provisioningserver/drivers/power/__init__.py'
>> --- src/provisioningserver/drivers/power/__init__.py 2015-06-23
>> 16:45:25 +0000
>> +++ src/provisioningserver/drivers/power/__init__.py 2015-07-07
>> 10:22:52 +0000
>> @@ -13,6 +13,7 @@
>>
>> __metaclass__ = type
>> __all__ = [
>> + "POWER_QUERY_TIMEOUT",
>> "PowerActionError",
>> "PowerAuthError",
>> "PowerConnError",
>> @@ -29,6 +30,7 @@
>> abstractmethod,
>> abstractproperty,
>> )
>> +from datetime import timedelta
>>
>> from jsonschema import validate
>> from provisioningserver.drivers import (
>> @@ -52,6 +54,13 @@
>> }
>>
>>
>> +# Timeout for the power query action. We might be holding up a thread for
>> that
>> +# long but some BMCs (notably seamicro) can take a long time to respond to
>> +# a power query request.
>> +# This should be configurable per-BMC.
>> +POWER_QUERY_TIMEOUT = timedelta(seconds=45).total_seconds()
>> +
>> +
>> class PowerError(Exception):
>> """Base error for all power driver failure commands."""
>>
>>
>> === modified file 'src/provisioningserver/rpc/power.py'
>> --- src/provisioningserver/rpc/power.py 2015-06-10 16:45:23 +0000
>> +++ src/provisioningserver/rpc/power.py 2015-07-07 10:22:52 +0000
>> @@ -79,7 +79,8 @@
>>
>>
>> # Timeout for change_power_state(). We set it to 5 minutes by default,
>> -# but it would be lovely if this was configurable.
>> +# but it would be lovely if this was configurable. This is only a backstop
>> +# meant to cope with broken BMCs.
>> CHANGE_POWER_STATE_TIMEOUT = timedelta(minutes=5).total_seconds()
>>
>>
>>
>>
>>
>
You're right, and that's why I used a conservative number (100). This
is meant to be temporary until we come up with a more intelligent thread
pool.
There is a deadlock indeed, it's the DNS consolidator (see my comments
on the related bug).
On 07/07/2015 04:18 PM, Blake Rouse wrote:
> Remember last time we increased this we hit database connection issues. I
> don't know if increasing the number of threads is a good idea. That is why
> we lowered it back down.
>
> Does Gavin's branch not fix the deadlock issue? Is there another deadlock
> that needs this increase?
> On Jul 7, 2015 6:27 AM, "Raphaël Badin" <email address hidden> wrote:
>
>> Raphaël Badin has proposed merging lp:~rvb/maas/tweak-timeouts into
>> lp:maas.
>>
>> Commit message:
>> Set the maximum thread pool size to 100: this is meant as a workaround for
>> deadlocks such as the one described in the linked bug. It's not meant to be
>> a permanent solution but rather to buy us time to come up with a proper
>> solution (like an "intelligent thread pool manager"). Unify the power query
>> timeout: use a unique timeout across the board and use a value that can
>> accommodate seamicro's tendency to take 30s to reply to a power query.
>>
>> Requested reviews:
>> MAAS Maintainers (maas-maintainers)
>> Related bugs:
>> Bug #1470013 in MAAS: "Failed power off/status for multiple nodes within
>> a SM15K chassis"
>> https://bugs.launchpad.net/maas/+bug/1470013
>>
>> For more details, see:
>> https://code.launchpad.net/~rvb/maas/tweak-timeouts/+merge/264018
>> --
>> Your team MAAS Maintainers is requested to review the proposed merge of
>> lp:~rvb/maas/tweak-timeouts into lp:maas.
>>
>> === modified file 'src/maasserver/api/nodes.py'
>> --- src/maasserver/api/nodes.py 2015-06-25 10:26:40 +0000
>> +++ src/maasserver/api/nodes.py 2015-07-07 10:22:52 +0000
>> @@ -76,6 +76,7 @@
>> from maasserver.utils import find_nodegroup
>> from maasserver.utils.orm import get_first
>> from piston.utils import rc
>> +from provisioningserver.drivers.power import POWER_QUERY_TIMEOUT
>> from provisioningserver.power.poweraction import (
>> PowerActionFail,
>> UnknownPowerType,
>> @@ -690,9 +691,7 @@
>> power_type=power_info.power_type,
>> context=power_info.power_parameters)
>> try:
>> - # Allow 30 seconds for the power query max as we're holding
>> - # up an appserver thread here.
>> - state = call.wait(30)
>> + state = call.wait(POWER_QUERY_TIMEOUT)
>> except crochet.TimeoutError:
>> maaslog.error(
>> "%s: Timed out waiting for power response in
>> Node.power_state",
>>
>> === modified file 'src/maasserver/plugin.py'
>> --- src/maasserver/plugin.py 2015-05-07 18:14:38 +0000
>> +++ src/maasserver/plugin.py 2015-07-07 10:22:52 +0000
>> @@ -24,6 +24,7 @@
>> from twisted.plugin import IPlugin
>> from twisted.python import usage
>> from zope.interface import implementer
>> +from twisted.internet import reactor
>>
>>
>> def serverFromString(description):
>> @@ -45,6 +46,14 @@
>> ]
>>
>>
>> +# The maximum number of threads used by the default twisted thread pool.
>> +# This value is a trade-off between a small value (such as the default:
>> 10)
>> +# which can create deadlocks (see 1470013) and a huge value which can
>> cause
>> +# MAAS to hit other limitations such as the number of open files or the
>> +# number of concurrent database connexions.
>> +MAX_THREADS = 100
>> +
>> +
>> @implementer(IServiceMaker, IPlugin)
>> class RegionServiceMaker:
>> """Create a service for the Twisted plugin."""
>> @@ -76,6 +85,10 @@
>> import crochet
>> crochet.no_setup()
>>
>> + def _configurePoolSize(self):
>> + threadpool = reactor.getThreadPool()
>> + threadpool.adjustPoolsize(10, MAX_THREADS)
>> +
>> def _makeIntrospectionService(self, endpoint):
>> from provisioningserver.utils import introspect
>> introspect_service = (
>> @@ -91,6 +104,7 @@
>> self._configureLogging()
>> self._configureDjango()
>> self._configureCrochet()
>> + self._configurePoolSize()
>>
>> # Populate the region's event-loop with services.
>> from maasserver import eventloop
>>
>> === modified file 'src/maasserver/tests/test_plugin.py'
>> --- src/maasserver/tests/test_plugin.py 2015-05-07 18:14:38 +0000
>> +++ src/maasserver/tests/test_plugin.py 2015-07-07 10:22:52 +0000
>> @@ -17,6 +17,7 @@
>> import crochet
>> from maasserver import eventloop
>> from maasserver.plugin import (
>> + MAX_THREADS,
>> Options,
>> RegionServiceMaker,
>> )
>> @@ -24,11 +25,13 @@
>> from maastesting.testcase import MAASTestCase
>> from provisioningserver import logger
>> from provisioningserver.utils.twisted import asynchronous
>> +from testtools.matchers import GreaterThan
>> from twisted.application.service import MultiService
>> +from twisted.internet import reactor
>>
>>
>> class TestOptions(MAASTestCase):
>> - """Tests for `provisioningserver.plugin.Options`."""
>> + """Tests for `maasserver.plugin.Options`."""
>>
>> def test_defaults(self):
>> options = Options()
>> @@ -42,7 +45,7 @@
>>
>>
>> class TestRegionServiceMaker(MAASTestCase):
>> - """Tests for `provisioningserver.plugin.RegionServiceMaker`."""
>> + """Tests for `maasserver.plugin.RegionServiceMaker`."""
>>
>> def setUp(self):
>> super(TestRegionServiceMaker, self).setUp()
>> @@ -78,3 +81,12 @@
>> "Not all services are named.")
>> self.assertThat(logger.basicConfig, MockCalledOnceWith())
>> self.assertThat(crochet.no_setup, MockCalledOnceWith())
>> +
>> + @asynchronous(timeout=5)
>> + def test__sets_pool_size(self):
>> + service_maker = RegionServiceMaker("Harry", "Hill")
>> + service_maker.makeService(Options())
>> + threadpool = reactor.getThreadPool()
>> + self.assertEqual(MAX_THREADS, threadpool.max)
>> + # Max threads is reasonable.
>> + self.assertThat(threadpool.max, GreaterThan(50))
>>
>> === modified file 'src/maasserver/websockets/handlers/node.py'
>> --- src/maasserver/websockets/handlers/node.py 2015-05-28 15:07:01 +0000
>> +++ src/maasserver/websockets/handlers/node.py 2015-07-07 10:22:52 +0000
>> @@ -48,6 +48,7 @@
>> )
>> from metadataserver.enum import RESULT_TYPE
>> from metadataserver.models import NodeResult
>> +from provisioningserver.drivers.power import POWER_QUERY_TIMEOUT
>> from provisioningserver.logger import get_maas_logger
>> from provisioningserver.power.poweraction import (
>> PowerActionFail,
>> @@ -547,9 +548,7 @@
>> power_type=power_info.power_type,
>> context=power_info.power_parameters)
>> try:
>> - # Allow 15 seconds for the power query max as we're
>> holding
>> - # up a thread waiting.
>> - state = call.wait(15)['state']
>> + state = call.wait(POWER_QUERY_TIMEOUT)['state']
>> except crochet.TimeoutError:
>> maaslog.error(
>> "%s: Timed out waiting for power response in "
>> @@ -569,5 +568,5 @@
>> # sure the change is committed and retried if required. Not
>> pushing
>> # this to another thread, would result in the entire power query
>> being
>> # performed again.
>> - update_power_state(state).wait(15)
>> + update_power_state(state).wait(POWER_QUERY_TIMEOUT)
>> return state
>>
>> === modified file 'src/provisioningserver/drivers/power/__init__.py'
>> --- src/provisioningserver/drivers/power/__init__.py 2015-06-23
>> 16:45:25 +0000
>> +++ src/provisioningserver/drivers/power/__init__.py 2015-07-07
>> 10:22:52 +0000
>> @@ -13,6 +13,7 @@
>>
>> __metaclass__ = type
>> __all__ = [
>> + "POWER_QUERY_TIMEOUT",
>> "PowerActionError",
>> "PowerAuthError",
>> "PowerConnError",
>> @@ -29,6 +30,7 @@
>> abstractmethod,
>> abstractproperty,
>> )
>> +from datetime import timedelta
>>
>> from jsonschema import validate
>> from provisioningserver.drivers import (
>> @@ -52,6 +54,13 @@
>> }
>>
>>
>> +# Timeout for the power query action. We might be holding up a thread for
>> that
>> +# long but some BMCs (notably seamicro) can take a long time to respond to
>> +# a power query request.
>> +# This should be configurable per-BMC.
>> +POWER_QUERY_TIMEOUT = timedelta(seconds=45).total_seconds()
>> +
>> +
>> class PowerError(Exception):
>> """Base error for all power driver failure commands."""
>>
>>
>> === modified file 'src/provisioningserver/rpc/power.py'
>> --- src/provisioningserver/rpc/power.py 2015-06-10 16:45:23 +0000
>> +++ src/provisioningserver/rpc/power.py 2015-07-07 10:22:52 +0000
>> @@ -79,7 +79,8 @@
>>
>>
>> # Timeout for change_power_state(). We set it to 5 minutes by default,
>> -# but it would be lovely if this was configurable. This is only a backstop
>> +# meant to cope with broken BMCs.
>> CHANGE_POWER_STATE_TIMEOUT = timedelta(minutes=5).total_seconds()
>>
>>
>>
>>
>>
>