Merge lp:~cjwatson/launchpad/reduce-webhook-retries into lp:launchpad

Proposed by Colin Watson
Status: Merged
Merged at revision: 18924
Proposed branch: lp:~cjwatson/launchpad/reduce-webhook-retries
Merge into: lp:launchpad
Diff against target: 173 lines (+61/-20)
2 files modified
lib/lp/services/webhooks/model.py (+15/-2)
lib/lp/services/webhooks/tests/test_job.py (+46/-18)
To merge this branch: bzr merge lp:~cjwatson/launchpad/reduce-webhook-retries
Reviewer Review Type Date Requested Status
William Grant code Approve
Review via email: mp+365873@code.launchpad.net

Commit message

Retry webhook deliveries that respond with 4xx for an hour rather than a day.

Description of the change

These are generally permanent errors due to some kind of misconfiguration, and we don't want them clogging up our job queue.

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) wrote :

I don't think it's completely reasonable to not retry 4xxs at all, but we could be less aggressive. It's not uncommon for something to 404 for a bit during a reconfiguration, for example.

Revision history for this message
Colin Watson (cjwatson) wrote :

To my mind that could just be handled using manual retries, but OK. How about this? I made it retry for up to an hour rather than for up to a day, which should still serve to ease queue depth.

Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/services/webhooks/model.py'
2--- lib/lp/services/webhooks/model.py 2017-03-28 14:28:34 +0000
3+++ lib/lp/services/webhooks/model.py 2019-04-13 08:58:38 +0000
4@@ -1,4 +1,4 @@
5-# Copyright 2015-2017 Canonical Ltd. This software is licensed under the
6+# Copyright 2015-2019 Canonical Ltd. This software is licensed under the
7 # GNU Affero General Public License version 3 (see the file LICENSE).
8
9 __metaclass__ = type
10@@ -468,7 +468,20 @@
11
12 @property
13 def retry_automatically(self):
14- return self._time_since_first_attempt < timedelta(days=1)
15+ if 'result' not in self.json_data:
16+ return False
17+ if self.json_data['result'].get('connection_error') is not None:
18+ duration = timedelta(days=1)
19+ else:
20+ status_code = self.json_data['result']['response']['status_code']
21+ if 500 <= status_code <= 599:
22+ duration = timedelta(days=1)
23+ else:
24+ # Nominally a client error, but let's retry for a little
25+ # while anyway since it's quite common for servers to return
26+ # such errors for a short time during reconfigurations.
27+ duration = timedelta(hours=1)
28+ return self._time_since_first_attempt < duration
29
30 @property
31 def retry_delay(self):
32
33=== modified file 'lib/lp/services/webhooks/tests/test_job.py'
34--- lib/lp/services/webhooks/tests/test_job.py 2018-05-31 10:23:03 +0000
35+++ lib/lp/services/webhooks/tests/test_job.py 2019-04-13 08:58:38 +0000
36@@ -1,4 +1,4 @@
37-# Copyright 2015-2018 Canonical Ltd. This software is licensed under the
38+# Copyright 2015-2019 Canonical Ltd. This software is licensed under the
39 # GNU Affero General Public License version 3 (see the file LICENSE).
40
41 """Tests for `WebhookJob`s."""
42@@ -366,8 +366,8 @@
43 job,
44 MatchesStructure(
45 status=Equals(JobStatus.COMPLETED),
46- pending=Equals(False),
47- successful=Equals(True),
48+ pending=Is(False),
49+ successful=Is(True),
50 date_sent=Not(Is(None)),
51 error_message=Is(None),
52 json_data=ContainsDict(
53@@ -414,8 +414,8 @@
54 job,
55 MatchesStructure(
56 status=Equals(JobStatus.WAITING),
57- pending=Equals(True),
58- successful=Equals(False),
59+ pending=Is(True),
60+ successful=Is(False),
61 date_sent=Not(Is(None)),
62 error_message=Equals('Bad HTTP response: 404'),
63 json_data=ContainsDict(
64@@ -437,8 +437,8 @@
65 job,
66 MatchesStructure(
67 status=Equals(JobStatus.WAITING),
68- pending=Equals(True),
69- successful=Equals(False),
70+ pending=Is(True),
71+ successful=Is(False),
72 date_sent=Not(Is(None)),
73 error_message=Equals('Connection error: Connection refused'),
74 json_data=ContainsDict(
75@@ -462,7 +462,7 @@
76 job,
77 MatchesStructure(
78 status=Equals(JobStatus.FAILED),
79- pending=Equals(False),
80+ pending=Is(False),
81 successful=Is(None),
82 date_sent=Is(None),
83 error_message=Is(None),
84@@ -483,13 +483,13 @@
85 job,
86 MatchesStructure(
87 status=Equals(JobStatus.FAILED),
88- pending=Equals(False),
89- successful=Equals(False),
90+ pending=Is(False),
91+ successful=Is(False),
92 date_sent=Is(None),
93 error_message=Equals('Webhook deactivated'),
94 json_data=ContainsDict(
95 {'result': MatchesDict(
96- {'webhook_deactivated': Equals(True)})})))
97+ {'webhook_deactivated': Is(True)})})))
98 self.assertEqual([], reqs)
99 self.assertEqual([], oopses.oopses)
100
101@@ -526,13 +526,41 @@
102 job.date_first_sent - timedelta(minutes=30)).isoformat()
103 self.assertEqual(timedelta(hours=1), job.retry_delay)
104
105- def test_retry_automatically(self):
106- # Deliveries are automatically retried until 24 hours after the
107- # initial attempt.
108+ def test_retry_automatically_connection_error(self):
109+ # Deliveries that received a connection error are automatically
110+ # retried until 24 hours after the initial attempt.
111+ job, reqs = self.makeAndRunJob(
112+ raises=requests.ConnectionError('Connection refused'))
113+ self.assertTrue(job.retry_automatically)
114+ job.json_data['date_first_sent'] = (
115+ job.date_first_sent - timedelta(hours=23)).isoformat()
116+ self.assertTrue(job.retry_automatically)
117+ job.json_data['date_first_sent'] = (
118+ job.date_first_sent - timedelta(hours=24)).isoformat()
119+ self.assertFalse(job.retry_automatically)
120+
121+ def test_retry_automatically_5xx(self):
122+ # Deliveries that received a 5xx response are automatically retried
123+ # until 24 hours after the initial attempt.
124+ job, reqs = self.makeAndRunJob(response_status=503)
125+ self.assertTrue(job.retry_automatically)
126+ job.json_data['date_first_sent'] = (
127+ job.date_first_sent - timedelta(hours=23)).isoformat()
128+ self.assertTrue(job.retry_automatically)
129+ job.json_data['date_first_sent'] = (
130+ job.date_first_sent - timedelta(hours=24)).isoformat()
131+ self.assertFalse(job.retry_automatically)
132+
133+ def test_retry_automatically_4xx(self):
134+ # Deliveries that received a non-2xx/5xx response are automatically
135+ # retried until 24 hours after the initial attempt.
136 job, reqs = self.makeAndRunJob(response_status=404)
137 self.assertTrue(job.retry_automatically)
138 job.json_data['date_first_sent'] = (
139- job.date_first_sent - timedelta(hours=24)).isoformat()
140+ job.date_first_sent - timedelta(minutes=59)).isoformat()
141+ self.assertTrue(job.retry_automatically)
142+ job.json_data['date_first_sent'] = (
143+ job.date_first_sent - timedelta(hours=1)).isoformat()
144 self.assertFalse(job.retry_automatically)
145
146 def runJob(self, job):
147@@ -551,7 +579,7 @@
148 def test_automatic_retries(self):
149 hook = self.factory.makeWebhook()
150 job = WebhookDeliveryJob.create(hook, 'test', payload={'foo': 'bar'})
151- client = MockWebhookClient(response_status=404)
152+ client = MockWebhookClient(response_status=503)
153 self.useFixture(ZopeUtilityFixture(client, IWebhookClient))
154
155 # The first attempt fails but schedules a retry five minutes later.
156@@ -585,7 +613,7 @@
157 def test_manual_retries(self):
158 hook = self.factory.makeWebhook()
159 job = WebhookDeliveryJob.create(hook, 'test', payload={'foo': 'bar'})
160- client = MockWebhookClient(response_status=404)
161+ client = MockWebhookClient(response_status=503)
162 self.useFixture(ZopeUtilityFixture(client, IWebhookClient))
163
164 # Simulate a first attempt failure.
165@@ -633,7 +661,7 @@
166 # systemic errors that erroneously failed many deliveries.
167 hook = self.factory.makeWebhook()
168 job = WebhookDeliveryJob.create(hook, 'test', payload={'foo': 'bar'})
169- client = MockWebhookClient(response_status=404)
170+ client = MockWebhookClient(response_status=503)
171 self.useFixture(ZopeUtilityFixture(client, IWebhookClient))
172
173 # Simulate a first attempt failure.