Merge lp:~jamesodhunt/upstart/bug-1357252 into lp:upstart

Proposed by James Hunt on 2014-08-29
Status: Merged
Merged at revision: 1661
Proposed branch: lp:~jamesodhunt/upstart/bug-1357252
Merge into: lp:upstart
Diff against target: 503 lines (+287/-55)
8 files modified
ChangeLog (+21/-0)
init/cgroup.c (+56/-53)
init/cgroup.h (+2/-0)
init/job.c (+30/-0)
init/job.h (+7/-0)
init/job_process.c (+15/-1)
init/job_process.h (+2/-1)
init/tests/test_job.c (+154/-0)
To merge this branch: bzr merge lp:~jamesodhunt/upstart/bug-1357252
Reviewer Review Type Date Requested Status
St├ęphane Graber 2014-08-29 Approve on 2014-08-29
Review via email: mp+232680@code.launchpad.net

Description of the change

* init/cgroup.c:
  - Removed nih_debug() and nih_warn() calls since, although
    useful, this output pollutes job logs when running in debug mode.
  - cgroup_clear(): New function to request cgroups be removed.
  - cgroup_create(): Don't mark cgroups 'remove-on-empty' since Upstart
    can race with cgmanager.
* init/job.c: job_last_process(): New helper function.
* init/job_process.c:
  - job_process_spawn_with_fd(): Request that the
    cgroup manager destroy all job cgroups after upstart has created
    required cgroups for last job process which avoids the
    'remove-on-empty' race (LP: #1357252).
  - job_process_error_handler(): Added handling for new
    JOB_PROCESS_ERROR_CGROUP_CLEAR error.
* init/job_process.h: JobProcessErrorType: Added new
  JOB_PROCESS_ERROR_CGROUP_CLEAR error.
* init/tests/test_job.c: test_job_last_process(): New test for
  job_last_process().

To post a comment you must log in.
St├ęphane Graber (stgraber) wrote :

I suspect there may be some unwanted complications when two jobs share the same cgroup depending on the order they each move between states, but I guess it's a tiny tiny corner case.

The current fix seems sane.

review: Approve
James Hunt (jamesodhunt) wrote :

Yes, ideally we'd check to see if any other job instances were using the cgroup before calling cgroup_clear(). However, this is rather more tricky than it appears since:

1) the cgroup stanzas only get expanded in the child processes, and hence only the children know what cgroups they are actually members of.

2) the children don't have the current list of running cgroup job instances - that's back in the parent.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'ChangeLog'
2--- ChangeLog 2014-08-14 11:19:43 +0000
3+++ ChangeLog 2014-08-29 09:06:28 +0000
4@@ -1,3 +1,24 @@
5+2014-08-29 James Hunt <james.hunt@ubuntu.com>
6+
7+ * init/cgroup.c:
8+ - Removed nih_debug() and nih_warn() calls since, although
9+ useful, this output pollutes job logs when running in debug mode.
10+ - cgroup_clear(): New function to request cgroups be removed.
11+ - cgroup_create(): Don't mark cgroups 'remove-on-empty' since Upstart
12+ can race with cgmanager.
13+ * init/job.c: job_last_process(): New helper function.
14+ * init/job_process.c:
15+ - job_process_spawn_with_fd(): Request that the
16+ cgroup manager destroy all job cgroups after upstart has created
17+ required cgroups for last job process which avoids the
18+ 'remove-on-empty' race (LP: #1357252).
19+ - job_process_error_handler(): Added handling for new
20+ JOB_PROCESS_ERROR_CGROUP_CLEAR error.
21+ * init/job_process.h: JobProcessErrorType: Added new
22+ JOB_PROCESS_ERROR_CGROUP_CLEAR error.
23+ * init/tests/test_job.c: test_job_last_process(): New test for
24+ job_last_process().
25+
26 2014-08-14 James Hunt <james.hunt@ubuntu.com>
27
28 * init/control.c: Disallow modifying system jobs via SetEnv,
29
30=== modified file 'init/cgroup.c'
31--- init/cgroup.c 2014-07-01 13:17:52 +0000
32+++ init/cgroup.c 2014-08-29 09:06:28 +0000
33@@ -302,6 +302,57 @@
34 }
35
36 /**
37+ * cgroup_clear:
38+ *
39+ * @cgroups: list of CGroup objects,
40+ *
41+ * Remove all cgroups associated with this job.
42+ *
43+ * Should be called by the _last_ job process to avoid racing with the
44+ * cgroup manager which may remove an empty cgroup before the latest job
45+ * process (which needs the cgroup) has been spawned.
46+ *
47+ * Returns: TRUE on success, FALSE on raised error.
48+ **/
49+int
50+cgroup_clear (NihList *cgroups)
51+{
52+ nih_assert (cgroups);
53+ nih_assert (cgroup_manager);
54+
55+ if (! cgroup_support_enabled ())
56+ return TRUE;
57+
58+ if (NIH_LIST_EMPTY (cgroups))
59+ return TRUE;
60+
61+ NIH_LIST_FOREACH (cgroups, iter) {
62+ CGroup *cgroup = (CGroup *)iter;
63+
64+ NIH_LIST_FOREACH (&cgroup->names, iter2) {
65+ CGroupName *cgname = (CGroupName *)iter2;
66+ char *cgpath;
67+ int ret;
68+
69+ cgpath = cgname->expanded ? cgname->expanded : cgname->name;
70+
71+ /* Get the cgroup manager to delete the cgroup once no more job
72+ * processes remain in it.
73+ */
74+ ret = cgmanager_remove_on_empty_sync (NULL,
75+ cgroup_manager,
76+ cgroup->controller,
77+ cgpath);
78+
79+ if (ret < 0)
80+ return FALSE;
81+ }
82+ }
83+
84+ return TRUE;
85+}
86+
87+/**
88 * cgroup_setup:
89 *
90 * @cgroups: list of CGroup objects,
91@@ -316,7 +367,10 @@
92 * Returns: TRUE on success, FALSE on raised error.
93 **/
94 int
95-cgroup_setup (NihList *cgroups, char * const *env, uid_t uid, gid_t gid)
96+cgroup_setup (NihList *cgroups,
97+ char * const *env,
98+ uid_t uid,
99+ gid_t gid)
100 {
101 const char *upstart_job = NULL;
102 const char *upstart_instance = NULL;
103@@ -1004,8 +1058,6 @@
104 /* Drop initial reference now the proxy holds one */
105 dbus_connection_unref (connection);
106
107- nih_debug ("Connected to cgroup manager");
108-
109 return 0;
110 }
111
112@@ -1021,8 +1073,6 @@
113 nih_assert (connection);
114 nih_assert (cgroup_manager_address);
115
116- nih_warn (_("Disconnected from cgroup manager"));
117-
118 cgroup_manager = NULL;
119 nih_free (cgroup_manager_address);
120 cgroup_manager_address = NULL;
121@@ -1075,15 +1125,11 @@
122 UPSTART_CGROUP_ROOT,
123 pid);
124
125-
126 if (ret < 0)
127 return FALSE;
128-
129- nih_debug ("Moved pid %d to root of '%s' controller cgroup",
130- pid, controller);
131 }
132
133- /* Ask cgmanager to create the cgroup */
134+ /* Ask the cgroup manager to create the cgroup */
135 ret = cgmanager_create_sync (NULL,
136 cgroup_manager,
137 controller,
138@@ -1093,40 +1139,6 @@
139 if (ret < 0)
140 return FALSE;
141
142- nih_debug ("%s '%s' controller cgroup '%s'",
143- ! existed ? "Created" : "Using existing",
144- controller, path);
145-
146- /* Get the cgroup manager to delete the cgroup once no more job
147- * processes remain in it. Never mind if auto-deletion occurs between
148- * a jobs processes since the group will be recreated anyway by
149- * cgroup_create().
150- *
151- * This may seem incorrect since if we create the group,
152- * then mark it to be auto-removed when empty, surely
153- * it will be immediately deleted? However, the way this works
154- * is that the group will be deleted once it has _become_ empty
155- * (having at some time *not* been empty).
156- *
157- * The logic of using auto-delete is slightly inefficient
158- * in terms of cgmanager usage, but is hugely beneficial to
159- * Upstart since it avoids having to store details of which
160- * groups were created by jobs and also avoids the complexity of
161- * the child (which is responsible for creating the cgroups)
162- * pass back these details asynchronously to the parent to avoid
163- * it blocking.
164- */
165- ret = cgmanager_remove_on_empty_sync (NULL,
166- cgroup_manager,
167- controller,
168- path);
169-
170- if (ret < 0)
171- return FALSE;
172-
173- nih_debug ("Set remove on empty for '%s' controller cgroup '%s'",
174- controller, path);
175-
176 return TRUE;
177 }
178
179@@ -1162,9 +1174,6 @@
180 if (ret < 0)
181 return FALSE;
182
183- nih_debug ("Moved pid %d to '%s' controller cgroup '%s'",
184- pid, controller, path);
185-
186 return TRUE;
187 }
188
189@@ -1371,9 +1380,6 @@
190 return FALSE;
191 }
192
193- nih_debug ("Applied cgroup settings to '%s' controller cgroup '%s'",
194- controller, path);
195-
196 return TRUE;
197 }
198
199@@ -1455,8 +1461,5 @@
200 if (ret < 0)
201 return FALSE;
202
203- nih_debug ("Changed ownership of '%s' controller cgroup '%s'",
204- controller, path);
205-
206 return TRUE;
207 }
208
209=== modified file 'init/cgroup.h'
210--- init/cgroup.h 2014-07-01 13:11:34 +0000
211+++ init/cgroup.h 2014-08-29 09:06:28 +0000
212@@ -172,6 +172,8 @@
213 int cgroup_manager_available (void)
214 __attribute__ ((warn_unused_result));
215
216+int cgroup_clear (NihList *cgroups);
217+
218 int cgroup_setup (NihList *cgroups, char * const *env,
219 uid_t uid, gid_t gid)
220 __attribute__ ((warn_unused_result));
221
222=== modified file 'init/job.c'
223--- init/job.c 2014-07-10 16:05:04 +0000
224+++ init/job.c 2014-08-29 09:06:28 +0000
225@@ -2683,3 +2683,33 @@
226 nih_assert_not_reached ();
227 }
228 }
229+
230+#ifdef ENABLE_CGROUPS
231+/**
232+ * job_last_process:
233+ *
234+ * @job: job,
235+ * @process: process.
236+ *
237+ * Returns: TRUE if the last defined process for @job is @process,
238+ * else FALSE.
239+ **/
240+int
241+job_last_process (const Job *job, ProcessType process)
242+{
243+ ProcessType i;
244+ ProcessType last = PROCESS_INVALID;
245+
246+ nih_assert (job);
247+ nih_assert (process >= PROCESS_MAIN);
248+ nih_assert (process < PROCESS_LAST);
249+
250+ for (i = 0; i < PROCESS_LAST; i++) {
251+ if (job->class->process[i])
252+ last = i;
253+ }
254+
255+ return last == process ? TRUE : FALSE;
256+}
257+#endif /* ENABLE_CGROUPS */
258+
259
260=== modified file 'init/job.h'
261--- init/job.h 2014-06-02 20:29:33 +0000
262+++ init/job.h 2014-08-29 09:06:28 +0000
263@@ -300,6 +300,13 @@
264 int job_needs_cgroups (const Job *job)
265 __attribute__ ((warn_unused_result));
266
267+#ifdef ENABLE_CGROUPS
268+
269+int job_last_process (const Job *job, ProcessType process)
270+ __attribute__ ((warn_unused_result));
271+
272+#endif /* ENABLE_CGROUPS */
273+
274 NIH_END_EXTERN
275
276 #endif /* INIT_JOB_H */
277
278=== modified file 'init/job_process.c'
279--- init/job_process.c 2014-07-10 16:07:57 +0000
280+++ init/job_process.c 2014-08-29 09:06:28 +0000
281@@ -890,6 +890,16 @@
282 job_process_error_abort (fds[1], JOB_PROCESS_ERROR_CGROUP_SETUP, 0);
283 }
284
285+ /* If spawning the last process for the job,
286+ * arrange for the cgroup manager to destroy
287+ * all job cgroups relating to this job once all
288+ * job processes have completed.
289+ */
290+ if (job_last_process (job, process)) {
291+ if (! cgroup_clear (&job->class->cgroups)) {
292+ job_process_error_abort (fds[1], JOB_PROCESS_ERROR_CGROUP_CLEAR, 0);
293+ }
294+ }
295 }
296 #endif /* ENABLE_CGROUPS */
297
298@@ -1248,6 +1258,11 @@
299 err, _("unable to enter cgroup: %s"),
300 strerror (err->errnum)));
301 break;
302+ case JOB_PROCESS_ERROR_CGROUP_CLEAR:
303+ err->error.message = NIH_MUST (nih_sprintf (
304+ err, _("unable to mark cgroups for removal: %s"),
305+ strerror (err->errnum)));
306+ break;
307
308 default:
309 nih_assert_not_reached ();
310@@ -1806,7 +1821,6 @@
311 nih_assert_not_reached ();
312 }
313
314-
315 /* Cancel any timer trying to kill the job, since it's just
316 * died. We could do this inside the main process block above, but
317 * leaving it here for now means we can use the timer for any
318
319=== modified file 'init/job_process.h'
320--- init/job_process.h 2014-07-09 16:12:52 +0000
321+++ init/job_process.h 2014-08-29 09:06:28 +0000
322@@ -98,7 +98,8 @@
323 JOB_PROCESS_ERROR_SECURITY,
324 JOB_PROCESS_ERROR_CGROUP_MGR_CONNECT,
325 JOB_PROCESS_ERROR_CGROUP_SETUP,
326- JOB_PROCESS_ERROR_CGROUP_ENTER
327+ JOB_PROCESS_ERROR_CGROUP_ENTER,
328+ JOB_PROCESS_ERROR_CGROUP_CLEAR
329 } JobProcessErrorType;
330
331 /**
332
333=== modified file 'init/tests/test_job.c'
334--- init/tests/test_job.c 2014-05-21 21:12:08 +0000
335+++ init/tests/test_job.c 2014-08-29 09:06:28 +0000
336@@ -7790,6 +7790,159 @@
337 NIH_OPTION_LAST
338 };
339
340+void
341+test_job_last_process (void)
342+{
343+ ConfFile *file;
344+ ConfSource *source;
345+ JobClass *class;
346+ Job *job;
347+ int i;
348+ int ret;
349+
350+ TEST_FUNCTION ("job_last_process");
351+
352+ nih_error_init ();
353+ conf_init ();
354+ job_class_init ();
355+
356+ TEST_HASH_EMPTY (job_classes);
357+
358+ source = conf_source_new (NULL, "/tmp", CONF_JOB_DIR);
359+ TEST_NE_P (source, NULL);
360+
361+ file = conf_file_new (source, "/tmp/test");
362+ TEST_NE_P (file, NULL);
363+
364+ class = file->job = job_class_new (NULL, "test", NULL);
365+ TEST_NE_P (class, NULL);
366+
367+ job = job_new (class, "");
368+ TEST_NE_P (job, NULL);
369+
370+ TEST_HASH_EMPTY (job_classes);
371+ TEST_TRUE (job_class_consider (class));
372+ TEST_HASH_NOT_EMPTY (job_classes);
373+
374+ /*******************************************************************/
375+ TEST_FEATURE ("no job processes");
376+
377+ for (i = 0; i < PROCESS_LAST; i++) {
378+ ret = job_last_process (job, i);
379+ TEST_FALSE (ret);
380+ }
381+
382+ /*******************************************************************/
383+ TEST_FEATURE ("first job process");
384+
385+ class->process[PROCESS_MAIN] = process_new (class);
386+ TEST_NE_P (class->process[PROCESS_MAIN], NULL);
387+
388+ for (i = 0; i < PROCESS_LAST; i++) {
389+ ret = job_last_process (job, i);
390+ if (i == PROCESS_MAIN) {
391+ TEST_TRUE (ret);
392+ } else {
393+ TEST_FALSE (ret);
394+ }
395+ }
396+
397+ nih_free (class->process[PROCESS_MAIN]);
398+ class->process[PROCESS_MAIN] = NULL;
399+
400+ /*******************************************************************/
401+ TEST_FEATURE ("last job process");
402+
403+ class->process[PROCESS_SECURITY] = process_new (class);
404+ TEST_NE_P (class->process[PROCESS_SECURITY], NULL);
405+
406+ for (i = 0; i < PROCESS_LAST; i++) {
407+ ret = job_last_process (job, i);
408+ if (i == PROCESS_SECURITY) {
409+ TEST_TRUE (ret);
410+ } else {
411+ TEST_FALSE (ret);
412+ }
413+ }
414+
415+ nih_free (class->process[PROCESS_SECURITY]);
416+ class->process[PROCESS_SECURITY] = NULL;
417+
418+ /*******************************************************************/
419+ TEST_FEATURE ("PROCESS_PRE_STOP job process");
420+
421+ class->process[PROCESS_PRE_STOP] = process_new (class);
422+ TEST_NE_P (class->process[PROCESS_PRE_STOP], NULL);
423+
424+ for (i = 0; i < PROCESS_LAST; i++) {
425+ ret = job_last_process (job, i);
426+ if (i == PROCESS_PRE_STOP) {
427+ TEST_TRUE (ret);
428+ } else {
429+ TEST_FALSE (ret);
430+ }
431+ }
432+
433+ nih_free (class->process[PROCESS_PRE_STOP]);
434+ class->process[PROCESS_PRE_STOP] = NULL;
435+
436+ /*******************************************************************/
437+ TEST_FEATURE ("all job processes set");
438+
439+ for (i = 0; i < PROCESS_LAST; i++) {
440+ class->process[i] = process_new (class);
441+ TEST_NE_P (class->process[i], NULL);
442+ }
443+
444+ for (i = 0; i < PROCESS_LAST; i++) {
445+ ret = job_last_process (job, i);
446+ if (i == PROCESS_SECURITY) {
447+ TEST_TRUE (ret);
448+ } else {
449+ TEST_FALSE (ret);
450+ }
451+ }
452+
453+ for (i = 0; i < PROCESS_LAST; i++) {
454+ nih_free (class->process[i]);
455+ class->process[i] = NULL;
456+ }
457+
458+ /*******************************************************************/
459+ TEST_FEATURE ("all job processes set except PROCESS_SECURITY");
460+
461+ for (i = 0; i < PROCESS_SECURITY; i++) {
462+ class->process[i] = process_new (class);
463+ TEST_NE_P (class->process[i], NULL);
464+ }
465+
466+ for (i = 0; i < PROCESS_LAST; i++) {
467+ ret = job_last_process (job, i);
468+ if (i == PROCESS_POST_STOP) {
469+ TEST_TRUE (ret);
470+ } else {
471+ TEST_FALSE (ret);
472+ }
473+ }
474+
475+ for (i = 0; i < PROCESS_SECURITY; i++) {
476+ nih_free (class->process[i]);
477+ class->process[i] = NULL;
478+ }
479+
480+ /***********************************************************/
481+ /* clean up */
482+
483+ nih_free (conf_sources);
484+ nih_free (job_classes);
485+
486+ conf_sources = NULL;
487+ job_classes = NULL;
488+
489+ conf_init ();
490+ job_class_init ();
491+}
492+
493
494 int
495 main (int argc,
496@@ -7841,6 +7994,7 @@
497 test_deserialise_ptrace ();
498
499 test_job_find ();
500+ test_job_last_process ();
501
502 return 0;
503 }

Subscribers

People subscribed via source and target branches