Merge lp:~jamesodhunt/upstart/bug-1357252 into lp:upstart

Proposed by James Hunt
Status: Merged
Merged at revision: 1661
Proposed branch: lp:~jamesodhunt/upstart/bug-1357252
Merge into: lp:upstart
Diff against target: 503 lines (+287/-55)
8 files modified
ChangeLog (+21/-0)
init/cgroup.c (+56/-53)
init/cgroup.h (+2/-0)
init/job.c (+30/-0)
init/job.h (+7/-0)
init/job_process.c (+15/-1)
init/job_process.h (+2/-1)
init/tests/test_job.c (+154/-0)
To merge this branch: bzr merge lp:~jamesodhunt/upstart/bug-1357252
Reviewer Review Type Date Requested Status
Stéphane Graber (community) Approve
Review via email: mp+232680@code.launchpad.net

Description of the change

* init/cgroup.c:
  - Removed nih_debug() and nih_warn() calls since, although
    useful, this output pollutes job logs when running in debug mode.
  - cgroup_clear(): New function to request cgroups be removed.
  - cgroup_create(): Don't mark cgroups 'remove-on-empty' since Upstart
    can race with cgmanager.
* init/job.c: job_last_process(): New helper function.
* init/job_process.c:
  - job_process_spawn_with_fd(): Request that the
    cgroup manager destroy all job cgroups after upstart has created
    required cgroups for last job process which avoids the
    'remove-on-empty' race (LP: #1357252).
  - job_process_error_handler(): Added handling for new
    JOB_PROCESS_ERROR_CGROUP_CLEAR error.
* init/job_process.h: JobProcessErrorType: Added new
  JOB_PROCESS_ERROR_CGROUP_CLEAR error.
* init/tests/test_job.c: test_job_last_process(): New test for
  job_last_process().

To post a comment you must log in.
Revision history for this message
Stéphane Graber (stgraber) wrote :

I suspect there may be some unwanted complications when two jobs share the same cgroup depending on the order they each move between states, but I guess it's a tiny tiny corner case.

The current fix seems sane.

review: Approve
Revision history for this message
James Hunt (jamesodhunt) wrote :

Yes, ideally we'd check to see if any other job instances were using the cgroup before calling cgroup_clear(). However, this is rather more tricky than it appears since:

1) the cgroup stanzas only get expanded in the child processes, and hence only the children know what cgroups they are actually members of.

2) the children don't have the current list of running cgroup job instances - that's back in the parent.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'ChangeLog'
--- ChangeLog 2014-08-14 11:19:43 +0000
+++ ChangeLog 2014-08-29 09:06:28 +0000
@@ -1,3 +1,24 @@
12014-08-29 James Hunt <james.hunt@ubuntu.com>
2
3 * init/cgroup.c:
4 - Removed nih_debug() and nih_warn() calls since, although
5 useful, this output pollutes job logs when running in debug mode.
6 - cgroup_clear(): New function to request cgroups be removed.
7 - cgroup_create(): Don't mark cgroups 'remove-on-empty' since Upstart
8 can race with cgmanager.
9 * init/job.c: job_last_process(): New helper function.
10 * init/job_process.c:
11 - job_process_spawn_with_fd(): Request that the
12 cgroup manager destroy all job cgroups after upstart has created
13 required cgroups for last job process which avoids the
14 'remove-on-empty' race (LP: #1357252).
15 - job_process_error_handler(): Added handling for new
16 JOB_PROCESS_ERROR_CGROUP_CLEAR error.
17 * init/job_process.h: JobProcessErrorType: Added new
18 JOB_PROCESS_ERROR_CGROUP_CLEAR error.
19 * init/tests/test_job.c: test_job_last_process(): New test for
20 job_last_process().
21
12014-08-14 James Hunt <james.hunt@ubuntu.com>222014-08-14 James Hunt <james.hunt@ubuntu.com>
223
3 * init/control.c: Disallow modifying system jobs via SetEnv,24 * init/control.c: Disallow modifying system jobs via SetEnv,
425
=== modified file 'init/cgroup.c'
--- init/cgroup.c 2014-07-01 13:17:52 +0000
+++ init/cgroup.c 2014-08-29 09:06:28 +0000
@@ -302,6 +302,57 @@
302}302}
303303
304/**304/**
305 * cgroup_clear:
306 *
307 * @cgroups: list of CGroup objects,
308 *
309 * Remove all cgroups associated with this job.
310 *
311 * Should be called by the _last_ job process to avoid racing with the
312 * cgroup manager which may remove an empty cgroup before the latest job
313 * process (which needs the cgroup) has been spawned.
314 *
315 * Returns: TRUE on success, FALSE on raised error.
316 **/
317int
318cgroup_clear (NihList *cgroups)
319{
320 nih_assert (cgroups);
321 nih_assert (cgroup_manager);
322
323 if (! cgroup_support_enabled ())
324 return TRUE;
325
326 if (NIH_LIST_EMPTY (cgroups))
327 return TRUE;
328
329 NIH_LIST_FOREACH (cgroups, iter) {
330 CGroup *cgroup = (CGroup *)iter;
331
332 NIH_LIST_FOREACH (&cgroup->names, iter2) {
333 CGroupName *cgname = (CGroupName *)iter2;
334 char *cgpath;
335 int ret;
336
337 cgpath = cgname->expanded ? cgname->expanded : cgname->name;
338
339 /* Get the cgroup manager to delete the cgroup once no more job
340 * processes remain in it.
341 */
342 ret = cgmanager_remove_on_empty_sync (NULL,
343 cgroup_manager,
344 cgroup->controller,
345 cgpath);
346
347 if (ret < 0)
348 return FALSE;
349 }
350 }
351
352 return TRUE;
353}
354
355/**
305 * cgroup_setup:356 * cgroup_setup:
306 *357 *
307 * @cgroups: list of CGroup objects,358 * @cgroups: list of CGroup objects,
@@ -316,7 +367,10 @@
316 * Returns: TRUE on success, FALSE on raised error.367 * Returns: TRUE on success, FALSE on raised error.
317 **/368 **/
318int369int
319cgroup_setup (NihList *cgroups, char * const *env, uid_t uid, gid_t gid)370cgroup_setup (NihList *cgroups,
371 char * const *env,
372 uid_t uid,
373 gid_t gid)
320{374{
321 const char *upstart_job = NULL;375 const char *upstart_job = NULL;
322 const char *upstart_instance = NULL;376 const char *upstart_instance = NULL;
@@ -1004,8 +1058,6 @@
1004 /* Drop initial reference now the proxy holds one */1058 /* Drop initial reference now the proxy holds one */
1005 dbus_connection_unref (connection);1059 dbus_connection_unref (connection);
10061060
1007 nih_debug ("Connected to cgroup manager");
1008
1009 return 0;1061 return 0;
1010}1062}
10111063
@@ -1021,8 +1073,6 @@
1021 nih_assert (connection);1073 nih_assert (connection);
1022 nih_assert (cgroup_manager_address);1074 nih_assert (cgroup_manager_address);
10231075
1024 nih_warn (_("Disconnected from cgroup manager"));
1025
1026 cgroup_manager = NULL;1076 cgroup_manager = NULL;
1027 nih_free (cgroup_manager_address);1077 nih_free (cgroup_manager_address);
1028 cgroup_manager_address = NULL;1078 cgroup_manager_address = NULL;
@@ -1075,15 +1125,11 @@
1075 UPSTART_CGROUP_ROOT,1125 UPSTART_CGROUP_ROOT,
1076 pid);1126 pid);
10771127
1078
1079 if (ret < 0)1128 if (ret < 0)
1080 return FALSE;1129 return FALSE;
1081
1082 nih_debug ("Moved pid %d to root of '%s' controller cgroup",
1083 pid, controller);
1084 }1130 }
10851131
1086 /* Ask cgmanager to create the cgroup */1132 /* Ask the cgroup manager to create the cgroup */
1087 ret = cgmanager_create_sync (NULL,1133 ret = cgmanager_create_sync (NULL,
1088 cgroup_manager,1134 cgroup_manager,
1089 controller,1135 controller,
@@ -1093,40 +1139,6 @@
1093 if (ret < 0)1139 if (ret < 0)
1094 return FALSE;1140 return FALSE;
10951141
1096 nih_debug ("%s '%s' controller cgroup '%s'",
1097 ! existed ? "Created" : "Using existing",
1098 controller, path);
1099
1100 /* Get the cgroup manager to delete the cgroup once no more job
1101 * processes remain in it. Never mind if auto-deletion occurs between
1102 * a jobs processes since the group will be recreated anyway by
1103 * cgroup_create().
1104 *
1105 * This may seem incorrect since if we create the group,
1106 * then mark it to be auto-removed when empty, surely
1107 * it will be immediately deleted? However, the way this works
1108 * is that the group will be deleted once it has _become_ empty
1109 * (having at some time *not* been empty).
1110 *
1111 * The logic of using auto-delete is slightly inefficient
1112 * in terms of cgmanager usage, but is hugely beneficial to
1113 * Upstart since it avoids having to store details of which
1114 * groups were created by jobs and also avoids the complexity of
1115 * the child (which is responsible for creating the cgroups)
1116 * pass back these details asynchronously to the parent to avoid
1117 * it blocking.
1118 */
1119 ret = cgmanager_remove_on_empty_sync (NULL,
1120 cgroup_manager,
1121 controller,
1122 path);
1123
1124 if (ret < 0)
1125 return FALSE;
1126
1127 nih_debug ("Set remove on empty for '%s' controller cgroup '%s'",
1128 controller, path);
1129
1130 return TRUE;1142 return TRUE;
1131}1143}
11321144
@@ -1162,9 +1174,6 @@
1162 if (ret < 0)1174 if (ret < 0)
1163 return FALSE;1175 return FALSE;
11641176
1165 nih_debug ("Moved pid %d to '%s' controller cgroup '%s'",
1166 pid, controller, path);
1167
1168 return TRUE;1177 return TRUE;
1169}1178}
11701179
@@ -1371,9 +1380,6 @@
1371 return FALSE;1380 return FALSE;
1372 }1381 }
13731382
1374 nih_debug ("Applied cgroup settings to '%s' controller cgroup '%s'",
1375 controller, path);
1376
1377 return TRUE;1383 return TRUE;
1378}1384}
13791385
@@ -1455,8 +1461,5 @@
1455 if (ret < 0)1461 if (ret < 0)
1456 return FALSE;1462 return FALSE;
14571463
1458 nih_debug ("Changed ownership of '%s' controller cgroup '%s'",
1459 controller, path);
1460
1461 return TRUE;1464 return TRUE;
1462}1465}
14631466
=== modified file 'init/cgroup.h'
--- init/cgroup.h 2014-07-01 13:11:34 +0000
+++ init/cgroup.h 2014-08-29 09:06:28 +0000
@@ -172,6 +172,8 @@
172int cgroup_manager_available (void)172int cgroup_manager_available (void)
173 __attribute__ ((warn_unused_result));173 __attribute__ ((warn_unused_result));
174174
175int cgroup_clear (NihList *cgroups);
176
175int cgroup_setup (NihList *cgroups, char * const *env,177int cgroup_setup (NihList *cgroups, char * const *env,
176 uid_t uid, gid_t gid)178 uid_t uid, gid_t gid)
177 __attribute__ ((warn_unused_result));179 __attribute__ ((warn_unused_result));
178180
=== modified file 'init/job.c'
--- init/job.c 2014-07-10 16:05:04 +0000
+++ init/job.c 2014-08-29 09:06:28 +0000
@@ -2683,3 +2683,33 @@
2683 nih_assert_not_reached ();2683 nih_assert_not_reached ();
2684 }2684 }
2685}2685}
2686
2687#ifdef ENABLE_CGROUPS
2688/**
2689 * job_last_process:
2690 *
2691 * @job: job,
2692 * @process: process.
2693 *
2694 * Returns: TRUE if the last defined process for @job is @process,
2695 * else FALSE.
2696 **/
2697int
2698job_last_process (const Job *job, ProcessType process)
2699{
2700 ProcessType i;
2701 ProcessType last = PROCESS_INVALID;
2702
2703 nih_assert (job);
2704 nih_assert (process >= PROCESS_MAIN);
2705 nih_assert (process < PROCESS_LAST);
2706
2707 for (i = 0; i < PROCESS_LAST; i++) {
2708 if (job->class->process[i])
2709 last = i;
2710 }
2711
2712 return last == process ? TRUE : FALSE;
2713}
2714#endif /* ENABLE_CGROUPS */
2715
26862716
=== modified file 'init/job.h'
--- init/job.h 2014-06-02 20:29:33 +0000
+++ init/job.h 2014-08-29 09:06:28 +0000
@@ -300,6 +300,13 @@
300int job_needs_cgroups (const Job *job)300int job_needs_cgroups (const Job *job)
301 __attribute__ ((warn_unused_result));301 __attribute__ ((warn_unused_result));
302302
303#ifdef ENABLE_CGROUPS
304
305int job_last_process (const Job *job, ProcessType process)
306 __attribute__ ((warn_unused_result));
307
308#endif /* ENABLE_CGROUPS */
309
303NIH_END_EXTERN310NIH_END_EXTERN
304311
305#endif /* INIT_JOB_H */312#endif /* INIT_JOB_H */
306313
=== modified file 'init/job_process.c'
--- init/job_process.c 2014-07-10 16:07:57 +0000
+++ init/job_process.c 2014-08-29 09:06:28 +0000
@@ -890,6 +890,16 @@
890 job_process_error_abort (fds[1], JOB_PROCESS_ERROR_CGROUP_SETUP, 0);890 job_process_error_abort (fds[1], JOB_PROCESS_ERROR_CGROUP_SETUP, 0);
891 }891 }
892892
893 /* If spawning the last process for the job,
894 * arrange for the cgroup manager to destroy
895 * all job cgroups relating to this job once all
896 * job processes have completed.
897 */
898 if (job_last_process (job, process)) {
899 if (! cgroup_clear (&job->class->cgroups)) {
900 job_process_error_abort (fds[1], JOB_PROCESS_ERROR_CGROUP_CLEAR, 0);
901 }
902 }
893 }903 }
894#endif /* ENABLE_CGROUPS */904#endif /* ENABLE_CGROUPS */
895905
@@ -1248,6 +1258,11 @@
1248 err, _("unable to enter cgroup: %s"),1258 err, _("unable to enter cgroup: %s"),
1249 strerror (err->errnum)));1259 strerror (err->errnum)));
1250 break;1260 break;
1261 case JOB_PROCESS_ERROR_CGROUP_CLEAR:
1262 err->error.message = NIH_MUST (nih_sprintf (
1263 err, _("unable to mark cgroups for removal: %s"),
1264 strerror (err->errnum)));
1265 break;
12511266
1252 default:1267 default:
1253 nih_assert_not_reached ();1268 nih_assert_not_reached ();
@@ -1806,7 +1821,6 @@
1806 nih_assert_not_reached ();1821 nih_assert_not_reached ();
1807 }1822 }
18081823
1809
1810 /* Cancel any timer trying to kill the job, since it's just1824 /* Cancel any timer trying to kill the job, since it's just
1811 * died. We could do this inside the main process block above, but1825 * died. We could do this inside the main process block above, but
1812 * leaving it here for now means we can use the timer for any1826 * leaving it here for now means we can use the timer for any
18131827
=== modified file 'init/job_process.h'
--- init/job_process.h 2014-07-09 16:12:52 +0000
+++ init/job_process.h 2014-08-29 09:06:28 +0000
@@ -98,7 +98,8 @@
98 JOB_PROCESS_ERROR_SECURITY,98 JOB_PROCESS_ERROR_SECURITY,
99 JOB_PROCESS_ERROR_CGROUP_MGR_CONNECT,99 JOB_PROCESS_ERROR_CGROUP_MGR_CONNECT,
100 JOB_PROCESS_ERROR_CGROUP_SETUP,100 JOB_PROCESS_ERROR_CGROUP_SETUP,
101 JOB_PROCESS_ERROR_CGROUP_ENTER101 JOB_PROCESS_ERROR_CGROUP_ENTER,
102 JOB_PROCESS_ERROR_CGROUP_CLEAR
102} JobProcessErrorType;103} JobProcessErrorType;
103104
104/**105/**
105106
=== modified file 'init/tests/test_job.c'
--- init/tests/test_job.c 2014-05-21 21:12:08 +0000
+++ init/tests/test_job.c 2014-08-29 09:06:28 +0000
@@ -7790,6 +7790,159 @@
7790 NIH_OPTION_LAST7790 NIH_OPTION_LAST
7791};7791};
77927792
7793void
7794test_job_last_process (void)
7795{
7796 ConfFile *file;
7797 ConfSource *source;
7798 JobClass *class;
7799 Job *job;
7800 int i;
7801 int ret;
7802
7803 TEST_FUNCTION ("job_last_process");
7804
7805 nih_error_init ();
7806 conf_init ();
7807 job_class_init ();
7808
7809 TEST_HASH_EMPTY (job_classes);
7810
7811 source = conf_source_new (NULL, "/tmp", CONF_JOB_DIR);
7812 TEST_NE_P (source, NULL);
7813
7814 file = conf_file_new (source, "/tmp/test");
7815 TEST_NE_P (file, NULL);
7816
7817 class = file->job = job_class_new (NULL, "test", NULL);
7818 TEST_NE_P (class, NULL);
7819
7820 job = job_new (class, "");
7821 TEST_NE_P (job, NULL);
7822
7823 TEST_HASH_EMPTY (job_classes);
7824 TEST_TRUE (job_class_consider (class));
7825 TEST_HASH_NOT_EMPTY (job_classes);
7826
7827 /*******************************************************************/
7828 TEST_FEATURE ("no job processes");
7829
7830 for (i = 0; i < PROCESS_LAST; i++) {
7831 ret = job_last_process (job, i);
7832 TEST_FALSE (ret);
7833 }
7834
7835 /*******************************************************************/
7836 TEST_FEATURE ("first job process");
7837
7838 class->process[PROCESS_MAIN] = process_new (class);
7839 TEST_NE_P (class->process[PROCESS_MAIN], NULL);
7840
7841 for (i = 0; i < PROCESS_LAST; i++) {
7842 ret = job_last_process (job, i);
7843 if (i == PROCESS_MAIN) {
7844 TEST_TRUE (ret);
7845 } else {
7846 TEST_FALSE (ret);
7847 }
7848 }
7849
7850 nih_free (class->process[PROCESS_MAIN]);
7851 class->process[PROCESS_MAIN] = NULL;
7852
7853 /*******************************************************************/
7854 TEST_FEATURE ("last job process");
7855
7856 class->process[PROCESS_SECURITY] = process_new (class);
7857 TEST_NE_P (class->process[PROCESS_SECURITY], NULL);
7858
7859 for (i = 0; i < PROCESS_LAST; i++) {
7860 ret = job_last_process (job, i);
7861 if (i == PROCESS_SECURITY) {
7862 TEST_TRUE (ret);
7863 } else {
7864 TEST_FALSE (ret);
7865 }
7866 }
7867
7868 nih_free (class->process[PROCESS_SECURITY]);
7869 class->process[PROCESS_SECURITY] = NULL;
7870
7871 /*******************************************************************/
7872 TEST_FEATURE ("PROCESS_PRE_STOP job process");
7873
7874 class->process[PROCESS_PRE_STOP] = process_new (class);
7875 TEST_NE_P (class->process[PROCESS_PRE_STOP], NULL);
7876
7877 for (i = 0; i < PROCESS_LAST; i++) {
7878 ret = job_last_process (job, i);
7879 if (i == PROCESS_PRE_STOP) {
7880 TEST_TRUE (ret);
7881 } else {
7882 TEST_FALSE (ret);
7883 }
7884 }
7885
7886 nih_free (class->process[PROCESS_PRE_STOP]);
7887 class->process[PROCESS_PRE_STOP] = NULL;
7888
7889 /*******************************************************************/
7890 TEST_FEATURE ("all job processes set");
7891
7892 for (i = 0; i < PROCESS_LAST; i++) {
7893 class->process[i] = process_new (class);
7894 TEST_NE_P (class->process[i], NULL);
7895 }
7896
7897 for (i = 0; i < PROCESS_LAST; i++) {
7898 ret = job_last_process (job, i);
7899 if (i == PROCESS_SECURITY) {
7900 TEST_TRUE (ret);
7901 } else {
7902 TEST_FALSE (ret);
7903 }
7904 }
7905
7906 for (i = 0; i < PROCESS_LAST; i++) {
7907 nih_free (class->process[i]);
7908 class->process[i] = NULL;
7909 }
7910
7911 /*******************************************************************/
7912 TEST_FEATURE ("all job processes set except PROCESS_SECURITY");
7913
7914 for (i = 0; i < PROCESS_SECURITY; i++) {
7915 class->process[i] = process_new (class);
7916 TEST_NE_P (class->process[i], NULL);
7917 }
7918
7919 for (i = 0; i < PROCESS_LAST; i++) {
7920 ret = job_last_process (job, i);
7921 if (i == PROCESS_POST_STOP) {
7922 TEST_TRUE (ret);
7923 } else {
7924 TEST_FALSE (ret);
7925 }
7926 }
7927
7928 for (i = 0; i < PROCESS_SECURITY; i++) {
7929 nih_free (class->process[i]);
7930 class->process[i] = NULL;
7931 }
7932
7933 /***********************************************************/
7934 /* clean up */
7935
7936 nih_free (conf_sources);
7937 nih_free (job_classes);
7938
7939 conf_sources = NULL;
7940 job_classes = NULL;
7941
7942 conf_init ();
7943 job_class_init ();
7944}
7945
77937946
7794int7947int
7795main (int argc,7948main (int argc,
@@ -7841,6 +7994,7 @@
7841 test_deserialise_ptrace ();7994 test_deserialise_ptrace ();
78427995
7843 test_job_find ();7996 test_job_find ();
7997 test_job_last_process ();
78447998
7845 return 0;7999 return 0;
7846}8000}

Subscribers

People subscribed via source and target branches