Branch data Line data Source code
1 : : /*
2 : : * Generic process-grouping system.
3 : : *
4 : : * Based originally on the cpuset system, extracted by Paul Menage
5 : : * Copyright (C) 2006 Google, Inc
6 : : *
7 : : * Notifications support
8 : : * Copyright (C) 2009 Nokia Corporation
9 : : * Author: Kirill A. Shutemov
10 : : *
11 : : * Copyright notices from the original cpuset code:
12 : : * --------------------------------------------------
13 : : * Copyright (C) 2003 BULL SA.
14 : : * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 : : *
16 : : * Portions derived from Patrick Mochel's sysfs code.
17 : : * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 : : *
19 : : * 2003-10-10 Written by Simon Derr.
20 : : * 2003-10-22 Updates by Stephen Hemminger.
21 : : * 2004 May-July Rework by Paul Jackson.
22 : : * ---------------------------------------------------
23 : : *
24 : : * This file is subject to the terms and conditions of the GNU General Public
25 : : * License. See the file COPYING in the main directory of the Linux
26 : : * distribution for more details.
27 : : */
28 : :
29 : : #include <linux/cgroup.h>
30 : : #include <linux/cred.h>
31 : : #include <linux/ctype.h>
32 : : #include <linux/errno.h>
33 : : #include <linux/init_task.h>
34 : : #include <linux/kernel.h>
35 : : #include <linux/list.h>
36 : : #include <linux/mm.h>
37 : : #include <linux/mutex.h>
38 : : #include <linux/mount.h>
39 : : #include <linux/pagemap.h>
40 : : #include <linux/proc_fs.h>
41 : : #include <linux/rcupdate.h>
42 : : #include <linux/sched.h>
43 : : #include <linux/backing-dev.h>
44 : : #include <linux/slab.h>
45 : : #include <linux/magic.h>
46 : : #include <linux/spinlock.h>
47 : : #include <linux/string.h>
48 : : #include <linux/sort.h>
49 : : #include <linux/kmod.h>
50 : : #include <linux/module.h>
51 : : #include <linux/delayacct.h>
52 : : #include <linux/cgroupstats.h>
53 : : #include <linux/hashtable.h>
54 : : #include <linux/namei.h>
55 : : #include <linux/pid_namespace.h>
56 : : #include <linux/idr.h>
57 : : #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58 : : #include <linux/flex_array.h> /* used in cgroup_attach_task */
59 : : #include <linux/kthread.h>
60 : :
61 : : #include <linux/atomic.h>
62 : :
63 : : /*
 64 : : * pidlists linger the following amount before being destroyed. The goal
 65 : : * is to avoid frequent destruction in the middle of consecutive read calls.
 66 : : * Expiring in the middle is a performance problem, not a correctness one.
67 : : * 1 sec should be enough.
68 : : */
69 : : #define CGROUP_PIDLIST_DESTROY_DELAY HZ
70 : :
71 : : /*
72 : : * cgroup_mutex is the master lock. Any modification to cgroup or its
73 : : * hierarchy must be performed while holding it.
74 : : *
75 : : * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
76 : : * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
77 : : * release_agent_path and so on. Modifying requires both cgroup_mutex and
78 : : * cgroup_root_mutex. Readers can acquire either of the two. This is to
79 : : * break the following locking order cycle.
80 : : *
81 : : * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
82 : : * B. namespace_sem -> cgroup_mutex
83 : : *
84 : : * B happens only through cgroup_show_options() and using cgroup_root_mutex
85 : : * breaks it.
86 : : */
87 : : #ifdef CONFIG_PROVE_RCU
88 : : DEFINE_MUTEX(cgroup_mutex);
89 : : EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
90 : : #else
91 : : static DEFINE_MUTEX(cgroup_mutex);
92 : : #endif
93 : :
94 : : static DEFINE_MUTEX(cgroup_root_mutex);
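/*
 * Editor's illustrative sketch, not part of the original kernel/cgroup.c:
 * the write-side pattern described in the locking comment above. A
 * hypothetical helper that modifies a cgroupfs_root field takes both
 * mutexes, outer lock first; a reader such as cgroup_show_options() may
 * hold either one.
 */
static void example_set_root_flags(struct cgroupfs_root *root,
				   unsigned long flags)
{
	mutex_lock(&cgroup_mutex);		/* outer lock, always taken first */
	mutex_lock(&cgroup_root_mutex);		/* nests inside cgroup_mutex */
	root->flags = flags;			/* root fields need both mutexes to change */
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
}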
95 : :
96 : : #define cgroup_assert_mutex_or_rcu_locked() \
97 : : rcu_lockdep_assert(rcu_read_lock_held() || \
98 : : lockdep_is_held(&cgroup_mutex), \
99 : : "cgroup_mutex or RCU read lock required");
100 : :
101 : : #ifdef CONFIG_LOCKDEP
102 : : #define cgroup_assert_mutex_or_root_locked() \
103 : : WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 : : !lockdep_is_held(&cgroup_root_mutex)))
105 : : #else
106 : : #define cgroup_assert_mutex_or_root_locked() do { } while (0)
107 : : #endif
108 : :
109 : : /*
110 : : * cgroup destruction makes heavy use of work items and there can be a lot
111 : : * of concurrent destructions. Use a separate workqueue so that cgroup
112 : : * destruction work items don't end up filling up max_active of system_wq
113 : : * which may lead to deadlock.
114 : : */
115 : : static struct workqueue_struct *cgroup_destroy_wq;
116 : :
117 : : /*
118 : : * pidlist destructions need to be flushed on cgroup destruction. Use a
119 : : * separate workqueue as flush domain.
120 : : */
121 : : static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122 : :
123 : : /*
124 : : * Generate an array of cgroup subsystem pointers. At boot time, this is
125 : : * populated with the built in subsystems, and modular subsystems are
126 : : * registered after that. The mutable section of this array is protected by
127 : : * cgroup_mutex.
128 : : */
129 : : #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
130 : : #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
131 : : static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
132 : : #include <linux/cgroup_subsys.h>
133 : : };
134 : :
135 : : /*
136 : : * The dummy hierarchy, reserved for the subsystems that are otherwise
137 : : * unattached - it never has more than a single cgroup, and all tasks are
138 : : * part of that cgroup.
139 : : */
140 : : static struct cgroupfs_root cgroup_dummy_root;
141 : :
142 : : /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
143 : : static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
144 : :
145 : : /* The list of hierarchy roots */
146 : :
147 : : static LIST_HEAD(cgroup_roots);
148 : : static int cgroup_root_count;
149 : :
150 : : /*
151 : : * Hierarchy ID allocation and mapping. It follows the same exclusion
152 : : * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
153 : : * writes, either for reads.
154 : : */
155 : : static DEFINE_IDR(cgroup_hierarchy_idr);
156 : :
157 : : static struct cgroup_name root_cgroup_name = { .name = "/" };
158 : :
159 : : /*
160 : : * Assign a monotonically increasing serial number to cgroups. It
161 : : * guarantees cgroups with bigger numbers are newer than those with smaller
162 : : * numbers. Also, as cgroups are always appended to the parent's
163 : : * ->children list, it guarantees that sibling cgroups are always sorted in
164 : : * the ascending serial number order on the list. Protected by
165 : : * cgroup_mutex.
166 : : */
167 : : static u64 cgroup_serial_nr_next = 1;
168 : :
169 : : /* This flag indicates whether tasks in the fork and exit paths should
170 : : * check for fork/exit handlers to call. This avoids us having to do
171 : : * extra work in the fork/exit path if none of the subsystems need to
172 : : * be called.
173 : : */
174 : : static int need_forkexit_callback __read_mostly;
175 : :
176 : : static struct cftype cgroup_base_files[];
177 : :
178 : : static void cgroup_destroy_css_killed(struct cgroup *cgrp);
179 : : static int cgroup_destroy_locked(struct cgroup *cgrp);
180 : : static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
181 : : bool is_add);
182 : : static int cgroup_file_release(struct inode *inode, struct file *file);
183 : : static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
184 : :
185 : : /**
186 : : * cgroup_css - obtain a cgroup's css for the specified subsystem
187 : : * @cgrp: the cgroup of interest
188 : : * @ss: the subsystem of interest (%NULL returns the dummy_css)
189 : : *
190 : : * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
191 : : * function must be called either under cgroup_mutex or rcu_read_lock() and
192 : : * the caller is responsible for pinning the returned css if it wants to
193 : : * keep accessing it outside the said locks. This function may return
 194 : : * %NULL if @cgrp doesn't have @ss enabled.
195 : : */
196 : : static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
197 : : struct cgroup_subsys *ss)
198 : : {
199 [ # # ][ # # : 5 : if (ss)
# # # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # - + ]
[ # # ]
200 : 0 : return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
201 : : lockdep_is_held(&cgroup_mutex));
202 : : else
203 : 5 : return &cgrp->dummy_css;
204 : : }
205 : :
206 : : /* convenient tests for these bits */
207 : : static inline bool cgroup_is_dead(const struct cgroup *cgrp)
208 : : {
209 : 0 : return test_bit(CGRP_DEAD, &cgrp->flags);
210 : : }
211 : :
212 : : /**
213 : : * cgroup_is_descendant - test ancestry
214 : : * @cgrp: the cgroup to be tested
215 : : * @ancestor: possible ancestor of @cgrp
216 : : *
217 : : * Test whether @cgrp is a descendant of @ancestor. It also returns %true
218 : : * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
219 : : * and @ancestor are accessible.
220 : : */
221 : 0 : bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
222 : : {
223 [ # # ]: 0 : while (cgrp) {
224 [ # # ]: 0 : if (cgrp == ancestor)
225 : : return true;
226 : 0 : cgrp = cgrp->parent;
227 : : }
228 : : return false;
229 : : }
230 : : EXPORT_SYMBOL_GPL(cgroup_is_descendant);
231 : :
232 : : static int cgroup_is_releasable(const struct cgroup *cgrp)
233 : : {
234 : : const int bits =
235 : : (1 << CGRP_RELEASABLE) |
236 : : (1 << CGRP_NOTIFY_ON_RELEASE);
237 : 2 : return (cgrp->flags & bits) == bits;
238 : : }
239 : :
240 : : static int notify_on_release(const struct cgroup *cgrp)
241 : : {
242 : : return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
243 : : }
244 : :
245 : : /**
246 : : * for_each_css - iterate all css's of a cgroup
247 : : * @css: the iteration cursor
248 : : * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 : : * @cgrp: the target cgroup to iterate css's of
250 : : *
251 : : * Should be called under cgroup_mutex.
252 : : */
253 : : #define for_each_css(css, ssid, cgrp) \
254 : : for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 : : if (!((css) = rcu_dereference_check( \
256 : : (cgrp)->subsys[(ssid)], \
257 : : lockdep_is_held(&cgroup_mutex)))) { } \
258 : : else
259 : :
260 : : /**
261 : : * for_each_subsys - iterate all loaded cgroup subsystems
262 : : * @ss: the iteration cursor
263 : : * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
264 : : *
265 : : * Iterates through all loaded subsystems. Should be called under
266 : : * cgroup_mutex or cgroup_root_mutex.
267 : : */
268 : : #define for_each_subsys(ss, ssid) \
269 : : for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
270 : : (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
271 : : if (!((ss) = cgroup_subsys[(ssid)])) { } \
272 : : else
273 : :
274 : : /**
275 : : * for_each_builtin_subsys - iterate all built-in cgroup subsystems
276 : : * @ss: the iteration cursor
277 : : * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
278 : : *
 279 : : * Built-in subsystems are always present and iteration itself doesn't
280 : : * require any synchronization.
281 : : */
282 : : #define for_each_builtin_subsys(ss, i) \
283 : : for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
284 : : (((ss) = cgroup_subsys[i]) || true); (i)++)
285 : :
286 : : /* iterate across the active hierarchies */
287 : : #define for_each_active_root(root) \
288 : : list_for_each_entry((root), &cgroup_roots, root_list)
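/*
 * Editor's illustrative sketch, not part of the original file: typical use
 * of for_each_subsys(). The helper name is hypothetical; per the macro's
 * comment, the caller must hold cgroup_mutex or cgroup_root_mutex.
 */
static int example_count_enabled_subsystems(void)
{
	struct cgroup_subsys *ss;
	int ssid, nr = 0;

	for_each_subsys(ss, ssid)
		if (!ss->disabled)
			nr++;
	return nr;
}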
289 : :
290 : : static inline struct cgroup *__d_cgrp(struct dentry *dentry)
291 : : {
292 : : return dentry->d_fsdata;
293 : : }
294 : :
295 : : static inline struct cfent *__d_cfe(struct dentry *dentry)
296 : : {
297 : : return dentry->d_fsdata;
298 : : }
299 : :
300 : : static inline struct cftype *__d_cft(struct dentry *dentry)
301 : : {
302 : 7 : return __d_cfe(dentry)->type;
303 : : }
304 : :
305 : : /**
306 : : * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
307 : : * @cgrp: the cgroup to be checked for liveness
308 : : *
309 : : * On success, returns true; the mutex should be later unlocked. On
310 : : * failure returns false with no lock held.
311 : : */
312 : 0 : static bool cgroup_lock_live_group(struct cgroup *cgrp)
313 : : {
314 : 2 : mutex_lock(&cgroup_mutex);
315 [ - + ]: 2 : if (cgroup_is_dead(cgrp)) {
316 : 0 : mutex_unlock(&cgroup_mutex);
317 : 0 : return false;
318 : : }
319 : : return true;
320 : : }
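/*
 * Editor's illustrative sketch, not part of the original file: callers of
 * cgroup_lock_live_group() only unlock on the success path, since the
 * failure path returns with cgroup_mutex already dropped.
 */
static int example_update_live_cgroup(struct cgroup *cgrp)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;		/* cgroup is dead, no lock is held */
	/* ... operate on the live @cgrp under cgroup_mutex ... */
	mutex_unlock(&cgroup_mutex);
	return 0;
}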
321 : :
322 : : /* the list of cgroups eligible for automatic release. Protected by
323 : : * release_list_lock */
324 : : static LIST_HEAD(release_list);
325 : : static DEFINE_RAW_SPINLOCK(release_list_lock);
326 : : static void cgroup_release_agent(struct work_struct *work);
327 : : static DECLARE_WORK(release_agent_work, cgroup_release_agent);
328 : : static void check_for_release(struct cgroup *cgrp);
329 : :
330 : : /*
331 : : * A cgroup can be associated with multiple css_sets as different tasks may
332 : : * belong to different cgroups on different hierarchies. In the other
333 : : * direction, a css_set is naturally associated with multiple cgroups.
334 : : * This M:N relationship is represented by the following link structure
335 : : * which exists for each association and allows traversing the associations
336 : : * from both sides.
337 : : */
338 : : struct cgrp_cset_link {
339 : : /* the cgroup and css_set this link associates */
340 : : struct cgroup *cgrp;
341 : : struct css_set *cset;
342 : :
343 : : /* list of cgrp_cset_links anchored at cgrp->cset_links */
344 : : struct list_head cset_link;
345 : :
346 : : /* list of cgrp_cset_links anchored at css_set->cgrp_links */
347 : : struct list_head cgrp_link;
348 : : };
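/*
 * Editor's illustrative sketch, not part of the original file: walking one
 * side of the M:N relation. The helper name is hypothetical and css_set_lock
 * (defined a little further below) must be held for reading.
 */
static int example_count_csets_of_cgroup(struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;
	int nr = 0;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		nr++;	/* link->cset is one css_set with tasks in @cgrp */
	read_unlock(&css_set_lock);
	return nr;
}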
349 : :
350 : : /* The default css_set - used by init and its children prior to any
351 : : * hierarchies being mounted. It contains a pointer to the root state
352 : : * for each subsystem. Also used to anchor the list of css_sets. Not
353 : : * reference-counted, to improve performance when child cgroups
354 : : * haven't been created.
355 : : */
356 : :
357 : : static struct css_set init_css_set;
358 : : static struct cgrp_cset_link init_cgrp_cset_link;
359 : :
360 : : /*
361 : : * css_set_lock protects the list of css_set objects, and the chain of
362 : : * tasks off each css_set. Nests outside task->alloc_lock due to
363 : : * css_task_iter_start().
364 : : */
365 : : static DEFINE_RWLOCK(css_set_lock);
366 : : static int css_set_count;
367 : :
368 : : /*
 369 : : * hash table for cgroup groups. This improves the performance of finding
370 : : * an existing css_set. This hash doesn't (currently) take into
371 : : * account cgroups in empty hierarchies.
372 : : */
373 : : #define CSS_SET_HASH_BITS 7
374 : : static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
375 : :
376 : : static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
377 : : {
378 : : unsigned long key = 0UL;
379 : : struct cgroup_subsys *ss;
380 : : int i;
381 : :
382 : : for_each_subsys(ss, i)
383 : : key += (unsigned long)css[i];
384 : : key = (key >> 16) ^ key;
385 : :
386 : : return key;
387 : : }
388 : :
389 : : /*
390 : : * We don't maintain the lists running through each css_set to its task
391 : : * until after the first call to css_task_iter_start(). This reduces the
392 : : * fork()/exit() overhead for people who have cgroups compiled into their
393 : : * kernel but not actually in use.
394 : : */
395 : : static int use_task_css_set_links __read_mostly;
396 : :
397 : 0 : static void __put_css_set(struct css_set *cset, int taskexit)
398 : : {
399 : : struct cgrp_cset_link *link, *tmp_link;
400 : :
401 : : /*
402 : : * Ensure that the refcount doesn't hit zero while any readers
403 : : * can see it. Similar to atomic_dec_and_lock(), but for an
404 : : * rwlock
405 : : */
406 [ - + ]: 1104198 : if (atomic_add_unless(&cset->refcount, -1, 1))
407 : : return;
408 : 0 : write_lock(&css_set_lock);
409 [ # # ]: 1104220 : if (!atomic_dec_and_test(&cset->refcount)) {
410 : : write_unlock(&css_set_lock);
411 : : return;
412 : : }
413 : :
414 : : /* This css_set is dead. unlink it and release cgroup refcounts */
415 : : hash_del(&cset->hlist);
416 : 0 : css_set_count--;
417 : :
418 [ # # ]: 0 : list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
419 : 0 : struct cgroup *cgrp = link->cgrp;
420 : :
421 : : list_del(&link->cset_link);
422 : : list_del(&link->cgrp_link);
423 : :
424 : : /* @cgrp can't go away while we're holding css_set_lock */
425 [ # # ][ # # ]: 0 : if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
426 [ # # ]: 0 : if (taskexit)
427 : 0 : set_bit(CGRP_RELEASABLE, &cgrp->flags);
428 : 0 : check_for_release(cgrp);
429 : : }
430 : :
431 : 0 : kfree(link);
432 : : }
433 : :
434 : : write_unlock(&css_set_lock);
435 : 0 : kfree_rcu(cset, rcu_head);
436 : : }
437 : :
438 : : /*
439 : : * refcounted get/put for css_set objects
440 : : */
441 : : static inline void get_css_set(struct css_set *cset)
442 : : {
443 : 1104223 : atomic_inc(&cset->refcount);
444 : : }
445 : :
446 : : static inline void put_css_set(struct css_set *cset)
447 : : {
448 : 0 : __put_css_set(cset, 0);
449 : : }
450 : :
451 : : static inline void put_css_set_taskexit(struct css_set *cset)
452 : : {
453 : 1104232 : __put_css_set(cset, 1);
454 : : }
455 : :
456 : : /**
457 : : * compare_css_sets - helper function for find_existing_css_set().
458 : : * @cset: candidate css_set being tested
459 : : * @old_cset: existing css_set for a task
460 : : * @new_cgrp: cgroup that's being entered by the task
461 : : * @template: desired set of css pointers in css_set (pre-calculated)
462 : : *
463 : : * Returns true if "cset" matches "old_cset" except for the hierarchy
464 : : * which "new_cgrp" belongs to, for which it should match "new_cgrp".
465 : : */
466 : 0 : static bool compare_css_sets(struct css_set *cset,
467 : : struct css_set *old_cset,
468 : : struct cgroup *new_cgrp,
469 : : struct cgroup_subsys_state *template[])
470 : : {
471 : : struct list_head *l1, *l2;
472 : :
473 : : if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
474 : : /* Not all subsystems matched */
475 : : return false;
476 : : }
477 : :
478 : : /*
479 : : * Compare cgroup pointers in order to distinguish between
 480 : : * different cgroups in hierarchies with no subsystems. We
481 : : * could get by with just this check alone (and skip the
482 : : * memcmp above) but on most setups the memcmp check will
483 : : * avoid the need for this more expensive check on almost all
484 : : * candidates.
485 : : */
486 : :
487 : 0 : l1 = &cset->cgrp_links;
488 : 0 : l2 = &old_cset->cgrp_links;
489 : : while (1) {
490 : : struct cgrp_cset_link *link1, *link2;
491 : : struct cgroup *cgrp1, *cgrp2;
492 : :
493 : 0 : l1 = l1->next;
494 : 0 : l2 = l2->next;
495 : : /* See if we reached the end - both lists are equal length. */
496 [ # # ]: 0 : if (l1 == &cset->cgrp_links) {
497 [ # # ]: 0 : BUG_ON(l2 != &old_cset->cgrp_links);
498 : : break;
499 : : } else {
500 [ # # ]: 0 : BUG_ON(l2 == &old_cset->cgrp_links);
501 : : }
502 : : /* Locate the cgroups associated with these links. */
503 : : link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
504 : : link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
505 : 0 : cgrp1 = link1->cgrp;
506 : 0 : cgrp2 = link2->cgrp;
507 : : /* Hierarchies should be linked in the same order. */
508 [ # # ]: 0 : BUG_ON(cgrp1->root != cgrp2->root);
509 : :
510 : : /*
511 : : * If this hierarchy is the hierarchy of the cgroup
512 : : * that's changing, then we need to check that this
513 : : * css_set points to the new cgroup; if it's any other
514 : : * hierarchy, then this css_set should point to the
515 : : * same cgroup as the old css_set.
516 : : */
517 [ # # ]: 0 : if (cgrp1->root == new_cgrp->root) {
518 [ # # ]: 0 : if (cgrp1 != new_cgrp)
519 : : return false;
520 : : } else {
521 [ # # ]: 0 : if (cgrp1 != cgrp2)
522 : : return false;
523 : : }
524 : : }
525 : : return true;
526 : : }
527 : :
528 : : /**
529 : : * find_existing_css_set - init css array and find the matching css_set
530 : : * @old_cset: the css_set that we're using before the cgroup transition
531 : : * @cgrp: the cgroup that we're moving into
532 : : * @template: out param for the new set of csses, should be clear on entry
533 : : */
534 : 0 : static struct css_set *find_existing_css_set(struct css_set *old_cset,
535 : : struct cgroup *cgrp,
536 : : struct cgroup_subsys_state *template[])
537 : : {
538 : : struct cgroupfs_root *root = cgrp->root;
539 : : struct cgroup_subsys *ss;
540 : : struct css_set *cset;
541 : : unsigned long key;
542 : : int i;
543 : :
544 : : /*
545 : : * Build the set of subsystem state objects that we want to see in the
 546 : : * new css_set. While subsystems can change globally, the entries here
547 : : * won't change, so no need for locking.
548 : : */
549 : : for_each_subsys(ss, i) {
550 : : if (root->subsys_mask & (1UL << i)) {
551 : : /* Subsystem is in this hierarchy. So we want
552 : : * the subsystem state from the new
553 : : * cgroup */
554 : : template[i] = cgroup_css(cgrp, ss);
555 : : } else {
556 : : /* Subsystem is not in this hierarchy, so we
557 : : * don't want to change the subsystem state */
558 : : template[i] = old_cset->subsys[i];
559 : : }
560 : : }
561 : :
562 : : key = css_set_hash(template);
563 [ # # ][ # # ]: 0 : hash_for_each_possible(css_set_table, cset, hlist, key) {
[ # # ]
564 [ # # ]: 0 : if (!compare_css_sets(cset, old_cset, cgrp, template))
565 : 0 : continue;
566 : :
567 : : /* This css_set matches what we need */
568 : : return cset;
569 : : }
570 : :
571 : : /* No existing cgroup group matched */
572 : : return NULL;
573 : : }
574 : :
575 : 0 : static void free_cgrp_cset_links(struct list_head *links_to_free)
576 : : {
577 : : struct cgrp_cset_link *link, *tmp_link;
578 : :
579 [ - + ]: 3 : list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
580 : : list_del(&link->cset_link);
581 : 0 : kfree(link);
582 : : }
583 : 3 : }
584 : :
585 : : /**
586 : : * allocate_cgrp_cset_links - allocate cgrp_cset_links
587 : : * @count: the number of links to allocate
588 : : * @tmp_links: list_head the allocated links are put on
589 : : *
590 : : * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
591 : : * through ->cset_link. Returns 0 on success or -errno.
592 : : */
593 : 0 : static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
594 : : {
595 : : struct cgrp_cset_link *link;
596 : : int i;
597 : :
598 : : INIT_LIST_HEAD(tmp_links);
599 : :
600 [ + + ]: 6 : for (i = 0; i < count; i++) {
601 : : link = kzalloc(sizeof(*link), GFP_KERNEL);
602 [ - + ]: 3 : if (!link) {
603 : 0 : free_cgrp_cset_links(tmp_links);
604 : 0 : return -ENOMEM;
605 : : }
606 : 3 : list_add(&link->cset_link, tmp_links);
607 : : }
608 : : return 0;
609 : : }
610 : :
611 : : /**
612 : : * link_css_set - a helper function to link a css_set to a cgroup
613 : : * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
614 : : * @cset: the css_set to be linked
615 : : * @cgrp: the destination cgroup
616 : : */
617 : 0 : static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
618 : : struct cgroup *cgrp)
619 : : {
620 : : struct cgrp_cset_link *link;
621 : :
622 [ - + ]: 3 : BUG_ON(list_empty(tmp_links));
623 : : link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
624 : 3 : link->cset = cset;
625 : 3 : link->cgrp = cgrp;
626 : 3 : list_move(&link->cset_link, &cgrp->cset_links);
627 : : /*
628 : : * Always add links to the tail of the list so that the list
629 : : * is sorted by order of hierarchy creation
630 : : */
631 : 3 : list_add_tail(&link->cgrp_link, &cset->cgrp_links);
632 : 3 : }
633 : :
634 : : /**
635 : : * find_css_set - return a new css_set with one cgroup updated
636 : : * @old_cset: the baseline css_set
637 : : * @cgrp: the cgroup to be updated
638 : : *
639 : : * Return a new css_set that's equivalent to @old_cset, but with @cgrp
640 : : * substituted into the appropriate hierarchy.
641 : : */
642 : 0 : static struct css_set *find_css_set(struct css_set *old_cset,
643 : : struct cgroup *cgrp)
644 : : {
645 : : struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
646 : : struct css_set *cset;
647 : : struct list_head tmp_links;
648 : : struct cgrp_cset_link *link;
649 : : unsigned long key;
650 : :
651 : : lockdep_assert_held(&cgroup_mutex);
652 : :
653 : : /* First see if we already have a cgroup group that matches
654 : : * the desired set */
655 : 0 : read_lock(&css_set_lock);
656 : 0 : cset = find_existing_css_set(old_cset, cgrp, template);
657 [ # # ]: 0 : if (cset)
658 : : get_css_set(cset);
659 : : read_unlock(&css_set_lock);
660 : :
661 [ # # ]: 0 : if (cset)
662 : : return cset;
663 : :
664 : : cset = kzalloc(sizeof(*cset), GFP_KERNEL);
665 [ # # ]: 0 : if (!cset)
666 : : return NULL;
667 : :
668 : : /* Allocate all the cgrp_cset_link objects that we'll need */
669 [ # # ]: 0 : if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
670 : 0 : kfree(cset);
671 : 0 : return NULL;
672 : : }
673 : :
674 : 0 : atomic_set(&cset->refcount, 1);
675 : 0 : INIT_LIST_HEAD(&cset->cgrp_links);
676 : 0 : INIT_LIST_HEAD(&cset->tasks);
677 : : INIT_HLIST_NODE(&cset->hlist);
678 : :
679 : : /* Copy the set of subsystem state objects generated in
680 : : * find_existing_css_set() */
681 : : memcpy(cset->subsys, template, sizeof(cset->subsys));
682 : :
683 : 0 : write_lock(&css_set_lock);
684 : : /* Add reference counts and links from the new css_set. */
685 [ # # ]: 0 : list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
686 : 0 : struct cgroup *c = link->cgrp;
687 : :
688 [ # # ]: 0 : if (c->root == cgrp->root)
689 : : c = cgrp;
690 : 0 : link_css_set(&tmp_links, cset, c);
691 : : }
692 : :
693 [ # # ]: 0 : BUG_ON(!list_empty(&tmp_links));
694 : :
695 : 0 : css_set_count++;
696 : :
697 : : /* Add this cgroup group to the hash table */
698 : : key = css_set_hash(cset->subsys);
699 : 0 : hash_add(css_set_table, &cset->hlist, key);
700 : :
701 : : write_unlock(&css_set_lock);
702 : :
703 : 0 : return cset;
704 : : }
705 : :
706 : : /*
707 : : * Return the cgroup for "task" from the given hierarchy. Must be
708 : : * called with cgroup_mutex held.
709 : : */
710 : 0 : static struct cgroup *task_cgroup_from_root(struct task_struct *task,
711 : : struct cgroupfs_root *root)
712 : : {
713 : : struct css_set *cset;
714 : : struct cgroup *res = NULL;
715 : :
716 [ # # ]: 0 : BUG_ON(!mutex_is_locked(&cgroup_mutex));
717 : 0 : read_lock(&css_set_lock);
718 : : /*
719 : : * No need to lock the task - since we hold cgroup_mutex the
720 : : * task can't change groups, so the only thing that can happen
721 : : * is that it exits and its css is set back to init_css_set.
722 : : */
723 : : cset = task_css_set(task);
724 [ # # ]: 0 : if (cset == &init_css_set) {
725 : 0 : res = &root->top_cgroup;
726 : : } else {
727 : : struct cgrp_cset_link *link;
728 : :
729 [ # # ]: 0 : list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
730 : 0 : struct cgroup *c = link->cgrp;
731 : :
732 [ # # ]: 0 : if (c->root == root) {
733 : : res = c;
734 : : break;
735 : : }
736 : : }
737 : : }
738 : : read_unlock(&css_set_lock);
739 [ # # ]: 0 : BUG_ON(!res);
740 : 0 : return res;
741 : : }
742 : :
743 : : /*
744 : : * There is one global cgroup mutex. We also require taking
745 : : * task_lock() when dereferencing a task's cgroup subsys pointers.
746 : : * See "The task_lock() exception", at the end of this comment.
747 : : *
748 : : * A task must hold cgroup_mutex to modify cgroups.
749 : : *
750 : : * Any task can increment and decrement the count field without lock.
751 : : * So in general, code holding cgroup_mutex can't rely on the count
752 : : * field not changing. However, if the count goes to zero, then only
753 : : * cgroup_attach_task() can increment it again. Because a count of zero
754 : : * means that no tasks are currently attached, therefore there is no
755 : : * way a task attached to that cgroup can fork (the other way to
756 : : * increment the count). So code holding cgroup_mutex can safely
757 : : * assume that if the count is zero, it will stay zero. Similarly, if
758 : : * a task holds cgroup_mutex on a cgroup with zero count, it
759 : : * knows that the cgroup won't be removed, as cgroup_rmdir()
760 : : * needs that mutex.
761 : : *
762 : : * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
763 : : * (usually) take cgroup_mutex. These are the two most performance
764 : : * critical pieces of code here. The exception occurs on cgroup_exit(),
765 : : * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
766 : : * is taken, and if the cgroup count is zero, a usermode call made
767 : : * to the release agent with the name of the cgroup (path relative to
768 : : * the root of cgroup file system) as the argument.
769 : : *
770 : : * A cgroup can only be deleted if both its 'count' of using tasks
771 : : * is zero, and its list of 'children' cgroups is empty. Since all
772 : : * tasks in the system use _some_ cgroup, and since there is always at
773 : : * least one task in the system (init, pid == 1), therefore, top_cgroup
774 : : * always has either children cgroups and/or using tasks. So we don't
775 : : * need a special hack to ensure that top_cgroup cannot be deleted.
776 : : *
777 : : * The task_lock() exception
778 : : *
779 : : * The need for this exception arises from the action of
780 : : * cgroup_attach_task(), which overwrites one task's cgroup pointer with
781 : : * another. It does so using cgroup_mutex, however there are
782 : : * several performance critical places that need to reference
783 : : * task->cgroup without the expense of grabbing a system global
784 : : * mutex. Therefore except as noted below, when dereferencing or, as
785 : : * in cgroup_attach_task(), modifying a task's cgroup pointer we use
786 : : * task_lock(), which acts on a spinlock (task->alloc_lock) already in
787 : : * the task_struct routinely used for such matters.
788 : : *
789 : : * P.S. One more locking exception. RCU is used to guard the
 790 : : * update of a task's cgroup pointer by cgroup_attach_task()
791 : : */
792 : :
793 : : /*
794 : : * A couple of forward declarations required, due to cyclic reference loop:
795 : : * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
796 : : * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
797 : : * -> cgroup_mkdir.
798 : : */
799 : :
800 : : static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
801 : : static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
802 : : static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
803 : : static const struct inode_operations cgroup_dir_inode_operations;
804 : : static const struct file_operations proc_cgroupstats_operations;
805 : :
806 : : static struct backing_dev_info cgroup_backing_dev_info = {
807 : : .name = "cgroup",
808 : : .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
809 : : };
810 : :
811 : 0 : static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
812 : : {
813 : 31 : struct inode *inode = new_inode(sb);
814 : :
815 [ + - ]: 31 : if (inode) {
816 : 31 : inode->i_ino = get_next_ino();
817 : 31 : inode->i_mode = mode;
818 : 31 : inode->i_uid = current_fsuid();
819 : 31 : inode->i_gid = current_fsgid();
820 : 31 : inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
821 : 31 : inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
822 : : }
823 : 0 : return inode;
824 : : }
825 : :
826 : 0 : static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
827 : : {
828 : : struct cgroup_name *name;
829 : :
830 : 2 : name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
831 [ + - ]: 2 : if (!name)
832 : : return NULL;
833 : 2 : strcpy(name->name, dentry->d_name.name);
834 : : return name;
835 : : }
836 : :
837 : 0 : static void cgroup_free_fn(struct work_struct *work)
838 : : {
839 : 2 : struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
840 : :
841 : 2 : mutex_lock(&cgroup_mutex);
842 : 2 : cgrp->root->number_of_cgroups--;
843 : 2 : mutex_unlock(&cgroup_mutex);
844 : :
845 : : /*
846 : : * We get a ref to the parent's dentry, and put the ref when
847 : : * this cgroup is being freed, so it's guaranteed that the
848 : : * parent won't be destroyed before its children.
849 : : */
850 : 2 : dput(cgrp->parent->dentry);
851 : :
852 : : /*
853 : : * Drop the active superblock reference that we took when we
854 : : * created the cgroup. This will free cgrp->root, if we are
855 : : * holding the last reference to @sb.
856 : : */
857 : 2 : deactivate_super(cgrp->root->sb);
858 : :
859 : 2 : cgroup_pidlist_destroy_all(cgrp);
860 : :
861 : 2 : simple_xattrs_free(&cgrp->xattrs);
862 : :
863 : 2 : kfree(rcu_dereference_raw(cgrp->name));
864 : 2 : kfree(cgrp);
865 : 2 : }
866 : :
867 : 0 : static void cgroup_free_rcu(struct rcu_head *head)
868 : : {
869 : : struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
870 : :
871 : 4 : INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
872 : 2 : queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
873 : 2 : }
874 : :
875 : 0 : static void cgroup_diput(struct dentry *dentry, struct inode *inode)
876 : : {
877 : : /* is dentry a directory ? if so, kfree() associated cgroup */
878 [ + + ]: 28 : if (S_ISDIR(inode->i_mode)) {
879 : 2 : struct cgroup *cgrp = dentry->d_fsdata;
880 : :
881 [ - + ]: 2 : BUG_ON(!(cgroup_is_dead(cgrp)));
882 : :
883 : : /*
884 : : * XXX: cgrp->id is only used to look up css's. As cgroup
885 : : * and css's lifetimes will be decoupled, it should be made
886 : : * per-subsystem and moved to css->id so that lookups are
887 : : * successful until the target css is released.
888 : : */
889 : 2 : mutex_lock(&cgroup_mutex);
890 : 2 : idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 : 2 : mutex_unlock(&cgroup_mutex);
892 : 2 : cgrp->id = -1;
893 : :
894 : 2 : call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
895 : : } else {
896 : : struct cfent *cfe = __d_cfe(dentry);
897 : 26 : struct cgroup *cgrp = dentry->d_parent->d_fsdata;
898 : :
899 [ + + ][ + - ]: 26 : WARN_ONCE(!list_empty(&cfe->node) &&
[ - + ][ # # ]
[ - - ]
900 : : cgrp != &cgrp->root->top_cgroup,
901 : : "cfe still linked for %s\n", cfe->type->name);
902 : 26 : simple_xattrs_free(&cfe->xattrs);
903 : 26 : kfree(cfe);
904 : : }
905 : 28 : iput(inode);
906 : 28 : }
907 : :
908 : 0 : static void remove_dir(struct dentry *d)
909 : : {
910 : 2 : struct dentry *parent = dget(d->d_parent);
911 : :
912 : 2 : d_delete(d);
913 : 2 : simple_rmdir(parent->d_inode, d);
914 : 2 : dput(parent);
915 : 2 : }
916 : :
917 : 0 : static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
918 : : {
919 : : struct cfent *cfe;
920 : :
921 : : lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
922 : : lockdep_assert_held(&cgroup_mutex);
923 : :
924 : : /*
925 : : * If we're doing cleanup due to failure of cgroup_create(),
926 : : * the corresponding @cfe may not exist.
927 : : */
928 [ + - ]: 8 : list_for_each_entry(cfe, &cgrp->files, node) {
929 : 8 : struct dentry *d = cfe->dentry;
930 : :
931 [ + - ][ - + ]: 8 : if (cft && cfe->type != cft)
932 : 0 : continue;
933 : :
934 : : dget(d);
935 : 8 : d_delete(d);
936 : 8 : simple_unlink(cgrp->dentry->d_inode, d);
937 : : list_del_init(&cfe->node);
938 : 8 : dput(d);
939 : :
940 : 8 : break;
941 : : }
942 : 8 : }
943 : :
944 : : /**
945 : : * cgroup_clear_dir - remove subsys files in a cgroup directory
946 : : * @cgrp: target cgroup
947 : : * @subsys_mask: mask of the subsystem ids whose files should be removed
948 : : */
949 : : static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
950 : : {
951 : : struct cgroup_subsys *ss;
952 : : int i;
953 : :
954 : : for_each_subsys(ss, i) {
955 : : struct cftype_set *set;
956 : :
957 : : if (!test_bit(i, &subsys_mask))
958 : : continue;
959 : : list_for_each_entry(set, &ss->cftsets, node)
960 : : cgroup_addrm_files(cgrp, set->cfts, false);
961 : : }
962 : : }
963 : :
964 : : /*
965 : : * NOTE : the dentry must have been dget()'ed
966 : : */
967 : 0 : static void cgroup_d_remove_dir(struct dentry *dentry)
968 : : {
969 : : struct dentry *parent;
970 : :
971 : 2 : parent = dentry->d_parent;
972 : : spin_lock(&parent->d_lock);
973 : 2 : spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
974 : 2 : list_del_init(&dentry->d_u.d_child);
975 : : spin_unlock(&dentry->d_lock);
976 : : spin_unlock(&parent->d_lock);
977 : 2 : remove_dir(dentry);
978 : 2 : }
979 : :
980 : : /*
981 : : * Call with cgroup_mutex held. Drops reference counts on modules, including
982 : : * any duplicate ones that parse_cgroupfs_options took. If this function
983 : : * returns an error, no reference counts are touched.
984 : : */
985 : 0 : static int rebind_subsystems(struct cgroupfs_root *root,
986 : : unsigned long added_mask, unsigned removed_mask)
987 : : {
988 : : struct cgroup *cgrp = &root->top_cgroup;
989 : : struct cgroup_subsys *ss;
990 : : unsigned long pinned = 0;
991 : : int i, ret;
992 : :
993 [ - + ]: 6 : BUG_ON(!mutex_is_locked(&cgroup_mutex));
994 [ - + ]: 6 : BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
995 : :
996 : : /* Check that any added subsystems are currently free */
997 : : for_each_subsys(ss, i) {
998 : : if (!(added_mask & (1 << i)))
999 : : continue;
1000 : :
1001 : : /* is the subsystem mounted elsewhere? */
1002 : : if (ss->root != &cgroup_dummy_root) {
1003 : : ret = -EBUSY;
1004 : : goto out_put;
1005 : : }
1006 : :
1007 : : /* pin the module */
1008 : : if (!try_module_get(ss->module)) {
1009 : : ret = -ENOENT;
1010 : : goto out_put;
1011 : : }
1012 : : pinned |= 1 << i;
1013 : : }
1014 : :
1015 : : /* subsys could be missing if unloaded between parsing and here */
1016 [ + - ]: 6 : if (added_mask != pinned) {
1017 : : ret = -ENOENT;
1018 : : goto out_put;
1019 : : }
1020 : :
1021 : : ret = cgroup_populate_dir(cgrp, added_mask);
1022 : : if (ret)
1023 : : goto out_put;
1024 : :
1025 : : /*
1026 : : * Nothing can fail from this point on. Remove files for the
1027 : : * removed subsystems and rebind each subsystem.
1028 : : */
1029 : : cgroup_clear_dir(cgrp, removed_mask);
1030 : :
1031 : : for_each_subsys(ss, i) {
1032 : : unsigned long bit = 1UL << i;
1033 : :
1034 : : if (bit & added_mask) {
1035 : : /* We're binding this subsystem to this hierarchy */
1036 : : BUG_ON(cgroup_css(cgrp, ss));
1037 : : BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1038 : : BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1039 : :
1040 : : rcu_assign_pointer(cgrp->subsys[i],
1041 : : cgroup_css(cgroup_dummy_top, ss));
1042 : : cgroup_css(cgrp, ss)->cgroup = cgrp;
1043 : :
1044 : : ss->root = root;
1045 : : if (ss->bind)
1046 : : ss->bind(cgroup_css(cgrp, ss));
1047 : :
1048 : : /* refcount was already taken, and we're keeping it */
1049 : : root->subsys_mask |= bit;
1050 : : } else if (bit & removed_mask) {
1051 : : /* We're removing this subsystem */
1052 : : BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1053 : : BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1054 : :
1055 : : if (ss->bind)
1056 : : ss->bind(cgroup_css(cgroup_dummy_top, ss));
1057 : :
1058 : : cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1059 : : RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1060 : :
1061 : : cgroup_subsys[i]->root = &cgroup_dummy_root;
1062 : :
1063 : : /* subsystem is now free - drop reference on module */
1064 : : module_put(ss->module);
1065 : : root->subsys_mask &= ~bit;
1066 : : }
1067 : : }
1068 : :
1069 : : /*
1070 : : * Mark @root has finished binding subsystems. @root->subsys_mask
1071 : : * now matches the bound subsystems.
1072 : : */
1073 : 6 : root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1074 : :
1075 : : return 0;
1076 : :
1077 : : out_put:
1078 : : for_each_subsys(ss, i)
1079 : : if (pinned & (1 << i))
1080 : : module_put(ss->module);
1081 : : return ret;
1082 : : }
1083 : :
1084 : 0 : static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1085 : : {
1086 : 0 : struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1087 : : struct cgroup_subsys *ss;
1088 : : int ssid;
1089 : :
1090 : 0 : mutex_lock(&cgroup_root_mutex);
1091 : : for_each_subsys(ss, ssid)
1092 : : if (root->subsys_mask & (1 << ssid))
1093 : : seq_printf(seq, ",%s", ss->name);
1094 [ # # ]: 0 : if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1095 : 0 : seq_puts(seq, ",sane_behavior");
1096 [ # # ]: 0 : if (root->flags & CGRP_ROOT_NOPREFIX)
1097 : 0 : seq_puts(seq, ",noprefix");
1098 [ # # ]: 0 : if (root->flags & CGRP_ROOT_XATTR)
1099 : 0 : seq_puts(seq, ",xattr");
1100 [ # # ]: 0 : if (strlen(root->release_agent_path))
1101 : 0 : seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1102 [ # # ]: 0 : if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1103 : 0 : seq_puts(seq, ",clone_children");
1104 [ # # ]: 0 : if (strlen(root->name))
1105 : 0 : seq_printf(seq, ",name=%s", root->name);
1106 : 0 : mutex_unlock(&cgroup_root_mutex);
1107 : 0 : return 0;
1108 : : }
1109 : :
1110 : : struct cgroup_sb_opts {
1111 : : unsigned long subsys_mask;
1112 : : unsigned long flags;
1113 : : char *release_agent;
1114 : : bool cpuset_clone_children;
1115 : : char *name;
1116 : : /* User explicitly requested empty subsystem */
1117 : : bool none;
1118 : :
1119 : : struct cgroupfs_root *new_root;
1120 : :
1121 : : };
1122 : :
1123 : : /*
1124 : : * Convert a hierarchy specifier into a bitmask of subsystems and
1125 : : * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1126 : : * array. This function takes refcounts on subsystems to be used, unless it
1127 : : * returns error, in which case no refcounts are taken.
1128 : : */
1129 : 0 : static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1130 : : {
1131 : 2983 : char *token, *o = data;
1132 : : bool all_ss = false, one_ss = false;
1133 : : unsigned long mask = (unsigned long)-1;
1134 : : struct cgroup_subsys *ss;
1135 : : int i;
1136 : :
1137 [ - + ]: 2983 : BUG_ON(!mutex_is_locked(&cgroup_mutex));
1138 : :
1139 : : #ifdef CONFIG_CPUSETS
1140 : : mask = ~(1UL << cpuset_subsys_id);
1141 : : #endif
1142 : :
1143 : 2983 : memset(opts, 0, sizeof(*opts));
1144 : :
1145 [ + + ]: 2989 : while ((token = strsep(&o, ",")) != NULL) {
1146 [ + ]: 6 : if (!*token)
1147 : : return -EINVAL;
1148 [ + + ]: 2989 : if (!strcmp(token, "none")) {
1149 : : /* Explicitly have no subsystems */
1150 : 3 : opts->none = true;
1151 : 3 : continue;
1152 : : }
1153 [ - + ]: 2986 : if (!strcmp(token, "all")) {
1154 : : /* Mutually exclusive option 'all' + subsystem name */
1155 : : if (one_ss)
1156 : : return -EINVAL;
1157 : : all_ss = true;
1158 : 0 : continue;
1159 : : }
1160 [ - + ]: 3 : if (!strcmp(token, "__DEVEL__sane_behavior")) {
1161 : 0 : opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1162 : 0 : continue;
1163 : : }
1164 [ - + ]: 3 : if (!strcmp(token, "noprefix")) {
1165 : 0 : opts->flags |= CGRP_ROOT_NOPREFIX;
1166 : 0 : continue;
1167 : : }
1168 [ - + ]: 3 : if (!strcmp(token, "clone_children")) {
1169 : 0 : opts->cpuset_clone_children = true;
1170 : 0 : continue;
1171 : : }
1172 [ - + ]: 3 : if (!strcmp(token, "xattr")) {
1173 : 0 : opts->flags |= CGRP_ROOT_XATTR;
1174 : 0 : continue;
1175 : : }
1176 [ - + ]: 3 : if (!strncmp(token, "release_agent=", 14)) {
1177 : : /* Specifying two release agents is forbidden */
1178 [ # # ]: 0 : if (opts->release_agent)
1179 : : return -EINVAL;
1180 : 0 : opts->release_agent =
1181 : 0 : kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1182 [ # # ]: 0 : if (!opts->release_agent)
1183 : : return -ENOMEM;
1184 : 0 : continue;
1185 : : }
1186 [ + - ]: 3 : if (!strncmp(token, "name=", 5)) {
1187 : 3 : const char *name = token + 5;
1188 : : /* Can't specify an empty name */
1189 [ + - ]: 3 : if (!strlen(name))
1190 : : return -EINVAL;
1191 : : /* Must match [\w.-]+ */
1192 [ + + ]: 12 : for (i = 0; i < strlen(name); i++) {
1193 : 9 : char c = name[i];
1194 [ + - ]: 9 : if (isalnum(c))
1195 : 9 : continue;
1196 [ # # ]: 0 : if ((c == '.') || (c == '-') || (c == '_'))
1197 : 0 : continue;
1198 : : return -EINVAL;
1199 : : }
1200 : : /* Specifying two names is forbidden */
1201 [ + - ]: 3 : if (opts->name)
1202 : : return -EINVAL;
1203 : 3 : opts->name = kstrndup(name,
1204 : : MAX_CGROUP_ROOT_NAMELEN - 1,
1205 : : GFP_KERNEL);
1206 [ + - ]: 3 : if (!opts->name)
1207 : : return -ENOMEM;
1208 : :
1209 : 6 : continue;
1210 : : }
1211 : :
1212 : : for_each_subsys(ss, i) {
1213 : : if (strcmp(token, ss->name))
1214 : : continue;
1215 : : if (ss->disabled)
1216 : : continue;
1217 : :
1218 : : /* Mutually exclusive option 'all' + subsystem name */
1219 : : if (all_ss)
1220 : : return -EINVAL;
1221 : : set_bit(i, &opts->subsys_mask);
1222 : : one_ss = true;
1223 : :
1224 : : break;
1225 : : }
1226 : : if (i == CGROUP_SUBSYS_COUNT)
1227 : : return -ENOENT;
1228 : : }
1229 : :
1230 : : /*
1231 : : * If the 'all' option was specified select all the subsystems,
1232 : : * otherwise if 'none', 'name=' and a subsystem name options
1233 : : * were not specified, let's default to 'all'
1234 : : */
1235 : : if (all_ss || (!one_ss && !opts->none && !opts->name))
1236 : : for_each_subsys(ss, i)
1237 : : if (!ss->disabled)
1238 : : set_bit(i, &opts->subsys_mask);
1239 : :
1240 : : /* Consistency checks */
1241 : :
1242 [ - + ]: 2983 : if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1243 : 0 : pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1244 : :
1245 [ # # ]: 0 : if (opts->flags & CGRP_ROOT_NOPREFIX) {
1246 : 0 : pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1247 : 0 : return -EINVAL;
1248 : : }
1249 : :
1250 [ # # ]: 0 : if (opts->cpuset_clone_children) {
1251 : 0 : pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1252 : 0 : return -EINVAL;
1253 : : }
1254 : : }
1255 : :
1256 : : /*
1257 : : * Option noprefix was introduced just for backward compatibility
1258 : : * with the old cpuset, so we allow noprefix only if mounting just
1259 : : * the cpuset subsystem.
1260 : : */
1261 [ - + ][ # # ]: 2983 : if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1262 : : return -EINVAL;
1263 : :
1264 : :
1265 : : /* Can't specify "none" and some subsystems */
1266 [ - + ][ # # ]: 2983 : if (opts->subsys_mask && opts->none)
1267 : : return -EINVAL;
1268 : :
1269 : : /*
1270 : : * We either have to specify by name or by subsystems. (So all
1271 : : * empty hierarchies must have a name).
1272 : : */
1273 [ + - ][ + + ]: 2983 : if (!opts->subsys_mask && !opts->name)
1274 : : return -EINVAL;
1275 : :
1276 : 3 : return 0;
1277 : : }
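/*
 * Editor's illustrative examples, not part of the original file. Assuming
 * CONFIG_CPUSETS and that the named controllers are built in and not
 * disabled, typical mount option strings parse as follows:
 *
 *   "cpuset,memory"          -> cpuset and memory bits set in subsys_mask
 *   "none,name=systemd"      -> none = true, name = "systemd", no subsystems
 *   "release_agent=/sbin/cgroup_clean,cpuset"
 *                            -> release_agent set, only the cpuset bit set
 *   "noprefix,memory"        -> -EINVAL (noprefix is allowed only with cpuset)
 *   no options (NULL data)   -> every enabled subsystem, same as "all"
 */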
1278 : :
1279 : 0 : static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1280 : : {
1281 : : int ret = 0;
1282 : 0 : struct cgroupfs_root *root = sb->s_fs_info;
1283 : : struct cgroup *cgrp = &root->top_cgroup;
1284 : : struct cgroup_sb_opts opts;
1285 : : unsigned long added_mask, removed_mask;
1286 : :
1287 [ # # ]: 0 : if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1288 : 0 : pr_err("cgroup: sane_behavior: remount is not allowed\n");
1289 : 0 : return -EINVAL;
1290 : : }
1291 : :
1292 : 0 : mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1293 : 0 : mutex_lock(&cgroup_mutex);
1294 : 0 : mutex_lock(&cgroup_root_mutex);
1295 : :
1296 : : /* See what subsystems are wanted */
1297 : 0 : ret = parse_cgroupfs_options(data, &opts);
1298 [ # # ]: 0 : if (ret)
1299 : : goto out_unlock;
1300 : :
1301 [ # # ][ # # ]: 0 : if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1302 : 0 : pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1303 : : task_tgid_nr(current), current->comm);
1304 : :
1305 : 0 : added_mask = opts.subsys_mask & ~root->subsys_mask;
1306 : : removed_mask = root->subsys_mask & ~opts.subsys_mask;
1307 : :
1308 : : /* Don't allow flags or name to change at remount */
1309 [ # # ][ # # ]: 0 : if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1310 [ # # ]: 0 : (opts.name && strcmp(opts.name, root->name))) {
1311 [ # # ]: 0 : pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1312 : : opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1313 : : root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1314 : : ret = -EINVAL;
1315 : 0 : goto out_unlock;
1316 : : }
1317 : :
1318 : : /* remounting is not allowed for populated hierarchies */
1319 [ # # ]: 0 : if (root->number_of_cgroups > 1) {
1320 : : ret = -EBUSY;
1321 : : goto out_unlock;
1322 : : }
1323 : :
1324 : 0 : ret = rebind_subsystems(root, added_mask, removed_mask);
1325 [ # # ]: 0 : if (ret)
1326 : : goto out_unlock;
1327 : :
1328 [ # # ]: 0 : if (opts.release_agent)
1329 : 0 : strcpy(root->release_agent_path, opts.release_agent);
1330 : : out_unlock:
1331 : 0 : kfree(opts.release_agent);
1332 : 0 : kfree(opts.name);
1333 : 0 : mutex_unlock(&cgroup_root_mutex);
1334 : 0 : mutex_unlock(&cgroup_mutex);
1335 : 0 : mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1336 : 0 : return ret;
1337 : : }
1338 : :
1339 : : static const struct super_operations cgroup_ops = {
1340 : : .statfs = simple_statfs,
1341 : : .drop_inode = generic_delete_inode,
1342 : : .show_options = cgroup_show_options,
1343 : : .remount_fs = cgroup_remount,
1344 : : };
1345 : :
1346 : 0 : static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347 : : {
1348 : 5 : INIT_LIST_HEAD(&cgrp->sibling);
1349 : 5 : INIT_LIST_HEAD(&cgrp->children);
1350 : 5 : INIT_LIST_HEAD(&cgrp->files);
1351 : 5 : INIT_LIST_HEAD(&cgrp->cset_links);
1352 : 5 : INIT_LIST_HEAD(&cgrp->release_list);
1353 : 5 : INIT_LIST_HEAD(&cgrp->pidlists);
1354 : 5 : mutex_init(&cgrp->pidlist_mutex);
1355 : 5 : cgrp->dummy_css.cgroup = cgrp;
1356 : : simple_xattrs_init(&cgrp->xattrs);
1357 : 5 : }
1358 : :
1359 : 0 : static void init_cgroup_root(struct cgroupfs_root *root)
1360 : : {
1361 : 3 : struct cgroup *cgrp = &root->top_cgroup;
1362 : :
1363 : 3 : INIT_LIST_HEAD(&root->root_list);
1364 : 3 : root->number_of_cgroups = 1;
1365 : 3 : cgrp->root = root;
1366 : 3 : RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 : 3 : init_cgroup_housekeeping(cgrp);
1368 : 3 : idr_init(&root->cgroup_idr);
1369 : 3 : }
1370 : :
1371 : : static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372 : : {
1373 : : int id;
1374 : :
1375 : : lockdep_assert_held(&cgroup_mutex);
1376 : : lockdep_assert_held(&cgroup_root_mutex);
1377 : :
1378 : 3 : id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 : : GFP_KERNEL);
1380 [ # # + - ]: 3 : if (id < 0)
1381 : : return id;
1382 : :
1383 : 3 : root->hierarchy_id = id;
1384 : : return 0;
1385 : : }
1386 : :
1387 : : static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388 : : {
1389 : : lockdep_assert_held(&cgroup_mutex);
1390 : : lockdep_assert_held(&cgroup_root_mutex);
1391 : :
1392 [ + - ][ # # ]: 3 : if (root->hierarchy_id) {
1393 : 3 : idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 : 3 : root->hierarchy_id = 0;
1395 : : }
1396 : : }
1397 : :
1398 : 0 : static int cgroup_test_super(struct super_block *sb, void *data)
1399 : : {
1400 : : struct cgroup_sb_opts *opts = data;
1401 : 0 : struct cgroupfs_root *root = sb->s_fs_info;
1402 : :
1403 : : /* If we asked for a name then it must match */
1404 [ # # ][ # # ]: 0 : if (opts->name && strcmp(opts->name, root->name))
1405 : : return 0;
1406 : :
1407 : : /*
1408 : : * If we asked for subsystems (or explicitly for no
1409 : : * subsystems) then they must match
1410 : : */
1411 [ # # ][ # # ]: 0 : if ((opts->subsys_mask || opts->none)
1412 [ # # ]: 0 : && (opts->subsys_mask != root->subsys_mask))
1413 : : return 0;
1414 : :
1415 : 0 : return 1;
1416 : : }
1417 : :
1418 : 0 : static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419 : : {
1420 : : struct cgroupfs_root *root;
1421 : :
1422 [ + - ][ + - ]: 3 : if (!opts->subsys_mask && !opts->none)
1423 : : return NULL;
1424 : :
1425 : : root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 [ + - ]: 3 : if (!root)
1427 : : return ERR_PTR(-ENOMEM);
1428 : :
1429 : 3 : init_cgroup_root(root);
1430 : :
1431 : : /*
1432 : : * We need to set @root->subsys_mask now so that @root can be
1433 : : * matched by cgroup_test_super() before it finishes
1434 : : * initialization; otherwise, competing mounts with the same
1435 : : * options may try to bind the same subsystems instead of waiting
 1436 : : * for the first one, leading to unexpected mount errors.
1437 : : * SUBSYS_BOUND will be set once actual binding is complete.
1438 : : */
1439 : 3 : root->subsys_mask = opts->subsys_mask;
1440 : 3 : root->flags = opts->flags;
1441 [ - + ]: 3 : if (opts->release_agent)
1442 : 0 : strcpy(root->release_agent_path, opts->release_agent);
1443 [ + - ]: 3 : if (opts->name)
1444 : 3 : strcpy(root->name, opts->name);
1445 [ # # ]: 3 : if (opts->cpuset_clone_children)
1446 : 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1447 : 3 : return root;
1448 : : }
1449 : :
1450 : 0 : static void cgroup_free_root(struct cgroupfs_root *root)
1451 : : {
1452 [ + - ]: 3 : if (root) {
 1453 : : /* hierarchy ID should already have been released */
1454 [ - + ][ # # ]: 3 : WARN_ON_ONCE(root->hierarchy_id);
[ # # ]
1455 : :
1456 : 3 : idr_destroy(&root->cgroup_idr);
1457 : 3 : kfree(root);
1458 : : }
1459 : 3 : }
1460 : :
1461 : 0 : static int cgroup_set_super(struct super_block *sb, void *data)
1462 : : {
1463 : : int ret;
1464 : : struct cgroup_sb_opts *opts = data;
1465 : :
1466 : : /* If we don't have a new root, we can't set up a new sb */
1467 [ + - ]: 3 : if (!opts->new_root)
1468 : : return -EINVAL;
1469 : :
1470 [ + - ][ - + ]: 3 : BUG_ON(!opts->subsys_mask && !opts->none);
1471 : :
1472 : 3 : ret = set_anon_super(sb, NULL);
1473 [ + - ]: 3 : if (ret)
1474 : : return ret;
1475 : :
1476 : 3 : sb->s_fs_info = opts->new_root;
1477 : 3 : opts->new_root->sb = sb;
1478 : :
1479 : 3 : sb->s_blocksize = PAGE_CACHE_SIZE;
1480 : 3 : sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1481 : 3 : sb->s_magic = CGROUP_SUPER_MAGIC;
1482 : 3 : sb->s_op = &cgroup_ops;
1483 : :
1484 : 3 : return 0;
1485 : : }
1486 : :
1487 : 0 : static int cgroup_get_rootdir(struct super_block *sb)
1488 : : {
1489 : : static const struct dentry_operations cgroup_dops = {
1490 : : .d_iput = cgroup_diput,
1491 : : .d_delete = always_delete_dentry,
1492 : : };
1493 : :
1494 : 3 : struct inode *inode =
1495 : : cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1496 : :
1497 [ + - ]: 3 : if (!inode)
1498 : : return -ENOMEM;
1499 : :
1500 : 3 : inode->i_fop = &simple_dir_operations;
1501 : 3 : inode->i_op = &cgroup_dir_inode_operations;
1502 : : /* directories start off with i_nlink == 2 (for "." entry) */
1503 : 3 : inc_nlink(inode);
1504 : 3 : sb->s_root = d_make_root(inode);
1505 [ + - ]: 3 : if (!sb->s_root)
1506 : : return -ENOMEM;
1507 : : /* for everything else we want ->d_op set */
1508 : 3 : sb->s_d_op = &cgroup_dops;
1509 : 3 : return 0;
1510 : : }
1511 : :
1512 : 0 : static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 : : int flags, const char *unused_dev_name,
1514 : : void *data)
1515 : : {
1516 : : struct cgroup_sb_opts opts;
1517 : : struct cgroupfs_root *root;
1518 : : int ret = 0;
1519 : : struct super_block *sb;
1520 : : struct cgroupfs_root *new_root;
1521 : : struct list_head tmp_links;
1522 : : struct inode *inode;
1523 : : const struct cred *cred;
1524 : :
1525 : : /* First find the desired set of subsystems */
1526 : 2983 : mutex_lock(&cgroup_mutex);
1527 : 2983 : ret = parse_cgroupfs_options(data, &opts);
1528 : 2983 : mutex_unlock(&cgroup_mutex);
1529 [ + + ]: 2983 : if (ret)
1530 : : goto out_err;
1531 : :
1532 : : /*
1533 : : * Allocate a new cgroup root. We may not need it if we're
1534 : : * reusing an existing hierarchy.
1535 : : */
1536 : 3 : new_root = cgroup_root_from_opts(&opts);
1537 [ - + ]: 3 : if (IS_ERR(new_root)) {
1538 : : ret = PTR_ERR(new_root);
1539 : 0 : goto out_err;
1540 : : }
1541 : 3 : opts.new_root = new_root;
1542 : :
1543 : : /* Locate an existing or new sb for this hierarchy */
1544 : 3 : sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1545 [ - + ]: 3 : if (IS_ERR(sb)) {
1546 : : ret = PTR_ERR(sb);
1547 : 0 : cgroup_free_root(opts.new_root);
1548 : 0 : goto out_err;
1549 : : }
1550 : :
1551 : 3 : root = sb->s_fs_info;
1552 [ - + ]: 3 : BUG_ON(!root);
1553 [ + - ]: 3 : if (root == opts.new_root) {
1554 : : /* We used the new root structure, so this is a new hierarchy */
1555 : 3 : struct cgroup *root_cgrp = &root->top_cgroup;
1556 : : struct cgroupfs_root *existing_root;
1557 : : int i;
1558 : : struct css_set *cset;
1559 : :
1560 [ - + ]: 3 : BUG_ON(sb->s_root != NULL);
1561 : :
1562 : 3 : ret = cgroup_get_rootdir(sb);
1563 [ + - ]: 3 : if (ret)
1564 : : goto drop_new_super;
1565 : 3 : inode = sb->s_root->d_inode;
1566 : :
1567 : 3 : mutex_lock(&inode->i_mutex);
1568 : 3 : mutex_lock(&cgroup_mutex);
1569 : 3 : mutex_lock(&cgroup_root_mutex);
1570 : :
1571 : 3 : ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 [ + - ]: 3 : if (ret < 0)
1573 : : goto unlock_drop;
1574 : 3 : root_cgrp->id = ret;
1575 : :
1576 : : /* Check for name clashes with existing mounts */
1577 : : ret = -EBUSY;
1578 [ + - ]: 3 : if (strlen(root->name))
1579 [ - + ]: 3 : for_each_active_root(existing_root)
1580 [ # # ]: 0 : if (!strcmp(existing_root->name, root->name))
1581 : : goto unlock_drop;
1582 : :
1583 : : /*
1584 : : * We're accessing css_set_count without locking
1585 : : * css_set_lock here, but that's OK - it can only be
1586 : : * increased by someone holding cgroup_lock, and
1587 : : * that's us. The worst that can happen is that we
1588 : : * have some link structures left over
1589 : : */
1590 : 3 : ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 [ + - ]: 3 : if (ret)
1592 : : goto unlock_drop;
1593 : :
1594 : : /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1595 : : ret = cgroup_init_root_id(root, 2, 0);
1596 [ + - ]: 3 : if (ret)
1597 : : goto unlock_drop;
1598 : :
1599 : 3 : sb->s_root->d_fsdata = root_cgrp;
1600 : 3 : root_cgrp->dentry = sb->s_root;
1601 : :
1602 : : /*
1603 : : * We're inside get_sb() and will call lookup_one_len() to
1604 : : * create the root files, which doesn't work if SELinux is
1605 : : * in use. The following cred dancing somehow works around
1606 : : * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 : : * populating new cgroupfs mount") for more details.
1608 : : */
1609 : 3 : cred = override_creds(&init_cred);
1610 : :
1611 : 3 : ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 [ + - ]: 3 : if (ret)
1613 : : goto rm_base_files;
1614 : :
1615 : 3 : ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 [ + - ]: 3 : if (ret)
1617 : : goto rm_base_files;
1618 : :
1619 : 3 : revert_creds(cred);
1620 : :
1621 : : /*
1622 : : * There must be no failure case after here, since rebinding
1623 : : * takes care of subsystems' refcounts, which are explicitly
1624 : : * dropped in the failure exit path.
1625 : : */
1626 : :
1627 : 3 : list_add(&root->root_list, &cgroup_roots);
1628 : 3 : cgroup_root_count++;
1629 : :
1630 : : /* Link the top cgroup in this hierarchy into all
1631 : : * the css_set objects */
1632 : 3 : write_lock(&css_set_lock);
1633 [ + + ][ - + ]: 3373 : hash_for_each(css_set_table, i, cset, hlist)
[ + + ][ + + ]
1634 : 3 : link_css_set(&tmp_links, cset, root_cgrp);
1635 : : write_unlock(&css_set_lock);
1636 : :
1637 : 3 : free_cgrp_cset_links(&tmp_links);
1638 : :
1639 [ - + ]: 3 : BUG_ON(!list_empty(&root_cgrp->children));
1640 [ - + ]: 3 : BUG_ON(root->number_of_cgroups != 1);
1641 : :
1642 : 3 : mutex_unlock(&cgroup_root_mutex);
1643 : 3 : mutex_unlock(&cgroup_mutex);
1644 : 3 : mutex_unlock(&inode->i_mutex);
1645 : : } else {
1646 : : /*
1647 : : * We re-used an existing hierarchy - the new root (if
1648 : : * any) is not needed
1649 : : */
1650 : 0 : cgroup_free_root(opts.new_root);
1651 : :
1652 [ # # ]: 0 : if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 [ # # ]: 0 : if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 : 0 : pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 : : ret = -EINVAL;
1656 : 0 : goto drop_new_super;
1657 : : } else {
1658 : 0 : pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 : : }
1660 : : }
1661 : : }
1662 : :
1663 : 3 : kfree(opts.release_agent);
1664 : 3 : kfree(opts.name);
1665 : 3 : return dget(sb->s_root);
1666 : :
1667 : : rm_base_files:
1668 : 0 : free_cgrp_cset_links(&tmp_links);
1669 : 0 : cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1670 : 0 : revert_creds(cred);
1671 : : unlock_drop:
1672 : : cgroup_exit_root_id(root);
1673 : 0 : mutex_unlock(&cgroup_root_mutex);
1674 : 0 : mutex_unlock(&cgroup_mutex);
1675 : 0 : mutex_unlock(&inode->i_mutex);
1676 : : drop_new_super:
1677 : 0 : deactivate_locked_super(sb);
1678 : : out_err:
1679 : 2980 : kfree(opts.release_agent);
1680 : 2980 : kfree(opts.name);
1681 : 2980 : return ERR_PTR(ret);
1682 : : }
1683 : :
1684 : 0 : static void cgroup_kill_sb(struct super_block *sb)
1685 : : {
1686 : 3 : struct cgroupfs_root *root = sb->s_fs_info;
1687 : : struct cgroup *cgrp = &root->top_cgroup;
1688 : : struct cgrp_cset_link *link, *tmp_link;
1689 : : int ret;
1690 : :
1691 [ - + ]: 3 : BUG_ON(!root);
1692 : :
1693 [ - + ]: 3 : BUG_ON(root->number_of_cgroups != 1);
1694 [ - + ]: 3 : BUG_ON(!list_empty(&cgrp->children));
1695 : :
1696 : 3 : mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1697 : 3 : mutex_lock(&cgroup_mutex);
1698 : 3 : mutex_lock(&cgroup_root_mutex);
1699 : :
1700 : : /* Rebind all subsystems back to the default hierarchy */
1701 [ + - ]: 3 : if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1702 : 3 : ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 : : /* Shouldn't be able to fail ... */
1704 [ - + ]: 3 : BUG_ON(ret);
1705 : : }
1706 : :
1707 : : /*
1708 : : * Release all the links from cset_links to this hierarchy's
1709 : : * root cgroup
1710 : : */
1711 : 3 : write_lock(&css_set_lock);
1712 : :
1713 [ + + ]: 6 : list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1714 : : list_del(&link->cset_link);
1715 : : list_del(&link->cgrp_link);
1716 : 3 : kfree(link);
1717 : : }
1718 : : write_unlock(&css_set_lock);
1719 : :
1720 [ + - ]: 3 : if (!list_empty(&root->root_list)) {
1721 : : list_del(&root->root_list);
1722 : 3 : cgroup_root_count--;
1723 : : }
1724 : :
1725 : : cgroup_exit_root_id(root);
1726 : :
1727 : 3 : mutex_unlock(&cgroup_root_mutex);
1728 : 3 : mutex_unlock(&cgroup_mutex);
1729 : 3 : mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1730 : :
1731 : 3 : simple_xattrs_free(&cgrp->xattrs);
1732 : :
1733 : 3 : kill_litter_super(sb);
1734 : 3 : cgroup_free_root(root);
1735 : 3 : }
1736 : :
1737 : : static struct file_system_type cgroup_fs_type = {
1738 : : .name = "cgroup",
1739 : : .mount = cgroup_mount,
1740 : : .kill_sb = cgroup_kill_sb,
1741 : : };
1742 : :
1743 : : static struct kobject *cgroup_kobj;
1744 : :
1745 : : /**
1746 : : * cgroup_path - generate the path of a cgroup
1747 : : * @cgrp: the cgroup in question
1748 : : * @buf: the buffer to write the path into
1749 : : * @buflen: the length of the buffer
1750 : : *
1751 : : * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 : : *
1753 : : * We can't generate cgroup path using dentry->d_name, as accessing
1754 : : * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 : : * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 : : * with some irq-safe spinlocks held.
1757 : : */
1758 : 0 : int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759 : : {
1760 : : int ret = -ENAMETOOLONG;
1761 : : char *start;
1762 : :
1763 [ # # ]: 0 : if (!cgrp->parent) {
1764 [ # # ]: 0 : if (strlcpy(buf, "/", buflen) >= buflen)
1765 : : return -ENAMETOOLONG;
1766 : 0 : return 0;
1767 : : }
1768 : :
1769 : 0 : start = buf + buflen - 1;
1770 : 0 : *start = '\0';
1771 : :
1772 : : rcu_read_lock();
1773 : : do {
1774 : 0 : const char *name = cgroup_name(cgrp);
1775 : : int len;
1776 : :
1777 : 0 : len = strlen(name);
1778 [ # # ]: 0 : if ((start -= len) < buf)
1779 : : goto out;
1780 : 0 : memcpy(start, name, len);
1781 : :
1782 [ # # ]: 0 : if (--start < buf)
1783 : : goto out;
1784 : 0 : *start = '/';
1785 : :
1786 : 0 : cgrp = cgrp->parent;
1787 [ # # ]: 0 : } while (cgrp->parent);
1788 : : ret = 0;
1789 : 0 : memmove(buf, start, buf + buflen - start);
1790 : : out:
1791 : : rcu_read_unlock();
1792 : 0 : return ret;
1793 : : }
1794 : : EXPORT_SYMBOL_GPL(cgroup_path);
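
/*
 * A minimal, illustrative sketch of calling cgroup_path() (the helper name
 * and the pr_info() reporting are assumptions, not part of this file).  The
 * path is assembled right-to-left inside the caller's buffer, so PATH_MAX
 * is the usual buffer size; e.g. a cgroup "child" under "parent" yields
 * "/parent/child".
 */
static void __maybe_unused example_print_cgroup_path(struct cgroup *cgrp)
{
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

	if (!buf)
		return;
	if (!cgroup_path(cgrp, buf, PATH_MAX))
		pr_info("cgroup path: %s\n", buf);
	kfree(buf);
}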
1795 : :
1796 : : /**
1797 : : * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 : : * @task: target task
1799 : : * @buf: the buffer to write the path into
1800 : : * @buflen: the length of the buffer
1801 : : *
1802 : : * Determine @task's cgroup on the first (the one with the lowest non-zero
1803 : : * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1804 : : * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 : : * cgroup controller callbacks.
1806 : : *
1807 : : * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
1808 : : */
1809 : 0 : int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810 : : {
1811 : : struct cgroupfs_root *root;
1812 : : struct cgroup *cgrp;
1813 : 0 : int hierarchy_id = 1, ret = 0;
1814 : :
1815 [ # # ]: 0 : if (buflen < 2)
1816 : : return -ENAMETOOLONG;
1817 : :
1818 : 0 : mutex_lock(&cgroup_mutex);
1819 : :
1820 : 0 : root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 : :
1822 [ # # ]: 0 : if (root) {
1823 : 0 : cgrp = task_cgroup_from_root(task, root);
1824 : 0 : ret = cgroup_path(cgrp, buf, buflen);
1825 : : } else {
1826 : : /* if no hierarchy exists, everyone is in "/" */
1827 : 0 : memcpy(buf, "/", 2);
1828 : : }
1829 : :
1830 : 0 : mutex_unlock(&cgroup_mutex);
1831 : 0 : return ret;
1832 : : }
1833 : : EXPORT_SYMBOL_GPL(task_cgroup_path);
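
/*
 * Hedged usage sketch for task_cgroup_path() (the caller name and buffer
 * size are assumptions).  Because the function takes cgroup_mutex, it may
 * only be called from sleepable context that holds no locks nesting inside
 * cgroup_mutex.
 */
static void __maybe_unused example_report_task_cgroup(struct task_struct *task)
{
	char buf[256];

	if (!task_cgroup_path(task, buf, sizeof(buf)))
		pr_info("%s[%d] lives in %s\n", task->comm, task->pid, buf);
}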
1834 : :
1835 : : /*
1836 : : * Control Group taskset
1837 : : */
1838 : : struct task_and_cgroup {
1839 : : struct task_struct *task;
1840 : : struct cgroup *cgrp;
1841 : : struct css_set *cset;
1842 : : };
1843 : :
1844 : : struct cgroup_taskset {
1845 : : struct task_and_cgroup single;
1846 : : struct flex_array *tc_array;
1847 : : int tc_array_len;
1848 : : int idx;
1849 : : struct cgroup *cur_cgrp;
1850 : : };
1851 : :
1852 : : /**
1853 : : * cgroup_taskset_first - reset taskset and return the first task
1854 : : * @tset: taskset of interest
1855 : : *
1856 : : * @tset iteration is initialized and the first task is returned.
1857 : : */
1858 : 0 : struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859 : : {
1860 [ # # ]: 0 : if (tset->tc_array) {
1861 : 0 : tset->idx = 0;
1862 : 0 : return cgroup_taskset_next(tset);
1863 : : } else {
1864 : 0 : tset->cur_cgrp = tset->single.cgrp;
1865 : 0 : return tset->single.task;
1866 : : }
1867 : : }
1868 : : EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 : :
1870 : : /**
1871 : : * cgroup_taskset_next - iterate to the next task in taskset
1872 : : * @tset: taskset of interest
1873 : : *
1874 : : * Return the next task in @tset. Iteration must have been initialized
1875 : : * with cgroup_taskset_first().
1876 : : */
1877 : 0 : struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878 : : {
1879 : : struct task_and_cgroup *tc;
1880 : :
1881 [ # # ][ # # ]: 0 : if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1882 : : return NULL;
1883 : :
1884 : 0 : tc = flex_array_get(tset->tc_array, tset->idx++);
1885 : 0 : tset->cur_cgrp = tc->cgrp;
1886 : 0 : return tc->task;
1887 : : }
1888 : : EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1889 : :
1890 : : /**
1891 : : * cgroup_taskset_cur_css - return the matching css for the current task
1892 : : * @tset: taskset of interest
1893 : : * @subsys_id: the ID of the target subsystem
1894 : : *
1895 : : * Return the css for the current (last returned) task of @tset for
1896 : : * subsystem specified by @subsys_id. This function must be preceded by
1897 : : * either cgroup_taskset_first() or cgroup_taskset_next().
1898 : : */
1899 : 0 : struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 : : int subsys_id)
1901 : : {
1902 : 0 : return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903 : : }
1904 : : EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 : :
1906 : : /**
1907 : : * cgroup_taskset_size - return the number of tasks in taskset
1908 : : * @tset: taskset of interest
1909 : : */
1910 : 0 : int cgroup_taskset_size(struct cgroup_taskset *tset)
1911 : : {
1912 [ # # ]: 0 : return tset->tc_array ? tset->tc_array_len : 1;
1913 : : }
1914 : : EXPORT_SYMBOL_GPL(cgroup_taskset_size);
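
/*
 * Sketch of how a controller's ->can_attach() callback might walk a taskset
 * with the iterators above.  The function name and the "no kernel threads"
 * policy are illustrative assumptions, not an existing controller.
 */
static int __maybe_unused example_can_attach(struct cgroup_subsys_state *css,
					     struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		/* e.g. refuse to pull kernel threads into this hierarchy */
		if (task->flags & PF_KTHREAD)
			return -EINVAL;
	}
	return 0;
}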
1915 : :
1916 : :
1917 : : /*
1918 : : * cgroup_task_migrate - move a task from one cgroup to another.
1919 : : *
1920 : : * Must be called with cgroup_mutex and threadgroup locked.
1921 : : */
1922 : 0 : static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 : : struct task_struct *tsk,
1924 : : struct css_set *new_cset)
1925 : : {
1926 : : struct css_set *old_cset;
1927 : :
1928 : : /*
1929 : : * We are synchronized through threadgroup_lock() against PF_EXITING
1930 : : * setting such that we can't race against cgroup_exit() changing the
1931 : : * css_set to init_css_set and dropping the old one.
1932 : : */
1933 [ # # ][ # # ]: 0 : WARN_ON_ONCE(tsk->flags & PF_EXITING);
[ # # ]
1934 : : old_cset = task_css_set(tsk);
1935 : :
1936 : : task_lock(tsk);
1937 : 0 : rcu_assign_pointer(tsk->cgroups, new_cset);
1938 : : task_unlock(tsk);
1939 : :
1940 : : /* Update the css_set linked lists if we're using them */
1941 : 0 : write_lock(&css_set_lock);
1942 [ # # ]: 0 : if (!list_empty(&tsk->cg_list))
1943 : 0 : list_move(&tsk->cg_list, &new_cset->tasks);
1944 : : write_unlock(&css_set_lock);
1945 : :
1946 : : /*
1947 : : * We just gained a reference on old_cset by taking it from the
1948 : : * task. As trading it for new_cset is protected by cgroup_mutex,
1949 : : * we're safe to drop it here; it will be freed under RCU.
1950 : : */
1951 : 0 : set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 : : put_css_set(old_cset);
1953 : 0 : }
1954 : :
1955 : : /**
1956 : : * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
1957 : : * @cgrp: the cgroup to attach to
1958 : : * @tsk: the task or the leader of the threadgroup to be attached
1959 : : * @threadgroup: attach the whole threadgroup?
1960 : : *
1961 : : * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1962 : : * task_lock of @tsk or each thread in the threadgroup individually in turn.
1963 : : */
1964 : 0 : static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1965 : : bool threadgroup)
1966 : : {
1967 : : int retval, i, group_size;
1968 : 0 : struct cgroupfs_root *root = cgrp->root;
1969 : : struct cgroup_subsys_state *css, *failed_css = NULL;
1970 : : /* threadgroup list cursor and array */
1971 : : struct task_struct *leader = tsk;
1972 : : struct task_and_cgroup *tc;
1973 : : struct flex_array *group;
1974 : : struct cgroup_taskset tset = { };
1975 : :
1976 : : /*
1977 : : * step 0: in order to do expensive, possibly blocking operations for
1978 : : * every thread, we cannot iterate the thread group list, since it needs
1979 : : * rcu or tasklist locked. instead, build an array of all threads in the
1980 : : * group - group_rwsem prevents new threads from appearing, and if
1981 : : * threads exit, this will just be an over-estimate.
1982 : : */
1983 [ # # ]: 0 : if (threadgroup)
1984 : : group_size = get_nr_threads(tsk);
1985 : : else
1986 : : group_size = 1;
1987 : : /* flex_array supports very large thread-groups better than kmalloc. */
1988 : 0 : group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
1989 [ # # ]: 0 : if (!group)
1990 : : return -ENOMEM;
1991 : : /* pre-allocate to guarantee space while iterating in rcu read-side. */
1992 : 0 : retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
1993 [ # # ]: 0 : if (retval)
1994 : : goto out_free_group_list;
1995 : :
1996 : : i = 0;
1997 : : /*
1998 : : * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 : : * already PF_EXITING could be freed from underneath us unless we
2000 : : * take an rcu_read_lock.
2001 : : */
2002 : : rcu_read_lock();
2003 : : do {
2004 : : struct task_and_cgroup ent;
2005 : :
2006 : : /* @tsk either already exited or can't exit until the end */
2007 [ # # ]: 0 : if (tsk->flags & PF_EXITING)
2008 : : goto next;
2009 : :
2010 : : /* as per above, nr_threads may decrease, but not increase. */
2011 [ # # ]: 0 : BUG_ON(i >= group_size);
2012 : 0 : ent.task = tsk;
2013 : 0 : ent.cgrp = task_cgroup_from_root(tsk, root);
2014 : : /* nothing to do if this task is already in the cgroup */
2015 [ # # ]: 0 : if (ent.cgrp == cgrp)
2016 : : goto next;
2017 : : /*
2018 : : * saying GFP_ATOMIC has no effect here because we did prealloc
2019 : : * earlier, but it's good form to communicate our expectations.
2020 : : */
2021 : 0 : retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2022 [ # # ]: 0 : BUG_ON(retval != 0);
2023 : 0 : i++;
2024 : : next:
2025 [ # # ]: 0 : if (!threadgroup)
2026 : : break;
2027 [ # # ]: 0 : } while_each_thread(leader, tsk);
2028 : : rcu_read_unlock();
2029 : : /* remember the number of threads in the array for later. */
2030 : : group_size = i;
2031 : : tset.tc_array = group;
2032 : : tset.tc_array_len = group_size;
2033 : :
2034 : : /* methods shouldn't be called if no task is actually migrating */
2035 : : retval = 0;
2036 [ # # ]: 0 : if (!group_size)
2037 : : goto out_free_group_list;
2038 : :
2039 : : /*
2040 : : * step 1: check that we can legitimately attach to the cgroup.
2041 : : */
2042 : : for_each_css(css, i, cgrp) {
2043 : : if (css->ss->can_attach) {
2044 : : retval = css->ss->can_attach(css, &tset);
2045 : : if (retval) {
2046 : : failed_css = css;
2047 : : goto out_cancel_attach;
2048 : : }
2049 : : }
2050 : : }
2051 : :
2052 : : /*
2053 : : * step 2: make sure css_sets exist for all threads to be migrated.
2054 : : * we use find_css_set, which allocates a new one if necessary.
2055 : : */
2056 [ # # ]: 0 : for (i = 0; i < group_size; i++) {
2057 : : struct css_set *old_cset;
2058 : :
2059 : 0 : tc = flex_array_get(group, i);
2060 : 0 : old_cset = task_css_set(tc->task);
2061 : 0 : tc->cset = find_css_set(old_cset, cgrp);
2062 [ # # ]: 0 : if (!tc->cset) {
2063 : : retval = -ENOMEM;
2064 : : goto out_put_css_set_refs;
2065 : : }
2066 : : }
2067 : :
2068 : : /*
2069 : : * step 3: now that we're guaranteed success wrt the css_sets,
2070 : : * proceed to move all tasks to the new cgroup. There are no
2071 : : * failure cases after here, so this is the commit point.
2072 : : */
2073 [ # # ]: 0 : for (i = 0; i < group_size; i++) {
2074 : 0 : tc = flex_array_get(group, i);
2075 : 0 : cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 : : }
2077 : : /* nothing is sensitive to fork() after this point. */
2078 : :
2079 : : /*
2080 : : * step 4: do subsystem attach callbacks.
2081 : : */
2082 : : for_each_css(css, i, cgrp)
2083 : : if (css->ss->attach)
2084 : : css->ss->attach(css, &tset);
2085 : :
2086 : : /*
2087 : : * step 5: success! and cleanup
2088 : : */
2089 : : retval = 0;
2090 : : out_put_css_set_refs:
2091 [ # # ]: 0 : if (retval) {
2092 [ # # ]: 0 : for (i = 0; i < group_size; i++) {
2093 : 0 : tc = flex_array_get(group, i);
2094 [ # # ]: 0 : if (!tc->cset)
2095 : : break;
2096 : : put_css_set(tc->cset);
2097 : : }
2098 : : }
2099 : : out_cancel_attach:
2100 : : if (retval) {
2101 : : for_each_css(css, i, cgrp) {
2102 : : if (css == failed_css)
2103 : : break;
2104 : : if (css->ss->cancel_attach)
2105 : : css->ss->cancel_attach(css, &tset);
2106 : : }
2107 : : }
2108 : : out_free_group_list:
2109 : 0 : flex_array_free(group);
2110 : 0 : return retval;
2111 : : }
2112 : :
2113 : : static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
2114 : : {
2115 : : struct cgroup_subsys_state *css;
2116 : : int i;
2117 : : int ret;
2118 : :
2119 : : for_each_css(css, i, cgrp) {
2120 : : if (css->ss->allow_attach) {
2121 : : ret = css->ss->allow_attach(css, tset);
2122 : : if (ret)
2123 : : return ret;
2124 : : } else {
2125 : : return -EACCES;
2126 : : }
2127 : : }
2128 : :
2129 : : return 0;
2130 : : }
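
/*
 * Sketch of an ->allow_attach() callback a controller could supply to relax
 * the default "root or same-uid" check in attach_task_by_pid() below.  The
 * CAP_SYS_NICE policy and the function name are assumptions.
 */
static int __maybe_unused example_allow_attach(struct cgroup_subsys_state *css,
					       struct cgroup_taskset *tset)
{
	/* permit any writer with CAP_SYS_NICE, regardless of target uid */
	return capable(CAP_SYS_NICE) ? 0 : -EACCES;
}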
2131 : :
2132 : : /*
2133 : : * Find the task_struct of the task to attach by vpid and pass it along to the
2134 : : * function to attach either it or all tasks in its threadgroup. Will lock
2135 : : * cgroup_mutex and threadgroup; may take task_lock of task.
2136 : : */
2137 : 0 : static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2138 : : {
2139 : : struct task_struct *tsk;
2140 : 0 : const struct cred *cred = current_cred(), *tcred;
2141 : : int ret;
2142 : :
2143 [ # # ]: 0 : if (!cgroup_lock_live_group(cgrp))
2144 : : return -ENODEV;
2145 : :
2146 : : retry_find_task:
2147 : : rcu_read_lock();
2148 [ # # ]: 0 : if (pid) {
2149 : 0 : tsk = find_task_by_vpid(pid);
2150 [ # # ]: 0 : if (!tsk) {
2151 : : rcu_read_unlock();
2152 : : ret = -ESRCH;
2153 : 0 : goto out_unlock_cgroup;
2154 : : }
2155 : : /*
2156 : : * even if we're attaching all tasks in the thread group, we
2157 : : * only need to check permissions on one of them.
2158 : : */
2159 : 0 : tcred = __task_cred(tsk);
2160 [ # # ][ # # ]: 0 : if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2161 [ # # ]: 0 : !uid_eq(cred->euid, tcred->uid) &&
2162 : : !uid_eq(cred->euid, tcred->suid)) {
2163 : : /*
2164 : : * if the default permission check fails, give each
2165 : : * cgroup a chance to extend the permission check
2166 : : */
2167 : : struct cgroup_taskset tset = { };
2168 : : tset.single.task = tsk;
2169 : : tset.single.cgrp = cgrp;
2170 : : ret = cgroup_allow_attach(cgrp, &tset);
2171 : : if (ret) {
2172 : : rcu_read_unlock();
2173 : : goto out_unlock_cgroup;
2174 : : }
2175 : : }
2176 : : } else
2177 : 0 : tsk = current;
2178 : :
2179 [ # # ]: 0 : if (threadgroup)
2180 : 0 : tsk = tsk->group_leader;
2181 : :
2182 : : /*
2183 : : * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2184 : : 	 * trapped in a cpuset, or an RT worker may be born in a cgroup
2185 : : * with no rt_runtime allocated. Just say no.
2186 : : */
2187 [ # # ][ # # ]: 0 : if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2188 : : ret = -EINVAL;
2189 : : rcu_read_unlock();
2190 : : goto out_unlock_cgroup;
2191 : : }
2192 : :
2193 : 0 : get_task_struct(tsk);
2194 : : rcu_read_unlock();
2195 : :
2196 : : threadgroup_lock(tsk);
2197 [ # # ]: 0 : if (threadgroup) {
2198 [ # # ]: 0 : if (!thread_group_leader(tsk)) {
2199 : : /*
2200 : : * a race with de_thread from another thread's exec()
2201 : : * may strip us of our leadership, if this happens,
2202 : : * there is no choice but to throw this task away and
2203 : : * try again; this is
2204 : : * "double-double-toil-and-trouble-check locking".
2205 : : */
2206 : : threadgroup_unlock(tsk);
2207 : : put_task_struct(tsk);
2208 : : goto retry_find_task;
2209 : : }
2210 : : }
2211 : :
2212 : 0 : ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2213 : :
2214 : : threadgroup_unlock(tsk);
2215 : :
2216 : : put_task_struct(tsk);
2217 : : out_unlock_cgroup:
2218 : 0 : mutex_unlock(&cgroup_mutex);
2219 : 0 : return ret;
2220 : : }
2221 : :
2222 : : /**
2223 : : * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2224 : : * @from: attach to all cgroups of a given task
2225 : : * @tsk: the task to be attached
2226 : : */
2227 : 0 : int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2228 : : {
2229 : : struct cgroupfs_root *root;
2230 : : int retval = 0;
2231 : :
2232 : 0 : mutex_lock(&cgroup_mutex);
2233 [ # # ]: 0 : for_each_active_root(root) {
2234 : 0 : struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2235 : :
2236 : 0 : retval = cgroup_attach_task(from_cgrp, tsk, false);
2237 [ # # ]: 0 : if (retval)
2238 : : break;
2239 : : }
2240 : 0 : mutex_unlock(&cgroup_mutex);
2241 : :
2242 : 0 : return retval;
2243 : : }
2244 : : EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
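
/*
 * Hedged sketch of cgroup_attach_task_all(): a driver that spawns a worker
 * thread on behalf of a user task can place that worker in all of the
 * task's cgroups so its resource usage is charged there.  The helper name
 * is an assumption; callers such as vhost do something along these lines.
 */
static int __maybe_unused example_adopt_worker(struct task_struct *owner,
					       struct task_struct *worker)
{
	return cgroup_attach_task_all(owner, worker);
}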
2245 : :
2246 : 0 : static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2247 : : struct cftype *cft, u64 pid)
2248 : : {
2249 : 0 : return attach_task_by_pid(css->cgroup, pid, false);
2250 : : }
2251 : :
2252 : 0 : static int cgroup_procs_write(struct cgroup_subsys_state *css,
2253 : : struct cftype *cft, u64 tgid)
2254 : : {
2255 : 0 : return attach_task_by_pid(css->cgroup, tgid, true);
2256 : : }
2257 : :
2258 : 0 : static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2259 : : struct cftype *cft, const char *buffer)
2260 : : {
2261 : : BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2262 [ # # ]: 0 : if (strlen(buffer) >= PATH_MAX)
2263 : : return -EINVAL;
2264 [ # # ]: 0 : if (!cgroup_lock_live_group(css->cgroup))
2265 : : return -ENODEV;
2266 : 0 : mutex_lock(&cgroup_root_mutex);
2267 : 0 : strcpy(css->cgroup->root->release_agent_path, buffer);
2268 : 0 : mutex_unlock(&cgroup_root_mutex);
2269 : 0 : mutex_unlock(&cgroup_mutex);
2270 : 0 : return 0;
2271 : : }
2272 : :
2273 : 0 : static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2274 : : {
2275 : 0 : struct cgroup *cgrp = seq_css(seq)->cgroup;
2276 : :
2277 [ # # ]: 0 : if (!cgroup_lock_live_group(cgrp))
2278 : : return -ENODEV;
2279 : 0 : seq_puts(seq, cgrp->root->release_agent_path);
2280 : 0 : seq_putc(seq, '\n');
2281 : 0 : mutex_unlock(&cgroup_mutex);
2282 : 0 : return 0;
2283 : : }
2284 : :
2285 : 0 : static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2286 : : {
2287 : 0 : struct cgroup *cgrp = seq_css(seq)->cgroup;
2288 : :
2289 : 0 : seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2290 : 0 : return 0;
2291 : : }
2292 : :
2293 : : /* A buffer size big enough for numbers or short strings */
2294 : : #define CGROUP_LOCAL_BUFFER_SIZE 64
2295 : :
2296 : 0 : static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2297 : : size_t nbytes, loff_t *ppos)
2298 : : {
2299 : 2 : struct cfent *cfe = __d_cfe(file->f_dentry);
2300 : : struct cftype *cft = __d_cft(file->f_dentry);
2301 : 2 : struct cgroup_subsys_state *css = cfe->css;
2302 [ - + ]: 2 : size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2303 : : char *buf;
2304 : : int ret;
2305 : :
2306 [ + ]: 2 : if (nbytes >= max_bytes)
2307 : : return -E2BIG;
2308 : :
2309 : 2 : buf = kmalloc(nbytes + 1, GFP_KERNEL);
2310 [ + - ]: 2 : if (!buf)
2311 : : return -ENOMEM;
2312 : :
2313 [ + - ]: 2 : if (copy_from_user(buf, userbuf, nbytes)) {
2314 : : ret = -EFAULT;
2315 : : goto out_free;
2316 : : }
2317 : :
2318 : 2 : buf[nbytes] = '\0';
2319 : :
2320 [ - + ]: 2 : if (cft->write_string) {
2321 : 0 : ret = cft->write_string(css, cft, strstrip(buf));
2322 [ + - ]: 2 : } else if (cft->write_u64) {
2323 : : unsigned long long v;
2324 : 2 : ret = kstrtoull(buf, 0, &v);
2325 [ + - ]: 2 : if (!ret)
2326 : 2 : ret = cft->write_u64(css, cft, v);
2327 [ # # ]: 0 : } else if (cft->write_s64) {
2328 : : long long v;
2329 : 0 : ret = kstrtoll(buf, 0, &v);
2330 [ # # ]: 0 : if (!ret)
2331 : 0 : ret = cft->write_s64(css, cft, v);
2332 [ # # ]: 0 : } else if (cft->trigger) {
2333 : 0 : ret = cft->trigger(css, (unsigned int)cft->private);
2334 : : } else {
2335 : : ret = -EINVAL;
2336 : : }
2337 : : out_free:
2338 : 2 : kfree(buf);
2339 [ + - ]: 2 : return ret ?: nbytes;
2340 : : }
2341 : :
2342 : : /*
2343 : : * seqfile ops/methods for returning structured data. Currently just
2344 : : * supports string->u64 maps, but can be extended in future.
2345 : : */
2346 : :
2347 : 0 : static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2348 : : {
2349 : : struct cftype *cft = seq_cft(seq);
2350 : :
2351 [ + + ]: 6 : if (cft->seq_start) {
2352 : 2 : return cft->seq_start(seq, ppos);
2353 : : } else {
2354 : : /*
2355 : : * The same behavior and code as single_open(). Returns
2356 : : * !NULL if pos is at the beginning; otherwise, NULL.
2357 : : */
2358 [ + + ]: 4 : return NULL + !*ppos;
2359 : : }
2360 : : }
2361 : :
2362 : 0 : static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2363 : : {
2364 : : struct cftype *cft = seq_cft(seq);
2365 : :
2366 [ + + ]: 99 : if (cft->seq_next) {
2367 : 97 : return cft->seq_next(seq, v, ppos);
2368 : : } else {
2369 : : /*
2370 : : * The same behavior and code as single_open(), always
2371 : : * terminate after the initial read.
2372 : : */
2373 : 2 : ++*ppos;
2374 : 2 : return NULL;
2375 : : }
2376 : : }
2377 : :
2378 : 0 : static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2379 : : {
2380 : : struct cftype *cft = seq_cft(seq);
2381 : :
2382 [ + + ]: 6 : if (cft->seq_stop)
2383 : 2 : cft->seq_stop(seq, v);
2384 : 0 : }
2385 : :
2386 : 0 : static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2387 : : {
2388 : : struct cftype *cft = seq_cft(m);
2389 : : struct cgroup_subsys_state *css = seq_css(m);
2390 : :
2391 [ + + ]: 99 : if (cft->seq_show)
2392 : 97 : return cft->seq_show(m, arg);
2393 : :
2394 [ + - ]: 2 : if (cft->read_u64)
2395 : 2 : seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2396 [ # # ]: 0 : else if (cft->read_s64)
2397 : 0 : seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2398 : : else
2399 : : return -EINVAL;
2400 : : return 0;
2401 : : }
2402 : :
2403 : : static struct seq_operations cgroup_seq_operations = {
2404 : : .start = cgroup_seqfile_start,
2405 : : .next = cgroup_seqfile_next,
2406 : : .stop = cgroup_seqfile_stop,
2407 : : .show = cgroup_seqfile_show,
2408 : : };
2409 : :
2410 : 0 : static int cgroup_file_open(struct inode *inode, struct file *file)
2411 : : {
2412 : 5 : struct cfent *cfe = __d_cfe(file->f_dentry);
2413 : : struct cftype *cft = __d_cft(file->f_dentry);
2414 : 5 : struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2415 : : struct cgroup_subsys_state *css;
2416 : : struct cgroup_open_file *of;
2417 : : int err;
2418 : :
2419 : 5 : err = generic_file_open(inode, file);
2420 [ + - ]: 5 : if (err)
2421 : : return err;
2422 : :
2423 : : /*
2424 : : * If the file belongs to a subsystem, pin the css. Will be
2425 : : * unpinned either on open failure or release. This ensures that
2426 : : * @css stays alive for all file operations.
2427 : : */
2428 : : rcu_read_lock();
2429 : 5 : css = cgroup_css(cgrp, cft->ss);
2430 [ - + ][ # # ]: 5 : if (cft->ss && !css_tryget(css))
2431 : : css = NULL;
2432 : : rcu_read_unlock();
2433 : :
2434 [ + - ]: 5 : if (!css)
2435 : : return -ENODEV;
2436 : :
2437 : : /*
2438 : : * @cfe->css is used by read/write/close to determine the
2439 : : * associated css. @file->private_data would be a better place but
2440 : : * that's already used by seqfile. Multiple accessors may use it
2441 : : * simultaneously which is okay as the association never changes.
2442 : : */
2443 [ + + ][ + - ]: 5 : WARN_ON_ONCE(cfe->css && cfe->css != css);
[ - + ][ # # ]
[ # # ]
2444 : 5 : cfe->css = css;
2445 : :
2446 : 5 : of = __seq_open_private(file, &cgroup_seq_operations,
2447 : : sizeof(struct cgroup_open_file));
2448 [ + - ]: 5 : if (of) {
2449 : 5 : of->cfe = cfe;
2450 : 5 : return 0;
2451 : : }
2452 : :
2453 [ # # ]: 0 : if (css->ss)
2454 : : css_put(css);
2455 : : return -ENOMEM;
2456 : : }
2457 : :
2458 : 0 : static int cgroup_file_release(struct inode *inode, struct file *file)
2459 : : {
2460 : 5 : struct cfent *cfe = __d_cfe(file->f_dentry);
2461 : 5 : struct cgroup_subsys_state *css = cfe->css;
2462 : :
2463 [ - + ]: 5 : if (css->ss)
2464 : : css_put(css);
2465 : 5 : return seq_release_private(inode, file);
2466 : : }
2467 : :
2468 : : /*
2469 : : * cgroup_rename - Only allow simple rename of directories in place.
2470 : : */
2471 : 0 : static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2472 : 0 : struct inode *new_dir, struct dentry *new_dentry)
2473 : : {
2474 : : int ret;
2475 : : struct cgroup_name *name, *old_name;
2476 : 0 : struct cgroup *cgrp;
2477 : :
2478 : : /*
2479 : : 	 * It's convenient to use the parent dir's i_mutex to protect
2480 : : * cgrp->name.
2481 : : */
2482 : : lockdep_assert_held(&old_dir->i_mutex);
2483 : :
2484 [ # # ]: 0 : if (!S_ISDIR(old_dentry->d_inode->i_mode))
2485 : : return -ENOTDIR;
2486 [ # # ]: 0 : if (new_dentry->d_inode)
2487 : : return -EEXIST;
2488 [ # # ]: 0 : if (old_dir != new_dir)
2489 : : return -EIO;
2490 : :
2491 : : cgrp = __d_cgrp(old_dentry);
2492 : :
2493 : : /*
2494 : : * This isn't a proper migration and its usefulness is very
2495 : : * limited. Disallow if sane_behavior.
2496 : : */
2497 [ # # ]: 0 : if (cgroup_sane_behavior(cgrp))
2498 : : return -EPERM;
2499 : :
2500 : 0 : name = cgroup_alloc_name(new_dentry);
2501 [ # # ]: 0 : if (!name)
2502 : : return -ENOMEM;
2503 : :
2504 : 0 : ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2505 [ # # ]: 0 : if (ret) {
2506 : 0 : kfree(name);
2507 : 0 : return ret;
2508 : : }
2509 : :
2510 : 0 : old_name = rcu_dereference_protected(cgrp->name, true);
2511 : 0 : rcu_assign_pointer(cgrp->name, name);
2512 : :
2513 : 0 : kfree_rcu(old_name, rcu_head);
2514 : 0 : return 0;
2515 : : }
2516 : :
2517 : 0 : static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2518 : : {
2519 [ # # ][ # # ]: 0 : if (S_ISDIR(dentry->d_inode->i_mode))
[ # # ][ # # ]
2520 : 0 : return &__d_cgrp(dentry)->xattrs;
2521 : : else
2522 : 0 : return &__d_cfe(dentry)->xattrs;
2523 : : }
2524 : :
2525 : : static inline int xattr_enabled(struct dentry *dentry)
2526 : : {
2527 : 0 : struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2528 : 0 : return root->flags & CGRP_ROOT_XATTR;
2529 : : }
2530 : :
2531 : 0 : static bool is_valid_xattr(const char *name)
2532 : : {
2533 [ # # ][ # # ]: 0 : if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2534 : 0 : !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2535 : : return true;
2536 : 0 : return false;
2537 : : }
2538 : :
2539 : 0 : static int cgroup_setxattr(struct dentry *dentry, const char *name,
2540 : : const void *val, size_t size, int flags)
2541 : : {
2542 [ # # ]: 0 : if (!xattr_enabled(dentry))
2543 : : return -EOPNOTSUPP;
2544 [ # # ]: 0 : if (!is_valid_xattr(name))
2545 : : return -EINVAL;
2546 : 0 : return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2547 : : }
2548 : :
2549 : 0 : static int cgroup_removexattr(struct dentry *dentry, const char *name)
2550 : : {
2551 [ # # ]: 0 : if (!xattr_enabled(dentry))
2552 : : return -EOPNOTSUPP;
2553 [ # # ]: 0 : if (!is_valid_xattr(name))
2554 : : return -EINVAL;
2555 : 0 : return simple_xattr_remove(__d_xattrs(dentry), name);
2556 : : }
2557 : :
2558 : 0 : static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2559 : : void *buf, size_t size)
2560 : : {
2561 [ # # ]: 0 : if (!xattr_enabled(dentry))
2562 : : return -EOPNOTSUPP;
2563 [ # # ]: 0 : if (!is_valid_xattr(name))
2564 : : return -EINVAL;
2565 : 0 : return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2566 : : }
2567 : :
2568 : 0 : static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2569 : : {
2570 [ # # ]: 0 : if (!xattr_enabled(dentry))
2571 : : return -EOPNOTSUPP;
2572 : 0 : return simple_xattr_list(__d_xattrs(dentry), buf, size);
2573 : : }
2574 : :
2575 : : static const struct file_operations cgroup_file_operations = {
2576 : : .read = seq_read,
2577 : : .write = cgroup_file_write,
2578 : : .llseek = generic_file_llseek,
2579 : : .open = cgroup_file_open,
2580 : : .release = cgroup_file_release,
2581 : : };
2582 : :
2583 : : static const struct inode_operations cgroup_file_inode_operations = {
2584 : : .setxattr = cgroup_setxattr,
2585 : : .getxattr = cgroup_getxattr,
2586 : : .listxattr = cgroup_listxattr,
2587 : : .removexattr = cgroup_removexattr,
2588 : : };
2589 : :
2590 : : static const struct inode_operations cgroup_dir_inode_operations = {
2591 : : .lookup = simple_lookup,
2592 : : .mkdir = cgroup_mkdir,
2593 : : .rmdir = cgroup_rmdir,
2594 : : .rename = cgroup_rename,
2595 : : .setxattr = cgroup_setxattr,
2596 : : .getxattr = cgroup_getxattr,
2597 : : .listxattr = cgroup_listxattr,
2598 : : .removexattr = cgroup_removexattr,
2599 : : };
2600 : :
2601 : 0 : static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2602 : : struct super_block *sb)
2603 : : {
2604 : : struct inode *inode;
2605 : :
2606 [ + - ]: 28 : if (!dentry)
2607 : : return -ENOENT;
2608 [ + - ]: 28 : if (dentry->d_inode)
2609 : : return -EEXIST;
2610 : :
2611 : 28 : inode = cgroup_new_inode(mode, sb);
2612 [ + - ]: 28 : if (!inode)
2613 : : return -ENOMEM;
2614 : :
2615 [ + + ]: 28 : if (S_ISDIR(mode)) {
2616 : 2 : inode->i_op = &cgroup_dir_inode_operations;
2617 : 2 : inode->i_fop = &simple_dir_operations;
2618 : :
2619 : : /* start off with i_nlink == 2 (for "." entry) */
2620 : 2 : inc_nlink(inode);
2621 : 2 : inc_nlink(dentry->d_parent->d_inode);
2622 : :
2623 : : /*
2624 : : * Control reaches here with cgroup_mutex held.
2625 : : * @inode->i_mutex should nest outside cgroup_mutex but we
2626 : : * want to populate it immediately without releasing
2627 : : * cgroup_mutex. As @inode isn't visible to anyone else
2628 : : * yet, trylock will always succeed without affecting
2629 : : * lockdep checks.
2630 : : */
2631 [ - + ][ # # ]: 2 : WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
[ # # ]
2632 [ + - ]: 26 : } else if (S_ISREG(mode)) {
2633 : 26 : inode->i_size = 0;
2634 : 26 : inode->i_fop = &cgroup_file_operations;
2635 : 26 : inode->i_op = &cgroup_file_inode_operations;
2636 : : }
2637 : 28 : d_instantiate(dentry, inode);
2638 : : dget(dentry); /* Extra count - pin the dentry in core */
2639 : : return 0;
2640 : : }
2641 : :
2642 : : /**
2643 : : * cgroup_file_mode - deduce file mode of a control file
2644 : : * @cft: the control file in question
2645 : : *
2646 : : * returns cft->mode if ->mode is not 0
2647 : : * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2648 : : * returns S_IRUGO if it has only a read handler
2649 : :  * returns S_IWUSR if it has only a write handler
2650 : : */
2651 : 0 : static umode_t cgroup_file_mode(const struct cftype *cft)
2652 : : {
2653 : : umode_t mode = 0;
2654 : :
2655 [ + ]: 26 : if (cft->mode)
2656 : : return cft->mode;
2657 : :
2658 [ + + ][ + - ]: 42 : if (cft->read_u64 || cft->read_s64 || cft->seq_show)
[ + - ]
2659 : : mode |= S_IRUGO;
2660 : :
2661 [ + + ][ + - ]: 16 : if (cft->write_u64 || cft->write_s64 || cft->write_string ||
[ + + ][ - + ]
2662 : 3 : cft->trigger)
2663 : 13 : mode |= S_IWUSR;
2664 : :
2665 : 16 : return mode;
2666 : : }
2667 : :
2668 : 0 : static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2669 : : {
2670 : 26 : struct dentry *dir = cgrp->dentry;
2671 : : struct cgroup *parent = __d_cgrp(dir);
2672 : : struct dentry *dentry;
2673 : : struct cfent *cfe;
2674 : : int error;
2675 : : umode_t mode;
2676 : 26 : char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2677 : :
2678 [ - + ][ # # ]: 26 : if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
[ # # ]
2679 : 0 : !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2680 : 0 : strcpy(name, cft->ss->name);
2681 : 0 : strcat(name, ".");
2682 : : }
2683 : 26 : strcat(name, cft->name);
2684 : :
2685 [ - + ]: 26 : BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2686 : :
2687 : : cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2688 [ + - ]: 26 : if (!cfe)
2689 : : return -ENOMEM;
2690 : :
2691 : 26 : dentry = lookup_one_len(name, dir, strlen(name));
2692 [ - + ]: 26 : if (IS_ERR(dentry)) {
2693 : : error = PTR_ERR(dentry);
2694 : : goto out;
2695 : : }
2696 : :
2697 : 26 : cfe->type = (void *)cft;
2698 : 26 : cfe->dentry = dentry;
2699 : 26 : dentry->d_fsdata = cfe;
2700 : : simple_xattrs_init(&cfe->xattrs);
2701 : :
2702 : 26 : mode = cgroup_file_mode(cft);
2703 : 26 : error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2704 [ + - ]: 26 : if (!error) {
2705 : 26 : list_add_tail(&cfe->node, &parent->files);
2706 : : cfe = NULL;
2707 : : }
2708 : 26 : dput(dentry);
2709 : : out:
2710 : 26 : kfree(cfe);
2711 : : return error;
2712 : : }
2713 : :
2714 : : /**
2715 : : * cgroup_addrm_files - add or remove files to a cgroup directory
2716 : : * @cgrp: the target cgroup
2717 : : * @cfts: array of cftypes to be added
2718 : : * @is_add: whether to add or remove
2719 : : *
2720 : : * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2721 : : * For removals, this function never fails. If addition fails, this
2722 : : * function doesn't remove files already added. The caller is responsible
2723 : : * for cleaning up.
2724 : : */
2725 : 0 : static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 : : bool is_add)
2727 : : {
2728 : : struct cftype *cft;
2729 : : int ret;
2730 : :
2731 : : lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2732 : : lockdep_assert_held(&cgroup_mutex);
2733 : :
2734 [ + + ]: 49 : for (cft = cfts; cft->name[0] != '\0'; cft++) {
2735 : : /* does cft->flags tell us to skip this file on @cgrp? */
2736 [ + + ][ - + ]: 42 : if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2737 : 0 : continue;
2738 [ - + ][ # # ]: 42 : if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2739 : 0 : continue;
2740 [ + + ][ + + ]: 42 : if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2741 : 8 : continue;
2742 : :
2743 [ + + ]: 34 : if (is_add) {
2744 : 26 : ret = cgroup_add_file(cgrp, cft);
2745 [ - + ]: 26 : if (ret) {
2746 : 0 : pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2747 : : cft->name, ret);
2748 : 0 : return ret;
2749 : : }
2750 : : } else {
2751 : 8 : cgroup_rm_file(cgrp, cft);
2752 : : }
2753 : : }
2754 : : return 0;
2755 : : }
2756 : :
2757 : : static void cgroup_cfts_prepare(void)
2758 : : __acquires(&cgroup_mutex)
2759 : : {
2760 : : /*
2761 : : * Thanks to the entanglement with vfs inode locking, we can't walk
2762 : : * the existing cgroups under cgroup_mutex and create files.
2763 : : * Instead, we use css_for_each_descendant_pre() and drop RCU read
2764 : : * lock before calling cgroup_addrm_files().
2765 : : */
2766 : 0 : mutex_lock(&cgroup_mutex);
2767 : : }
2768 : :
2769 : 0 : static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2770 : : __releases(&cgroup_mutex)
2771 : : {
2772 : 0 : LIST_HEAD(pending);
2773 : 0 : struct cgroup_subsys *ss = cfts[0].ss;
2774 : 0 : struct cgroup *root = &ss->root->top_cgroup;
2775 : 0 : struct super_block *sb = ss->root->sb;
2776 : : struct dentry *prev = NULL;
2777 : : struct inode *inode;
2778 : : struct cgroup_subsys_state *css;
2779 : : u64 update_before;
2780 : : int ret = 0;
2781 : :
2782 : : /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2783 [ # # ][ # # ]: 0 : if (!cfts || ss->root == &cgroup_dummy_root ||
[ # # ]
2784 : 0 : !atomic_inc_not_zero(&sb->s_active)) {
2785 : 0 : mutex_unlock(&cgroup_mutex);
2786 : 0 : return 0;
2787 : : }
2788 : :
2789 : : /*
2790 : : * All cgroups which are created after we drop cgroup_mutex will
2791 : : * have the updated set of files, so we only need to update the
2792 : : * cgroups created before the current @cgroup_serial_nr_next.
2793 : : */
2794 : 0 : update_before = cgroup_serial_nr_next;
2795 : :
2796 : : /* add/rm files for all cgroups created before */
2797 [ # # ]: 0 : css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2798 : 0 : struct cgroup *cgrp = css->cgroup;
2799 : :
2800 [ # # ]: 0 : if (cgroup_is_dead(cgrp))
2801 : 0 : continue;
2802 : :
2803 : 0 : inode = cgrp->dentry->d_inode;
2804 : : dget(cgrp->dentry);
2805 : 0 : dput(prev);
2806 : 0 : prev = cgrp->dentry;
2807 : :
2808 : 0 : mutex_unlock(&cgroup_mutex);
2809 : 0 : mutex_lock(&inode->i_mutex);
2810 : 0 : mutex_lock(&cgroup_mutex);
2811 [ # # ][ # # ]: 0 : if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2812 : 0 : ret = cgroup_addrm_files(cgrp, cfts, is_add);
2813 : 0 : mutex_unlock(&inode->i_mutex);
2814 [ # # ]: 0 : if (ret)
2815 : : break;
2816 : : }
2817 : 0 : mutex_unlock(&cgroup_mutex);
2818 : 0 : dput(prev);
2819 : 0 : deactivate_super(sb);
2820 : 0 : return ret;
2821 : : }
2822 : :
2823 : : /**
2824 : : * cgroup_add_cftypes - add an array of cftypes to a subsystem
2825 : : * @ss: target cgroup subsystem
2826 : : * @cfts: zero-length name terminated array of cftypes
2827 : : *
2828 : : * Register @cfts to @ss. Files described by @cfts are created for all
2829 : : * existing cgroups to which @ss is attached and all future cgroups will
2830 : : * have them too. This function can be called anytime whether @ss is
2831 : : * attached or not.
2832 : : *
2833 : : * Returns 0 on successful registration, -errno on failure. Note that this
2834 : : * function currently returns 0 as long as @cfts registration is successful
2835 : : * even if some file creation attempts on existing cgroups fail.
2836 : : */
2837 : 0 : int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2838 : : {
2839 : : struct cftype_set *set;
2840 : : struct cftype *cft;
2841 : : int ret;
2842 : :
2843 : : set = kzalloc(sizeof(*set), GFP_KERNEL);
2844 [ # # ]: 0 : if (!set)
2845 : : return -ENOMEM;
2846 : :
2847 [ # # ]: 0 : for (cft = cfts; cft->name[0] != '\0'; cft++)
2848 : 0 : cft->ss = ss;
2849 : :
2850 : : cgroup_cfts_prepare();
2851 : 0 : set->cfts = cfts;
2852 : 0 : list_add_tail(&set->node, &ss->cftsets);
2853 : 0 : ret = cgroup_cfts_commit(cfts, true);
2854 [ # # ]: 0 : if (ret)
2855 : 0 : cgroup_rm_cftypes(cfts);
2856 : 0 : return ret;
2857 : : }
2858 : : EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
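
/*
 * Sketch of registering control files via cgroup_add_cftypes().  The
 * subsystem, file name and handlers are illustrative assumptions; real
 * controllers declare an array like this and register it once at init
 * time.  The array is terminated by an entry with an empty name.
 */
static u64 example_limit_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	return 0;	/* would report the per-css value */
}

static int example_limit_write(struct cgroup_subsys_state *css,
			       struct cftype *cft, u64 val)
{
	return 0;	/* would store @val in the per-css state */
}

static struct cftype example_files[] __maybe_unused = {
	{
		.name = "limit",
		.read_u64 = example_limit_read,
		.write_u64 = example_limit_write,
	},
	{ }	/* terminate */
};

/*
 * Registration would then happen from the (hypothetical) controller's init
 * path:	cgroup_add_cftypes(&example_subsys, example_files);
 */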
2859 : :
2860 : : /**
2861 : : * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2862 : : * @cfts: zero-length name terminated array of cftypes
2863 : : *
2864 : : * Unregister @cfts. Files described by @cfts are removed from all
2865 : : * existing cgroups and all future cgroups won't have them either. This
2866 : : * function can be called anytime whether @cfts' subsys is attached or not.
2867 : : *
2868 : : * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2869 : : * registered.
2870 : : */
2871 : 0 : int cgroup_rm_cftypes(struct cftype *cfts)
2872 : : {
2873 : : struct cftype_set *set;
2874 : :
2875 [ # # ][ # # ]: 0 : if (!cfts || !cfts[0].ss)
2876 : : return -ENOENT;
2877 : :
2878 : : cgroup_cfts_prepare();
2879 : :
2880 [ # # ]: 0 : list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2881 [ # # ]: 0 : if (set->cfts == cfts) {
2882 : : list_del(&set->node);
2883 : 0 : kfree(set);
2884 : 0 : cgroup_cfts_commit(cfts, false);
2885 : 0 : return 0;
2886 : : }
2887 : : }
2888 : :
2889 : 0 : cgroup_cfts_commit(NULL, false);
2890 : 0 : return -ENOENT;
2891 : : }
2892 : :
2893 : : /**
2894 : : * cgroup_task_count - count the number of tasks in a cgroup.
2895 : : * @cgrp: the cgroup in question
2896 : : *
2897 : : * Return the number of tasks in the cgroup.
2898 : : */
2899 : 0 : int cgroup_task_count(const struct cgroup *cgrp)
2900 : : {
2901 : : int count = 0;
2902 : : struct cgrp_cset_link *link;
2903 : :
2904 : 1 : read_lock(&css_set_lock);
2905 [ + + ]: 2 : list_for_each_entry(link, &cgrp->cset_links, cset_link)
2906 : 1 : count += atomic_read(&link->cset->refcount);
2907 : : read_unlock(&css_set_lock);
2908 : 1 : return count;
2909 : : }
2910 : :
2911 : : /*
2912 : : * To reduce the fork() overhead for systems that are not actually using
2913 : : * their cgroups capability, we don't maintain the lists running through
2914 : : * each css_set to its tasks until we see the list actually used - in other
2915 : : * words after the first call to css_task_iter_start().
2916 : : */
2917 : 0 : static void cgroup_enable_task_cg_lists(void)
2918 : : {
2919 : : struct task_struct *p, *g;
2920 : 1 : write_lock(&css_set_lock);
2921 : 1 : use_task_css_set_links = 1;
2922 : : /*
2923 : : * We need tasklist_lock because RCU is not safe against
2924 : : * while_each_thread(). Besides, a forking task that has passed
2925 : : * cgroup_post_fork() without seeing use_task_css_set_links = 1
2926 : : * is not guaranteed to have its child immediately visible in the
2927 : : * tasklist if we walk through it with RCU.
2928 : : */
2929 : 1 : read_lock(&tasklist_lock);
2930 [ + + ]: 184 : do_each_thread(g, p) {
2931 : : task_lock(p);
2932 : : /*
2933 : : * We should check if the process is exiting, otherwise
2934 : : * it will race with cgroup_exit() in that the list
2935 : : * entry won't be deleted though the process has exited.
2936 : : * Do it while holding siglock so that we don't end up
2937 : : * racing against cgroup_exit().
2938 : : */
2939 : 186 : spin_lock_irq(&p->sighand->siglock);
2940 [ + + ][ + - ]: 186 : if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2941 : 97 : list_add(&p->cg_list, &task_css_set(p)->tasks);
2942 : 186 : spin_unlock_irq(&p->sighand->siglock);
2943 : :
2944 : : task_unlock(p);
2945 [ + + ]: 186 : } while_each_thread(g, p);
2946 : : read_unlock(&tasklist_lock);
2947 : : write_unlock(&css_set_lock);
2948 : 1 : }
2949 : :
2950 : : /**
2951 : : * css_next_child - find the next child of a given css
2952 : : * @pos_css: the current position (%NULL to initiate traversal)
2953 : : * @parent_css: css whose children to walk
2954 : : *
2955 : : * This function returns the next child of @parent_css and should be called
2956 : : * under either cgroup_mutex or RCU read lock. The only requirement is
2957 : : * that @parent_css and @pos_css are accessible. The next sibling is
2958 : : * guaranteed to be returned regardless of their states.
2959 : : */
2960 : : struct cgroup_subsys_state *
2961 : 0 : css_next_child(struct cgroup_subsys_state *pos_css,
2962 : : struct cgroup_subsys_state *parent_css)
2963 : : {
2964 [ # # ]: 0 : struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
2965 : 0 : struct cgroup *cgrp = parent_css->cgroup;
2966 : : struct cgroup *next;
2967 : :
2968 : : cgroup_assert_mutex_or_rcu_locked();
2969 : :
2970 : : /*
2971 : : * @pos could already have been removed. Once a cgroup is removed,
2972 : : * its ->sibling.next is no longer updated when its next sibling
2973 : : * changes. As CGRP_DEAD assertion is serialized and happens
2974 : : * before the cgroup is taken off the ->sibling list, if we see it
2975 : : * unasserted, it's guaranteed that the next sibling hasn't
2976 : : * finished its grace period even if it's already removed, and thus
2977 : : * safe to dereference from this RCU critical section. If
2978 : : * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
2979 : : * to be visible as %true here.
2980 : : *
2981 : : * If @pos is dead, its next pointer can't be dereferenced;
2982 : : * however, as each cgroup is given a monotonically increasing
2983 : : * unique serial number and always appended to the sibling list,
2984 : : * the next one can be found by walking the parent's children until
2985 : : * we see a cgroup with higher serial number than @pos's. While
2986 : : * this path can be slower, it's taken only when either the current
2987 : : * cgroup is removed or iteration and removal race.
2988 : : */
2989 [ # # ]: 0 : if (!pos) {
2990 : 0 : next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
2991 [ # # ]: 0 : } else if (likely(!cgroup_is_dead(pos))) {
2992 : 0 : next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
2993 : : } else {
2994 [ # # ]: 0 : list_for_each_entry_rcu(next, &cgrp->children, sibling)
2995 [ # # ]: 0 : if (next->serial_nr > pos->serial_nr)
2996 : : break;
2997 : : }
2998 : :
2999 [ # # ]: 0 : if (&next->sibling == &cgrp->children)
3000 : : return NULL;
3001 : :
3002 : 0 : return cgroup_css(next, parent_css->ss);
3003 : : }
3004 : : EXPORT_SYMBOL_GPL(css_next_child);
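
/*
 * Sketch of walking the direct children of a css with css_for_each_child(),
 * which is built on css_next_child() above.  The body of the visitor is an
 * assumption; the locking rule (cgroup_mutex or an RCU read lock) follows
 * the comment above.
 */
static void __maybe_unused example_walk_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;

	rcu_read_lock();
	css_for_each_child(child, parent)
		pr_debug("child cgroup id %d\n", child->cgroup->id);
	rcu_read_unlock();
}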
3005 : :
3006 : : /**
3007 : : * css_next_descendant_pre - find the next descendant for pre-order walk
3008 : : * @pos: the current position (%NULL to initiate traversal)
3009 : : * @root: css whose descendants to walk
3010 : : *
3011 : : * To be used by css_for_each_descendant_pre(). Find the next descendant
3012 : : * to visit for pre-order traversal of @root's descendants. @root is
3013 : : * included in the iteration and the first node to be visited.
3014 : : *
3015 : : * While this function requires cgroup_mutex or RCU read locking, it
3016 : : * doesn't require the whole traversal to be contained in a single critical
3017 : : * section. This function will return the correct next descendant as long
3018 : : * as both @pos and @root are accessible and @pos is a descendant of @root.
3019 : : */
3020 : : struct cgroup_subsys_state *
3021 : 0 : css_next_descendant_pre(struct cgroup_subsys_state *pos,
3022 : : struct cgroup_subsys_state *root)
3023 : : {
3024 : : struct cgroup_subsys_state *next;
3025 : :
3026 : : cgroup_assert_mutex_or_rcu_locked();
3027 : :
3028 : : /* if first iteration, visit @root */
3029 [ # # ]: 0 : if (!pos)
3030 : : return root;
3031 : :
3032 : : /* visit the first child if exists */
3033 : 0 : next = css_next_child(NULL, pos);
3034 [ # # ]: 0 : if (next)
3035 : : return next;
3036 : :
3037 : : /* no child, visit my or the closest ancestor's next sibling */
3038 [ # # ]: 0 : while (pos != root) {
3039 : 0 : next = css_next_child(pos, css_parent(pos));
3040 [ # # ]: 0 : if (next)
3041 : : return next;
3042 : : pos = css_parent(pos);
3043 : : }
3044 : :
3045 : : return NULL;
3046 : : }
3047 : : EXPORT_SYMBOL_GPL(css_next_descendant_pre);
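
/*
 * Sketch of a pre-order walk using css_for_each_descendant_pre(); counting
 * the visited nodes is an assumption chosen for brevity.  As documented
 * above, @root itself is visited first and the walk needs cgroup_mutex or
 * an RCU read lock.
 */
static int __maybe_unused example_count_descendants(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;
	int n = 0;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		n++;	/* includes @root itself */
	rcu_read_unlock();

	return n;
}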
3048 : :
3049 : : /**
3050 : : * css_rightmost_descendant - return the rightmost descendant of a css
3051 : : * @pos: css of interest
3052 : : *
3053 : : * Return the rightmost descendant of @pos. If there's no descendant, @pos
3054 : : * is returned. This can be used during pre-order traversal to skip
3055 : : * subtree of @pos.
3056 : : *
3057 : : * While this function requires cgroup_mutex or RCU read locking, it
3058 : : * doesn't require the whole traversal to be contained in a single critical
3059 : : * section. This function will return the correct rightmost descendant as
3060 : : * long as @pos is accessible.
3061 : : */
3062 : : struct cgroup_subsys_state *
3063 : 0 : css_rightmost_descendant(struct cgroup_subsys_state *pos)
3064 : : {
3065 : : struct cgroup_subsys_state *last, *tmp;
3066 : :
3067 : : cgroup_assert_mutex_or_rcu_locked();
3068 : :
3069 : : do {
3070 : : last = pos;
3071 : : /* ->prev isn't RCU safe, walk ->next till the end */
3072 : : pos = NULL;
3073 [ # # ]: 0 : css_for_each_child(tmp, last)
3074 : : pos = tmp;
3075 [ # # ]: 0 : } while (pos);
3076 : :
3077 : 0 : return last;
3078 : : }
3079 : : EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3080 : :
3081 : : static struct cgroup_subsys_state *
3082 : : css_leftmost_descendant(struct cgroup_subsys_state *pos)
3083 : : {
3084 : : struct cgroup_subsys_state *last;
3085 : :
3086 : : do {
3087 : : last = pos;
3088 : 0 : pos = css_next_child(NULL, pos);
3089 [ # # ][ # # ]: 0 : } while (pos);
3090 : :
3091 : : return last;
3092 : : }
3093 : :
3094 : : /**
3095 : : * css_next_descendant_post - find the next descendant for post-order walk
3096 : : * @pos: the current position (%NULL to initiate traversal)
3097 : : * @root: css whose descendants to walk
3098 : : *
3099 : : * To be used by css_for_each_descendant_post(). Find the next descendant
3100 : : * to visit for post-order traversal of @root's descendants. @root is
3101 : : * included in the iteration and the last node to be visited.
3102 : : *
3103 : : * While this function requires cgroup_mutex or RCU read locking, it
3104 : : * doesn't require the whole traversal to be contained in a single critical
3105 : : * section. This function will return the correct next descendant as long
3106 : :  * as both @pos and @root are accessible and @pos is a descendant of
3107 : :  * @root.
3108 : : */
3109 : : struct cgroup_subsys_state *
3110 : 0 : css_next_descendant_post(struct cgroup_subsys_state *pos,
3111 : : struct cgroup_subsys_state *root)
3112 : : {
3113 : : struct cgroup_subsys_state *next;
3114 : :
3115 : : cgroup_assert_mutex_or_rcu_locked();
3116 : :
3117 : : /* if first iteration, visit leftmost descendant which may be @root */
3118 [ # # ]: 0 : if (!pos)
3119 : : return css_leftmost_descendant(root);
3120 : :
3121 : : /* if we visited @root, we're done */
3122 [ # # ]: 0 : if (pos == root)
3123 : : return NULL;
3124 : :
3125 : : /* if there's an unvisited sibling, visit its leftmost descendant */
3126 : 0 : next = css_next_child(pos, css_parent(pos));
3127 [ # # ]: 0 : if (next)
3128 : : return css_leftmost_descendant(next);
3129 : :
3130 : : /* no sibling left, visit parent */
3131 : 0 : return css_parent(pos);
3132 : : }
3133 : : EXPORT_SYMBOL_GPL(css_next_descendant_post);
3134 : :
3135 : : /**
3136 : :  * css_advance_task_iter - advance a task iterator to the next css_set
3137 : : * @it: the iterator to advance
3138 : : *
3139 : : * Advance @it to the next css_set to walk.
3140 : : */
3141 : : static void css_advance_task_iter(struct css_task_iter *it)
3142 : : {
3143 : : struct list_head *l = it->cset_link;
3144 : : struct cgrp_cset_link *link;
3145 : : struct css_set *cset;
3146 : :
3147 : : /* Advance to the next non-empty css_set */
3148 : : do {
3149 : 2 : l = l->next;
3150 [ + - ][ - + ]: 3 : if (l == &it->origin_css->cgroup->cset_links) {
3151 : 1 : it->cset_link = NULL;
3152 : : return;
3153 : : }
3154 : : link = list_entry(l, struct cgrp_cset_link, cset_link);
3155 : 1 : cset = link->cset;
3156 [ # # ][ - + ]: 1 : } while (list_empty(&cset->tasks));
3157 : 1 : it->cset_link = l;
3158 : 1 : it->task = cset->tasks.next;
3159 : : }
3160 : :
3161 : : /**
3162 : : * css_task_iter_start - initiate task iteration
3163 : : * @css: the css to walk tasks of
3164 : : * @it: the task iterator to use
3165 : : *
3166 : : * Initiate iteration through the tasks of @css. The caller can call
3167 : : * css_task_iter_next() to walk through the tasks until the function
3168 : : * returns NULL. On completion of iteration, css_task_iter_end() must be
3169 : : * called.
3170 : : *
3171 : : * Note that this function acquires a lock which is released when the
3172 : : * iteration finishes. The caller can't sleep while iteration is in
3173 : : * progress.
3174 : : */
3175 : 0 : void css_task_iter_start(struct cgroup_subsys_state *css,
3176 : : struct css_task_iter *it)
3177 : : __acquires(css_set_lock)
3178 : : {
3179 : : /*
3180 : : * The first time anyone tries to iterate across a css, we need to
3181 : : * enable the list linking each css_set to its tasks, and fix up
3182 : : * all existing tasks.
3183 : : */
3184 [ + - ]: 1 : if (!use_task_css_set_links)
3185 : 1 : cgroup_enable_task_cg_lists();
3186 : :
3187 : 1 : read_lock(&css_set_lock);
3188 : :
3189 : 2 : it->origin_css = css;
3190 : 2 : it->cset_link = &css->cgroup->cset_links;
3191 : :
3192 : : css_advance_task_iter(it);
3193 : 1 : }
3194 : :
3195 : : /**
3196 : : * css_task_iter_next - return the next task for the iterator
3197 : : * @it: the task iterator being iterated
3198 : : *
3199 : : * The "next" function for task iteration. @it should have been
3200 : : * initialized via css_task_iter_start(). Returns NULL when the iteration
3201 : : * reaches the end.
3202 : : */
3203 : 0 : struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 : : {
3205 : : struct task_struct *res;
3206 : 98 : struct list_head *l = it->task;
3207 : : struct cgrp_cset_link *link;
3208 : :
3209 : : /* If the iterator's cset_link is NULL, we have no tasks */
3210 [ + ]: 98 : if (!it->cset_link)
3211 : : return NULL;
3212 : : res = list_entry(l, struct task_struct, cg_list);
3213 : : /* Advance iterator to find next entry */
3214 : 195 : l = l->next;
3215 : : link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3216 [ + + ]: 195 : if (l == &link->cset->tasks) {
3217 : : /*
3218 : : * We reached the end of this task list - move on to the
3219 : : * next cgrp_cset_link.
3220 : : */
3221 : : css_advance_task_iter(it);
3222 : : } else {
3223 : 96 : it->task = l;
3224 : : }
3225 : 97 : return res;
3226 : : }
3227 : :
3228 : : /**
3229 : : * css_task_iter_end - finish task iteration
3230 : : * @it: the task iterator to finish
3231 : : *
3232 : : * Finish task iteration started by css_task_iter_start().
3233 : : */
3234 : 0 : void css_task_iter_end(struct css_task_iter *it)
3235 : : __releases(css_set_lock)
3236 : : {
3237 : : read_unlock(&css_set_lock);
3238 : 1 : }
3239 : :
3240 : : static inline int started_after_time(struct task_struct *t1,
3241 : : struct timespec *time,
3242 : : struct task_struct *t2)
3243 : : {
3244 : : int start_diff = timespec_compare(&t1->start_time, time);
3245 [ # # ][ # # ]: 0 : if (start_diff > 0) {
3246 : : return 1;
3247 [ # # ][ # # ]: 0 : } else if (start_diff < 0) {
3248 : : return 0;
3249 : : } else {
3250 : : /*
3251 : : * Arbitrarily, if two processes started at the same
3252 : : * time, we'll say that the lower pointer value
3253 : : * started first. Note that t2 may have exited by now
3254 : : * so this may not be a valid pointer any longer, but
3255 : : * that's fine - it still serves to distinguish
3256 : : * between two tasks started (effectively) simultaneously.
3257 : : */
3258 : 0 : return t1 > t2;
3259 : : }
3260 : : }
3261 : :
3262 : : /*
3263 : : * This function is a callback from heap_insert() and is used to order
3264 : : * the heap.
3265 : : * In this case we order the heap by descending task start time.
3266 : : */
3267 : 0 : static inline int started_after(void *p1, void *p2)
3268 : : {
3269 : : struct task_struct *t1 = p1;
3270 : : struct task_struct *t2 = p2;
3271 : 0 : return started_after_time(t1, &t2->start_time, t2);
3272 : : }
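 : : /*
 : :  * With this "greater than" callback the ptr_heap keeps the earliest
 : :  * started tasks when it overflows: e.g. a 2-slot heap fed tasks
 : :  * started at times 30, 10 and 20 ends up holding the 10 and 20
 : :  * entries while the 30 entry is handed back by heap_insert() as the
 : :  * dropped overflow (times here are purely illustrative).
 : :  */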
3273 : :
3274 : : /**
3275 : : * css_scan_tasks - iterate through all the tasks in a css
3276 : : * @css: the css to iterate tasks of
3277 : : * @test: optional test callback
3278 : : * @process: process callback
3279 : : * @data: data passed to @test and @process
3280 : : * @heap: optional pre-allocated heap used for task iteration
3281 : : *
3282 : : * Iterate through all the tasks in @css, calling @test for each, and if it
3283 : : * returns %true, call @process for it also.
3284 : : *
3285 : : * @test may be NULL, meaning always true (select all tasks), which
3286 : : * effectively duplicates css_task_iter_{start,next,end}() but does not
3287 : : * lock css_set_lock for the call to @process.
3288 : : *
3289 : : * It is guaranteed that @process will act on every task that is a member
3290 : : * of @css for the duration of this call. This function may or may not
3291 : : * call @process for tasks that exit or move to a different css during the
3292 : : * call, or are forked or move into the css during the call.
3293 : : *
3294 : : * Note that @test may be called with locks held, and may in some
3295 : : * situations be called multiple times for the same task, so it should be
3296 : : * cheap.
3297 : : *
3298 : : * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3299 : : * heap operations (and its "gt" member will be overwritten), else a
3300 : : * temporary heap will be used (allocation of which may cause this function
3301 : : * to fail).
3302 : : */
3303 : 0 : int css_scan_tasks(struct cgroup_subsys_state *css,
3304 : : bool (*test)(struct task_struct *, void *),
3305 : : void (*process)(struct task_struct *, void *),
3306 : : void *data, struct ptr_heap *heap)
3307 : : {
3308 : : int retval, i;
3309 : : struct css_task_iter it;
3310 : : struct task_struct *p, *dropped;
3311 : : /* Never dereference latest_task, since it's not refcounted */
3312 : : struct task_struct *latest_task = NULL;
3313 : : struct ptr_heap tmp_heap;
3314 : : struct timespec latest_time = { 0, 0 };
3315 : :
3316 [ # # ]: 0 : if (heap) {
3317 : : /* The caller supplied our heap and pre-allocated its memory */
3318 : 0 : heap->gt = &started_after;
3319 : : } else {
3320 : : /* We need to allocate our own heap memory */
3321 : : heap = &tmp_heap;
3322 : 0 : retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3323 [ # # ]: 0 : if (retval)
3324 : : /* cannot allocate the heap */
3325 : : return retval;
3326 : : }
3327 : :
3328 : : again:
3329 : : /*
3330 : : * Scan tasks in the css, using the @test callback to determine
3331 : : * which are of interest, and invoking @process callback on the
3332 : : * ones which need an update. Since we don't want to hold any
3333 : : * locks during the task updates, gather tasks to be processed in a
3334 : : * heap structure. The heap is sorted by descending task start
3335 : : * time. If the statically-sized heap fills up, we overflow tasks
3336 : : * that started later, and in future iterations only consider tasks
3337 : : * that started after the latest task in the previous pass. This
3338 : : * guarantees forward progress and that we don't miss any tasks.
3339 : : */
3340 : 0 : heap->size = 0;
3341 : 0 : css_task_iter_start(css, &it);
3342 [ # # ]: 0 : while ((p = css_task_iter_next(&it))) {
3343 : : /*
3344 : : * Only affect tasks that qualify per the caller's callback,
3345 : : * if one was provided
3346 : : */
3347 [ # # ][ # # ]: 0 : if (test && !test(p, data))
3348 : 0 : continue;
3349 : : /*
3350 : : * Only process tasks that started after the last task
3351 : : * we processed
3352 : : */
3353 [ # # ]: 0 : if (!started_after_time(p, &latest_time, latest_task))
3354 : 0 : continue;
3355 : 0 : dropped = heap_insert(heap, p);
3356 [ # # ]: 0 : if (dropped == NULL) {
3357 : : /*
3358 : : * The new task was inserted; the heap wasn't
3359 : : * previously full
3360 : : */
3361 : 0 : get_task_struct(p);
3362 [ # # ]: 0 : } else if (dropped != p) {
3363 : : /*
3364 : : * The new task was inserted, and pushed out a
3365 : : * different task
3366 : : */
3367 : 0 : get_task_struct(p);
3368 : : put_task_struct(dropped);
3369 : : }
3370 : : /*
3371 : : * Else the new task was newer than anything already in
3372 : : * the heap and wasn't inserted
3373 : : */
3374 : : }
3375 : 0 : css_task_iter_end(&it);
3376 : :
3377 [ # # ]: 0 : if (heap->size) {
3378 [ # # ]: 0 : for (i = 0; i < heap->size; i++) {
3379 : 0 : struct task_struct *q = heap->ptrs[i];
3380 [ # # ]: 0 : if (i == 0) {
3381 : 0 : latest_time = q->start_time;
3382 : : latest_task = q;
3383 : : }
3384 : : /* Process the task per the caller's callback */
3385 : 0 : process(q, data);
3386 : : put_task_struct(q);
3387 : : }
3388 : : /*
3389 : : * If we had to process any tasks at all, scan again
3390 : : * in case some of them were in the middle of forking
3391 : : * children that didn't get processed.
3392 : : * Not the most efficient way to do it, but it avoids
3393 : : * having to take callback_mutex in the fork path
3394 : : */
3395 : : goto again;
3396 : : }
3397 [ # # ]: 0 : if (heap == &tmp_heap)
3398 : 0 : heap_free(&tmp_heap);
3399 : : return 0;
3400 : : }
3401 : :
3402 : 0 : static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3403 : : {
3404 : : struct cgroup *new_cgroup = data;
3405 : :
3406 : 0 : mutex_lock(&cgroup_mutex);
3407 : 0 : cgroup_attach_task(new_cgroup, task, false);
3408 : 0 : mutex_unlock(&cgroup_mutex);
3409 : 0 : }
3410 : :
3411 : : /**
3412 : : * cgroup_transfer_tasks - move tasks from one cgroup to another
3413 : : * @to: cgroup to which the tasks will be moved
3414 : : * @from: cgroup in which the tasks currently reside
3415 : : */
3416 : 0 : int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3417 : : {
3418 : 0 : return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3419 : : to, NULL);
3420 : : }
3421 : :
3422 : : /*
3423 : : * Stuff for reading the 'tasks'/'procs' files.
3424 : : *
3425 : : * Reading this file can return large amounts of data if a cgroup has
3426 : : * *lots* of attached tasks. So it may need several calls to read(),
3427 : : * but we cannot guarantee that the information we produce is correct
3428 : : * unless we produce it entirely atomically.
3429 : : *
3430 : : */
3431 : :
3432 : : /* which pidlist file are we talking about? */
3433 : : enum cgroup_filetype {
3434 : : CGROUP_FILE_PROCS,
3435 : : CGROUP_FILE_TASKS,
3436 : : };
3437 : :
3438 : : /*
3439 : : * A pidlist is a list of pids that virtually represents the contents of one
3440 : : * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3441 : : * a pair (one each for procs, tasks) for each pid namespace that's relevant
3442 : : * to the cgroup.
3443 : : */
3444 : : struct cgroup_pidlist {
3445 : : /*
3446 : : * used to find which pidlist is wanted. doesn't change as long as
3447 : : * this particular list stays in the list.
3448 : : */
3449 : : struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3450 : : /* array of xids */
3451 : : pid_t *list;
3452 : : /* how many elements the above list has */
3453 : : int length;
3454 : : /* each of these stored in a list by its cgroup */
3455 : : struct list_head links;
3456 : : /* pointer to the cgroup we belong to, for list removal purposes */
3457 : : struct cgroup *owner;
3458 : : /* for delayed destruction */
3459 : : struct delayed_work destroy_dwork;
3460 : : };
3461 : :
3462 : : /*
3463 : : * The following two functions "fix" the issue where there are more pids
3464 : : * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3465 : : * TODO: replace with a kernel-wide solution to this problem
3466 : : */
3467 : : #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3468 : 0 : static void *pidlist_allocate(int count)
3469 : : {
3470 [ - + ]: 1 : if (PIDLIST_TOO_LARGE(count))
3471 : 0 : return vmalloc(count * sizeof(pid_t));
3472 : : else
3473 : 1 : return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3474 : : }
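 : : /*
 : :  * For a rough sense of scale, assuming 4 KiB pages and a 4-byte pid_t:
 : :  * PIDLIST_TOO_LARGE() only trips once the list needs more than 2048
 : :  * entries (2049 * 4 = 8196 > PAGE_SIZE * 2 = 8192), so only such
 : :  * large lists take the vmalloc() path.
 : :  */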
3475 : :
3476 : 0 : static void pidlist_free(void *p)
3477 : : {
3478 [ - + ]: 2 : if (is_vmalloc_addr(p))
3479 : 0 : vfree(p);
3480 : : else
3481 : 2 : kfree(p);
3482 : 2 : }
3483 : :
3484 : : /*
3485 : : * Used to destroy all pidlists still lingering and waiting for the destroy timer. None
3486 : : * should be left afterwards.
3487 : : */
3488 : 0 : static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3489 : : {
3490 : : struct cgroup_pidlist *l, *tmp_l;
3491 : :
3492 : 2 : mutex_lock(&cgrp->pidlist_mutex);
3493 [ - + ]: 4 : list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3494 : 0 : mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3495 : 2 : mutex_unlock(&cgrp->pidlist_mutex);
3496 : :
3497 : 2 : flush_workqueue(cgroup_pidlist_destroy_wq);
3498 [ - + ]: 2 : BUG_ON(!list_empty(&cgrp->pidlists));
3499 : 2 : }
3500 : :
3501 : 0 : static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3502 : : {
3503 : : struct delayed_work *dwork = to_delayed_work(work);
3504 : 1 : struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3505 : : destroy_dwork);
3506 : : struct cgroup_pidlist *tofree = NULL;
3507 : :
3508 : 1 : mutex_lock(&l->owner->pidlist_mutex);
3509 : :
3510 : : /*
3511 : : * Destroy iff we didn't get queued again. The state won't change
3512 : : * as destroy_dwork can only be queued while locked.
3513 : : */
3514 [ + - ]: 1 : if (!delayed_work_pending(dwork)) {
3515 : : list_del(&l->links);
3516 : 1 : pidlist_free(l->list);
3517 : : put_pid_ns(l->key.ns);
3518 : : tofree = l;
3519 : : }
3520 : :
3521 : 1 : mutex_unlock(&l->owner->pidlist_mutex);
3522 : 1 : kfree(tofree);
3523 : 1 : }
3524 : :
3525 : : /*
3526 : : * pidlist_uniq - given a sorted list of pids, strip out all duplicate entries
3527 : : * Returns the number of unique elements.
3528 : : */
3529 : 0 : static int pidlist_uniq(pid_t *list, int length)
3530 : : {
3531 : : int src, dest = 1;
3532 : :
3533 : : /*
3534 : : * we presume the 0th element is unique, so src starts at 1. trivial
3535 : : * edge cases first; no work needs to be done for either
3536 : : */
3537 [ # # ]: 0 : if (length == 0 || length == 1)
3538 : : return length;
3539 : : /* src and dest walk down the list; dest counts unique elements */
3540 [ # # ]: 0 : for (src = 1; src < length; src++) {
3541 : : /* find next unique element */
3542 [ # # ]: 0 : while (list[src] == list[src-1]) {
3543 : 0 : src++;
3544 [ # # ]: 0 : if (src == length)
3545 : : goto after;
3546 : : }
3547 : : /* dest always points to where the next unique element goes */
3548 : 0 : list[dest] = list[src];
3549 : 0 : dest++;
3550 : : }
3551 : : after:
3552 : : return dest;
3553 : : }
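 : : /*
 : :  * For example, on an already-sorted array { 3, 3, 5, 7, 7, 7 } of
 : :  * length 6, pidlist_uniq() compacts the array in place so that it
 : :  * begins with { 3, 5, 7 } and returns 3. The caller must sort the
 : :  * list first, as only adjacent duplicates are detected.
 : :  */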
3554 : :
3555 : : /*
3556 : : * The two pid files - tasks and cgroup.procs - guaranteed that the result
3557 : : * is sorted, which forced this whole pidlist fiasco. As pid order is
3558 : : * different per namespace, each namespace needs a differently sorted list,
3559 : : * making it impossible to use, for example, a single rbtree of member tasks
3560 : : * sorted by task pointer. As pidlists can be fairly large, allocating one
3561 : : * per open file is dangerous, so cgroup had to implement a shared pool of
3562 : : * pidlists keyed by cgroup and namespace.
3563 : : *
3564 : : * All this extra complexity was caused by the original implementation
3565 : : * committing to an entirely unnecessary property. In the long term, we
3566 : : * want to do away with it. Explicitly scramble sort order if
3567 : : * sane_behavior so that no such expectation exists in the new interface.
3568 : : *
3569 : : * Scrambling is done by swapping every two consecutive bits, which is
3570 : : * non-identity one-to-one mapping which disturbs sort order sufficiently.
3571 : : */
3572 : : static pid_t pid_fry(pid_t pid)
3573 : : {
3574 : 0 : unsigned a = pid & 0x55555555;
3575 : 0 : unsigned b = pid & 0xAAAAAAAA;
3576 : :
3577 : 0 : return (a << 1) | (b >> 1);
3578 : : }
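 : : /*
 : :  * For example, pid 5 (0b0101) has a = 0b0101 and b = 0b0000, so
 : :  * pid_fry(5) = 0b1010 = 10; frying 10 yields 5 again. The mapping
 : :  * is its own inverse, which also shows it is one-to-one.
 : :  */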
3579 : :
3580 : 103 : static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3581 : : {
3582 [ - + ][ - + ]: 206 : if (cgroup_sane_behavior(cgrp))
[ - + ][ - + ]
3583 : : return pid_fry(pid);
3584 : : else
3585 : : return pid;
3586 : : }
3587 : :
3588 : 0 : static int cmppid(const void *a, const void *b)
3589 : : {
3590 : 913 : return *(pid_t *)a - *(pid_t *)b;
3591 : : }
3592 : :
3593 : 0 : static int fried_cmppid(const void *a, const void *b)
3594 : : {
3595 : 0 : return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3596 : : }
3597 : :
3598 : 0 : static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3599 : : enum cgroup_filetype type)
3600 : : {
3601 : : struct cgroup_pidlist *l;
3602 : : /* don't need task_nsproxy() if we're looking at ourself */
3603 : 2 : struct pid_namespace *ns = task_active_pid_ns(current);
3604 : :
3605 : : lockdep_assert_held(&cgrp->pidlist_mutex);
3606 : :
3607 [ + + ]: 2 : list_for_each_entry(l, &cgrp->pidlists, links)
3608 [ + - ][ - + ]: 1 : if (l->key.type == type && l->key.ns == ns)
3609 : : return l;
3610 : : return NULL;
3611 : : }
3612 : :
3613 : : /*
3614 : : * find the appropriate pidlist for our purpose (given procs vs tasks)
3615 : : * returns with the lock on that pidlist already held, and takes care
3616 : : * of the use count, or returns NULL with no locks held if we're out of
3617 : : * memory.
3618 : : */
3619 : 0 : static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3620 : : enum cgroup_filetype type)
3621 : : {
3622 : : struct cgroup_pidlist *l;
3623 : :
3624 : : lockdep_assert_held(&cgrp->pidlist_mutex);
3625 : :
3626 : 1 : l = cgroup_pidlist_find(cgrp, type);
3627 [ + - ]: 1 : if (l)
3628 : : return l;
3629 : :
3630 : : /* entry not found; create a new one */
3631 : : l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3632 [ + - ]: 1 : if (!l)
3633 : : return l;
3634 : :
3635 : 1 : INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3636 : 1 : l->key.type = type;
3637 : : /* don't need task_nsproxy() if we're looking at ourself */
3638 : 1 : l->key.ns = get_pid_ns(task_active_pid_ns(current));
3639 : 1 : l->owner = cgrp;
3640 : 1 : list_add(&l->links, &cgrp->pidlists);
3641 : 1 : return l;
3642 : : }
3643 : :
3644 : : /*
3645 : : * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3646 : : */
3647 : 0 : static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3648 : : struct cgroup_pidlist **lp)
3649 : : {
3650 : : pid_t *array;
3651 : : int length;
3652 : : int pid, n = 0; /* used for populating the array */
3653 : : struct css_task_iter it;
3654 : : struct task_struct *tsk;
3655 : : struct cgroup_pidlist *l;
3656 : :
3657 : : lockdep_assert_held(&cgrp->pidlist_mutex);
3658 : :
3659 : : /*
3660 : : * If cgroup gets more users after we read count, we won't have
3661 : : * enough space - tough. This race is indistinguishable to the
3662 : : * caller from the case that the additional cgroup users didn't
3663 : : * show up until sometime later on.
3664 : : */
3665 : 1 : length = cgroup_task_count(cgrp);
3666 : 1 : array = pidlist_allocate(length);
3667 [ + - ]: 1 : if (!array)
3668 : : return -ENOMEM;
3669 : : /* now, populate the array */
3670 : 1 : css_task_iter_start(&cgrp->dummy_css, &it);
3671 [ + + ]: 99 : while ((tsk = css_task_iter_next(&it))) {
3672 [ + - ]: 97 : if (unlikely(n == length))
3673 : : break;
3674 : : /* get tgid or pid for procs or tasks file respectively */
3675 [ - + ]: 97 : if (type == CGROUP_FILE_PROCS)
3676 : : pid = task_tgid_vnr(tsk);
3677 : : else
3678 : : pid = task_pid_vnr(tsk);
3679 [ + - ]: 97 : if (pid > 0) /* make sure to only use valid results */
3680 : 98 : array[n++] = pid;
3681 : : }
3682 : 1 : css_task_iter_end(&it);
3683 : : length = n;
3684 : : /* now sort & (if procs) strip out duplicates */
3685 [ - + ]: 1 : if (cgroup_sane_behavior(cgrp))
3686 : 0 : sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3687 : : else
3688 : 1 : sort(array, length, sizeof(pid_t), cmppid, NULL);
3689 [ - + ]: 1 : if (type == CGROUP_FILE_PROCS)
3690 : 0 : length = pidlist_uniq(array, length);
3691 : :
3692 : 1 : l = cgroup_pidlist_find_create(cgrp, type);
3693 [ - + ]: 1 : if (!l) {
3694 : 0 : mutex_unlock(&cgrp->pidlist_mutex);
3695 : 0 : pidlist_free(array);
3696 : 0 : return -ENOMEM;
3697 : : }
3698 : :
3699 : : /* store array, freeing old if necessary */
3700 : 1 : pidlist_free(l->list);
3701 : 1 : l->list = array;
3702 : 1 : l->length = length;
3703 : 1 : *lp = l;
3704 : 1 : return 0;
3705 : : }
3706 : :
3707 : : /**
3708 : : * cgroupstats_build - build and fill cgroupstats
3709 : : * @stats: cgroupstats to fill information into
3710 : : * @dentry: A dentry entry belonging to the cgroup for which stats have
3711 : : * been requested.
3712 : : *
3713 : : * Build and fill cgroupstats so that taskstats can export it to user
3714 : : * space.
3715 : : */
3716 : 0 : int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3717 : : {
3718 : : int ret = -EINVAL;
3719 : : struct cgroup *cgrp;
3720 : : struct css_task_iter it;
3721 : : struct task_struct *tsk;
3722 : :
3723 : : /*
3724 : : * Validate dentry by checking the superblock operations,
3725 : : * and make sure it's a directory.
3726 : : */
3727 [ # # ][ # # ]: 0 : if (dentry->d_sb->s_op != &cgroup_ops ||
3728 : 0 : !S_ISDIR(dentry->d_inode->i_mode))
3729 : : goto err;
3730 : :
3731 : : ret = 0;
3732 : 0 : cgrp = dentry->d_fsdata;
3733 : :
3734 : 0 : css_task_iter_start(&cgrp->dummy_css, &it);
3735 [ # # ]: 0 : while ((tsk = css_task_iter_next(&it))) {
3736 [ # # # # : 0 : switch (tsk->state) {
# ]
3737 : : case TASK_RUNNING:
3738 : 0 : stats->nr_running++;
3739 : 0 : break;
3740 : : case TASK_INTERRUPTIBLE:
3741 : 0 : stats->nr_sleeping++;
3742 : 0 : break;
3743 : : case TASK_UNINTERRUPTIBLE:
3744 : 0 : stats->nr_uninterruptible++;
3745 : 0 : break;
3746 : : case TASK_STOPPED:
3747 : 0 : stats->nr_stopped++;
3748 : 0 : break;
3749 : : default:
3750 : : if (delayacct_is_task_waiting_on_io(tsk))
3751 : : stats->nr_io_wait++;
3752 : : break;
3753 : : }
3754 : : }
3755 : 0 : css_task_iter_end(&it);
3756 : :
3757 : : err:
3758 : 0 : return ret;
3759 : : }
3760 : :
3761 : :
3762 : : /*
3763 : : * seq_file methods for the tasks/procs files. The seq_file position is the
3764 : : * next pid to display; the seq_file iterator is a pointer to the pid
3765 : : * in the pidlist's ->list array.
3766 : : */
3767 : :
3768 : 0 : static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3769 : : {
3770 : : /*
3771 : : * Initially we receive a position value that corresponds to
3772 : : * one more than the last pid shown (or 0 on the first call or
3773 : : * after a seek to the start). Use a binary-search to find the
3774 : : * next pid to display, if any
3775 : : */
3776 : 2 : struct cgroup_open_file *of = s->private;
3777 : 2 : struct cgroup *cgrp = seq_css(s)->cgroup;
3778 : : struct cgroup_pidlist *l;
3779 : 2 : enum cgroup_filetype type = seq_cft(s)->private;
3780 : 2 : int index = 0, pid = *pos;
3781 : : int *iter, ret;
3782 : :
3783 : 2 : mutex_lock(&cgrp->pidlist_mutex);
3784 : :
3785 : : /*
3786 : : * !NULL @of->priv indicates that this isn't the first start()
3787 : : * after open. If the matching pidlist is around, we can use that.
3788 : : * Look for it. Note that @of->priv can't be used directly. It
3789 : : * could already have been destroyed.
3790 : : */
3791 [ + + ]: 2 : if (of->priv)
3792 : 1 : of->priv = cgroup_pidlist_find(cgrp, type);
3793 : :
3794 : : /*
3795 : : * Either this is the first start() after open or the matching
3796 : : * pidlist has been destroyed in between. Create a new one.
3797 : : */
3798 [ + + ]: 2 : if (!of->priv) {
3799 : 1 : ret = pidlist_array_load(cgrp, type,
3800 : 1 : (struct cgroup_pidlist **)&of->priv);
3801 [ - + ]: 1 : if (ret)
3802 : 0 : return ERR_PTR(ret);
3803 : : }
3804 : 2 : l = of->priv;
3805 : :
3806 [ + + ]: 2 : if (pid) {
3807 : 1 : int end = l->length;
3808 : :
3809 [ + + ]: 7 : while (index < end) {
3810 : 6 : int mid = (index + end) / 2;
3811 [ + - ]: 6 : if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3812 : : index = mid;
3813 : : break;
3814 [ + - ]: 6 : } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3815 : 6 : index = mid + 1;
3816 : : else
3817 : : end = mid;
3818 : : }
3819 : : }
3820 : : /* If we're off the end of the array, we're done */
3821 [ # # ]: 2 : if (index >= l->length)
3822 : : return NULL;
3823 : : /* Update the abstract position to be the actual pid that we found */
3824 : 1 : iter = l->list + index;
3825 : 2 : *pos = cgroup_pid_fry(cgrp, *iter);
3826 : 1 : return iter;
3827 : : }
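 : : /*
 : :  * For instance, on a hierarchy without sane_behavior (so pids are not
 : :  * scrambled), with l->list = { 2, 4, 9 } a restart at *pos == 4 lands
 : :  * on index 1 and iteration resumes from pid 4, while *pos == 5
 : :  * settles on index 2 and resumes from pid 9.
 : :  */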
3828 : :
3829 : 0 : static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3830 : : {
3831 : 2 : struct cgroup_open_file *of = s->private;
3832 : 2 : struct cgroup_pidlist *l = of->priv;
3833 : :
3834 [ + - ]: 2 : if (l)
3835 : 2 : mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3836 : : CGROUP_PIDLIST_DESTROY_DELAY);
3837 : 2 : mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3838 : 2 : }
3839 : :
3840 : 0 : static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3841 : : {
3842 : 97 : struct cgroup_open_file *of = s->private;
3843 : 97 : struct cgroup_pidlist *l = of->priv;
3844 : : pid_t *p = v;
3845 : 97 : pid_t *end = l->list + l->length;
3846 : : /*
3847 : : * Advance to the next pid in the array. If this goes off the
3848 : : * end, we're done
3849 : : */
3850 : 97 : p++;
3851 [ + ]: 97 : if (p >= end) {
3852 : : return NULL;
3853 : : } else {
3854 : 192 : *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3855 : 96 : return p;
3856 : : }
3857 : : }
3858 : :
3859 : 0 : static int cgroup_pidlist_show(struct seq_file *s, void *v)
3860 : : {
3861 : 97 : return seq_printf(s, "%d\n", *(int *)v);
3862 : : }
3863 : :
3864 : : /*
3865 : : * seq_operations functions for iterating on pidlists through seq_file -
3866 : : * independent of whether it's tasks or procs
3867 : : */
3868 : : static const struct seq_operations cgroup_pidlist_seq_operations = {
3869 : : .start = cgroup_pidlist_start,
3870 : : .stop = cgroup_pidlist_stop,
3871 : : .next = cgroup_pidlist_next,
3872 : : .show = cgroup_pidlist_show,
3873 : : };
3874 : :
3875 : 0 : static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3876 : : struct cftype *cft)
3877 : : {
3878 : 4 : return notify_on_release(css->cgroup);
3879 : : }
3880 : :
3881 : 0 : static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3882 : : struct cftype *cft, u64 val)
3883 : : {
3884 : 2 : clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3885 [ + + ]: 2 : if (val)
3886 : 1 : set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3887 : : else
3888 : 1 : clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3889 : 2 : return 0;
3890 : : }
3891 : :
3892 : : /*
3893 : : * When dput() is called asynchronously, if umount has been done and
3894 : : * then deactivate_super() in cgroup_free_fn() kills the superblock,
3895 : : * there's a small window where vfs will see the root dentry with non-zero
3896 : : * refcnt and trigger BUG().
3897 : : *
3898 : : * That's why we hold a reference before dput() and drop it right after.
3899 : : */
3900 : : static void cgroup_dput(struct cgroup *cgrp)
3901 : : {
3902 : : struct super_block *sb = cgrp->root->sb;
3903 : :
3904 : : atomic_inc(&sb->s_active);
3905 : : dput(cgrp->dentry);
3906 : : deactivate_super(sb);
3907 : : }
3908 : :
3909 : 0 : static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3910 : : struct cftype *cft)
3911 : : {
3912 : 0 : return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3913 : : }
3914 : :
3915 : 0 : static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3916 : : struct cftype *cft, u64 val)
3917 : : {
3918 [ # # ]: 0 : if (val)
3919 : 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3920 : : else
3921 : 0 : clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3922 : 0 : return 0;
3923 : : }
3924 : :
3925 : : static struct cftype cgroup_base_files[] = {
3926 : : {
3927 : : .name = "cgroup.procs",
3928 : : .seq_start = cgroup_pidlist_start,
3929 : : .seq_next = cgroup_pidlist_next,
3930 : : .seq_stop = cgroup_pidlist_stop,
3931 : : .seq_show = cgroup_pidlist_show,
3932 : : .private = CGROUP_FILE_PROCS,
3933 : : .write_u64 = cgroup_procs_write,
3934 : : .mode = S_IRUGO | S_IWUSR,
3935 : : },
3936 : : {
3937 : : .name = "cgroup.clone_children",
3938 : : .flags = CFTYPE_INSANE,
3939 : : .read_u64 = cgroup_clone_children_read,
3940 : : .write_u64 = cgroup_clone_children_write,
3941 : : },
3942 : : {
3943 : : .name = "cgroup.sane_behavior",
3944 : : .flags = CFTYPE_ONLY_ON_ROOT,
3945 : : .seq_show = cgroup_sane_behavior_show,
3946 : : },
3947 : :
3948 : : /*
3949 : : * Historical crazy stuff. These don't have "cgroup." prefix and
3950 : : * don't exist if sane_behavior. If you're depending on these, be
3951 : : * prepared to be burned.
3952 : : */
3953 : : {
3954 : : .name = "tasks",
3955 : : .flags = CFTYPE_INSANE, /* use "procs" instead */
3956 : : .seq_start = cgroup_pidlist_start,
3957 : : .seq_next = cgroup_pidlist_next,
3958 : : .seq_stop = cgroup_pidlist_stop,
3959 : : .seq_show = cgroup_pidlist_show,
3960 : : .private = CGROUP_FILE_TASKS,
3961 : : .write_u64 = cgroup_tasks_write,
3962 : : .mode = S_IRUGO | S_IWUSR,
3963 : : },
3964 : : {
3965 : : .name = "notify_on_release",
3966 : : .flags = CFTYPE_INSANE,
3967 : : .read_u64 = cgroup_read_notify_on_release,
3968 : : .write_u64 = cgroup_write_notify_on_release,
3969 : : },
3970 : : {
3971 : : .name = "release_agent",
3972 : : .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3973 : : .seq_show = cgroup_release_agent_show,
3974 : : .write_string = cgroup_release_agent_write,
3975 : : .max_write_len = PATH_MAX,
3976 : : },
3977 : : { } /* terminate */
3978 : : };
3979 : :
3980 : : /**
3981 : : * cgroup_populate_dir - create subsys files in a cgroup directory
3982 : : * @cgrp: target cgroup
3983 : : * @subsys_mask: mask of the subsystem ids whose files should be added
3984 : : *
3985 : : * On failure, no file is added.
3986 : : */
3987 : : static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3988 : : {
3989 : : struct cgroup_subsys *ss;
3990 : : int i, ret = 0;
3991 : :
3992 : : /* process cftsets of each subsystem */
3993 : : for_each_subsys(ss, i) {
3994 : : struct cftype_set *set;
3995 : :
3996 : : if (!test_bit(i, &subsys_mask))
3997 : : continue;
3998 : :
3999 : : list_for_each_entry(set, &ss->cftsets, node) {
4000 : : ret = cgroup_addrm_files(cgrp, set->cfts, true);
4001 : : if (ret < 0)
4002 : : goto err;
4003 : : }
4004 : : }
4005 : : return 0;
4006 : : err:
4007 : : cgroup_clear_dir(cgrp, subsys_mask);
4008 : : return ret;
4009 : : }
4010 : :
4011 : : /*
4012 : : * css destruction is four-stage process.
4013 : : *
4014 : : * 1. Destruction starts. Killing of the percpu_ref is initiated.
4015 : : * Implemented in kill_css().
4016 : : *
4017 : : * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4018 : : * and thus css_tryget() is guaranteed to fail, the css can be offlined
4019 : : * by invoking offline_css(). After offlining, the base ref is put.
4020 : : * Implemented in css_killed_work_fn().
4021 : : *
4022 : : * 3. When the percpu_ref reaches zero, the only possible remaining
4023 : : * accessors are inside RCU read sections. css_release() schedules the
4024 : : * RCU callback.
4025 : : *
4026 : : * 4. After the grace period, the css can be freed. Implemented in
4027 : : * css_free_work_fn().
4028 : : *
4029 : : * It is actually hairier because both steps 2 and 4 require process context
4030 : : * and thus involve punting to css->destroy_work, adding two additional
4031 : : * steps to the already complex sequence.
4032 : : */
4033 : : static void css_free_work_fn(struct work_struct *work)
4034 : : {
4035 : : struct cgroup_subsys_state *css =
4036 : : container_of(work, struct cgroup_subsys_state, destroy_work);
4037 : : struct cgroup *cgrp = css->cgroup;
4038 : :
4039 : : if (css->parent)
4040 : : css_put(css->parent);
4041 : :
4042 : : css->ss->css_free(css);
4043 : : cgroup_dput(cgrp);
4044 : : }
4045 : :
4046 : : static void css_free_rcu_fn(struct rcu_head *rcu_head)
4047 : : {
4048 : : struct cgroup_subsys_state *css =
4049 : : container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4050 : :
4051 : : /*
4052 : : * css holds an extra ref to @cgrp->dentry which is put on the last
4053 : : * css_put(). dput() requires process context which we don't have.
4054 : : */
4055 : : INIT_WORK(&css->destroy_work, css_free_work_fn);
4056 : : queue_work(cgroup_destroy_wq, &css->destroy_work);
4057 : : }
4058 : :
4059 : : static void css_release(struct percpu_ref *ref)
4060 : : {
4061 : : struct cgroup_subsys_state *css =
4062 : : container_of(ref, struct cgroup_subsys_state, refcnt);
4063 : :
4064 : : rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
4065 : : call_rcu(&css->rcu_head, css_free_rcu_fn);
4066 : : }
4067 : :
4068 : 0 : static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4069 : : struct cgroup *cgrp)
4070 : : {
4071 : 0 : css->cgroup = cgrp;
4072 : 0 : css->ss = ss;
4073 : 0 : css->flags = 0;
4074 : :
4075 [ # # ]: 0 : if (cgrp->parent)
4076 : 0 : css->parent = cgroup_css(cgrp->parent, ss);
4077 : : else
4078 : 0 : css->flags |= CSS_ROOT;
4079 : :
4080 [ # # ]: 0 : BUG_ON(cgroup_css(cgrp, ss));
4081 : 0 : }
4082 : :
4083 : : /* invoke ->css_online() on a new CSS and mark it online if successful */
4084 : 0 : static int online_css(struct cgroup_subsys_state *css)
4085 : : {
4086 : 0 : struct cgroup_subsys *ss = css->ss;
4087 : : int ret = 0;
4088 : :
4089 : : lockdep_assert_held(&cgroup_mutex);
4090 : :
4091 [ # # ]: 0 : if (ss->css_online)
4092 : 0 : ret = ss->css_online(css);
4093 [ # # ]: 0 : if (!ret) {
4094 : 0 : css->flags |= CSS_ONLINE;
4095 : 0 : css->cgroup->nr_css++;
4096 : 0 : rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4097 : : }
4098 : 0 : return ret;
4099 : : }
4100 : :
4101 : : /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4102 : 0 : static void offline_css(struct cgroup_subsys_state *css)
4103 : : {
4104 : 0 : struct cgroup_subsys *ss = css->ss;
4105 : :
4106 : : lockdep_assert_held(&cgroup_mutex);
4107 : :
4108 [ # # ]: 0 : if (!(css->flags & CSS_ONLINE))
4109 : 0 : return;
4110 : :
4111 [ # # ]: 0 : if (ss->css_offline)
4112 : 0 : ss->css_offline(css);
4113 : :
4114 : 0 : css->flags &= ~CSS_ONLINE;
4115 : 0 : css->cgroup->nr_css--;
4116 : 0 : RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4117 : : }
4118 : :
4119 : : /**
4120 : : * create_css - create a cgroup_subsys_state
4121 : : * @cgrp: the cgroup new css will be associated with
4122 : : * @ss: the subsys of new css
4123 : : *
4124 : : * Create a new css associated with @cgrp - @ss pair. On success, the new
4125 : : * css is online and installed in @cgrp with all interface files created.
4126 : : * Returns 0 on success, -errno on failure.
4127 : : */
4128 : : static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4129 : : {
4130 : : struct cgroup *parent = cgrp->parent;
4131 : : struct cgroup_subsys_state *css;
4132 : : int err;
4133 : :
4134 : : lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4135 : : lockdep_assert_held(&cgroup_mutex);
4136 : :
4137 : : css = ss->css_alloc(cgroup_css(parent, ss));
4138 : : if (IS_ERR(css))
4139 : : return PTR_ERR(css);
4140 : :
4141 : : err = percpu_ref_init(&css->refcnt, css_release);
4142 : : if (err)
4143 : : goto err_free_css;
4144 : :
4145 : : init_css(css, ss, cgrp);
4146 : :
4147 : : err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
4148 : : if (err)
4149 : : goto err_free_percpu_ref;
4150 : :
4151 : : err = online_css(css);
4152 : : if (err)
4153 : : goto err_clear_dir;
4154 : :
4155 : : dget(cgrp->dentry);
4156 : : css_get(css->parent);
4157 : :
4158 : : if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4159 : : parent->parent) {
4160 : : pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4161 : : current->comm, current->pid, ss->name);
4162 : : if (!strcmp(ss->name, "memory"))
4163 : : pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4164 : : ss->warned_broken_hierarchy = true;
4165 : : }
4166 : :
4167 : : return 0;
4168 : :
4169 : : err_clear_dir:
4170 : : cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4171 : : err_free_percpu_ref:
4172 : : percpu_ref_cancel_init(&css->refcnt);
4173 : : err_free_css:
4174 : : ss->css_free(css);
4175 : : return err;
4176 : : }
4177 : :
4178 : : /*
4179 : : * cgroup_create - create a cgroup
4180 : : * @parent: cgroup that will be parent of the new cgroup
4181 : : * @dentry: dentry of the new cgroup
4182 : : * @mode: mode to set on new inode
4183 : : *
4184 : : * Must be called with the mutex on the parent inode held
4185 : : */
4186 : 0 : static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4187 : : umode_t mode)
4188 : : {
4189 : : struct cgroup *cgrp;
4190 : : struct cgroup_name *name;
4191 : 2 : struct cgroupfs_root *root = parent->root;
4192 : : int ssid, err;
4193 : : struct cgroup_subsys *ss;
4194 : 2 : struct super_block *sb = root->sb;
4195 : :
4196 : : /* allocate the cgroup and its ID, 0 is reserved for the root */
4197 : : cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4198 [ + - ]: 2 : if (!cgrp)
4199 : : return -ENOMEM;
4200 : :
4201 : 2 : name = cgroup_alloc_name(dentry);
4202 [ + - ]: 2 : if (!name) {
4203 : : err = -ENOMEM;
4204 : : goto err_free_cgrp;
4205 : : }
4206 : 2 : rcu_assign_pointer(cgrp->name, name);
4207 : :
4208 : : /*
4209 : : * Only live parents can have children. Note that the liveness
4210 : : * check isn't strictly necessary because cgroup_mkdir() and
4211 : : * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4212 : : * anyway so that locking is contained inside cgroup proper and we
4213 : : * don't get nasty surprises if we ever grow another caller.
4214 : : */
4215 [ + - ]: 2 : if (!cgroup_lock_live_group(parent)) {
4216 : : err = -ENODEV;
4217 : : goto err_free_name;
4218 : : }
4219 : :
4220 : : /*
4221 : : * Temporarily set the pointer to NULL, so idr_find() won't return
4222 : : * a half-baked cgroup.
4223 : : */
4224 : 2 : cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4225 [ + - ]: 2 : if (cgrp->id < 0) {
4226 : : err = -ENOMEM;
4227 : : goto err_unlock;
4228 : : }
4229 : :
4230 : : /* Grab a reference on the superblock so the hierarchy doesn't
4231 : : * get deleted on unmount if there are child cgroups. This
4232 : : * can be done outside cgroup_mutex, since the sb can't
4233 : : * disappear while someone has an open control file on the
4234 : : * fs */
4235 : 2 : atomic_inc(&sb->s_active);
4236 : :
4237 : 2 : init_cgroup_housekeeping(cgrp);
4238 : :
4239 : 2 : dentry->d_fsdata = cgrp;
4240 : 2 : cgrp->dentry = dentry;
4241 : :
4242 : 2 : cgrp->parent = parent;
4243 : 2 : cgrp->dummy_css.parent = &parent->dummy_css;
4244 : 2 : cgrp->root = parent->root;
4245 : :
4246 [ + + ]: 2 : if (notify_on_release(parent))
4247 : 1 : set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4248 : :
4249 [ - + ]: 2 : if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4250 : 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4251 : :
4252 : : /*
4253 : : * Create directory. cgroup_create_file() returns with the new
4254 : : * directory locked on success so that it can be populated without
4255 : : * dropping cgroup_mutex.
4256 : : */
4257 : 2 : err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4258 [ + - ]: 2 : if (err < 0)
4259 : : goto err_free_id;
4260 : : lockdep_assert_held(&dentry->d_inode->i_mutex);
4261 : :
4262 : 2 : cgrp->serial_nr = cgroup_serial_nr_next++;
4263 : :
4264 : : /* allocation complete, commit to creation */
4265 : 2 : list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4266 : 2 : root->number_of_cgroups++;
4267 : :
4268 : : /* hold a ref to the parent's dentry */
4269 : 2 : dget(parent->dentry);
4270 : :
4271 : : /*
4272 : : * @cgrp is now fully operational. If something fails after this
4273 : : * point, it'll be released via the normal destruction path.
4274 : : */
4275 : 2 : idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4276 : :
4277 : 2 : err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4278 [ + - ]: 2 : if (err)
4279 : : goto err_destroy;
4280 : :
4281 : : /* let's create and online css's */
4282 : : for_each_subsys(ss, ssid) {
4283 : : if (root->subsys_mask & (1 << ssid)) {
4284 : : err = create_css(cgrp, ss);
4285 : : if (err)
4286 : : goto err_destroy;
4287 : : }
4288 : : }
4289 : :
4290 : 2 : mutex_unlock(&cgroup_mutex);
4291 : 2 : mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4292 : :
4293 : 2 : return 0;
4294 : :
4295 : : err_free_id:
4296 : 0 : idr_remove(&root->cgroup_idr, cgrp->id);
4297 : : /* Release the reference count that we took on the superblock */
4298 : 0 : deactivate_super(sb);
4299 : : err_unlock:
4300 : 0 : mutex_unlock(&cgroup_mutex);
4301 : : err_free_name:
4302 : 0 : kfree(rcu_dereference_raw(cgrp->name));
4303 : : err_free_cgrp:
4304 : 0 : kfree(cgrp);
4305 : 0 : return err;
4306 : :
4307 : : err_destroy:
4308 : 0 : cgroup_destroy_locked(cgrp);
4309 : 0 : mutex_unlock(&cgroup_mutex);
4310 : 0 : mutex_unlock(&dentry->d_inode->i_mutex);
4311 : 0 : return err;
4312 : : }
4313 : :
4314 : 0 : static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4315 : : {
4316 : 2 : struct cgroup *c_parent = dentry->d_parent->d_fsdata;
4317 : :
4318 : : /* the vfs holds inode->i_mutex already */
4319 : 2 : return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4320 : : }
4321 : :
4322 : : /*
4323 : : * This is called when the refcnt of a css is confirmed to be killed.
4324 : : * css_tryget() is now guaranteed to fail.
4325 : : */
4326 : : static void css_killed_work_fn(struct work_struct *work)
4327 : : {
4328 : : struct cgroup_subsys_state *css =
4329 : : container_of(work, struct cgroup_subsys_state, destroy_work);
4330 : : struct cgroup *cgrp = css->cgroup;
4331 : :
4332 : : mutex_lock(&cgroup_mutex);
4333 : :
4334 : : /*
4335 : : * css_tryget() is guaranteed to fail now. Tell subsystems to
4336 : : * initiate destruction.
4337 : : */
4338 : : offline_css(css);
4339 : :
4340 : : /*
4341 : : * If @cgrp is marked dead, it's waiting for refs of all css's to
4342 : : * be disabled before proceeding to the second phase of cgroup
4343 : : * destruction. If we are the last one, kick it off.
4344 : : */
4345 : : if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4346 : : cgroup_destroy_css_killed(cgrp);
4347 : :
4348 : : mutex_unlock(&cgroup_mutex);
4349 : :
4350 : : /*
4351 : : * Put the css refs from kill_css(). Each css holds an extra
4352 : : * reference to the cgroup's dentry and cgroup removal proceeds
4353 : : * regardless of css refs. On the last put of each css, whenever
4354 : : * that may be, the extra dentry ref is put so that dentry
4355 : : * destruction happens only after all css's are released.
4356 : : */
4357 : : css_put(css);
4358 : : }
4359 : :
4360 : : /* css kill confirmation processing requires process context, bounce */
4361 : : static void css_killed_ref_fn(struct percpu_ref *ref)
4362 : : {
4363 : : struct cgroup_subsys_state *css =
4364 : : container_of(ref, struct cgroup_subsys_state, refcnt);
4365 : :
4366 : : INIT_WORK(&css->destroy_work, css_killed_work_fn);
4367 : : queue_work(cgroup_destroy_wq, &css->destroy_work);
4368 : : }
4369 : :
4370 : : /**
4371 : : * kill_css - destroy a css
4372 : : * @css: css to destroy
4373 : : *
4374 : : * This function initiates destruction of @css by removing cgroup interface
4375 : : * files and putting its base reference. ->css_offline() will be invoked
4376 : : * asynchronously once css_tryget() is guaranteed to fail and when the
4377 : : * reference count reaches zero, @css will be released.
4378 : : */
4379 : : static void kill_css(struct cgroup_subsys_state *css)
4380 : : {
4381 : : cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4382 : :
4383 : : /*
4384 : : * Killing would put the base ref, but we need to keep it alive
4385 : : * until after ->css_offline().
4386 : : */
4387 : : css_get(css);
4388 : :
4389 : : /*
4390 : : * cgroup core guarantees that, by the time ->css_offline() is
4391 : : * invoked, no new css reference will be given out via
4392 : : * css_tryget(). We can't simply call percpu_ref_kill() and
4393 : : * proceed to offlining css's because percpu_ref_kill() doesn't
4394 : : * guarantee that the ref is seen as killed on all CPUs on return.
4395 : : *
4396 : : * Use percpu_ref_kill_and_confirm() to get notifications as each
4397 : : * css is confirmed to be seen as killed on all CPUs.
4398 : : */
4399 : : percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4400 : : }
4401 : :
4402 : : /**
4403 : : * cgroup_destroy_locked - the first stage of cgroup destruction
4404 : : * @cgrp: cgroup to be destroyed
4405 : : *
4406 : : * css's make use of percpu refcnts whose killing latency shouldn't be
4407 : : * exposed to userland and are RCU protected. Also, cgroup core needs to
4408 : : * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4409 : : * invoked. To satisfy all the requirements, destruction is implemented in
4410 : : * the following two steps.
4411 : : *
4412 : : * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4413 : : * userland visible parts and start killing the percpu refcnts of
4414 : : * css's. Set up so that the next stage will be kicked off once all
4415 : : * the percpu refcnts are confirmed to be killed.
4416 : : *
4417 : : * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4418 : : * rest of destruction. Once all cgroup references are gone, the
4419 : : * cgroup is RCU-freed.
4420 : : *
4421 : : * This function implements s1. After this step, @cgrp is gone as far as
4422 : : * the userland is concerned and a new cgroup with the same name may be
4423 : : * created. As cgroup doesn't care about the names internally, this
4424 : : * doesn't cause any problem.
4425 : : */
4426 : 0 : static int cgroup_destroy_locked(struct cgroup *cgrp)
4427 : : __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4428 : : {
4429 : 2 : struct dentry *d = cgrp->dentry;
4430 : : struct cgroup_subsys_state *css;
4431 : : struct cgroup *child;
4432 : : bool empty;
4433 : : int ssid;
4434 : :
4435 : : lockdep_assert_held(&d->d_inode->i_mutex);
4436 : : lockdep_assert_held(&cgroup_mutex);
4437 : :
4438 : : /*
4439 : : * css_set_lock synchronizes access to ->cset_links and prevents
4440 : : * @cgrp from being removed while __put_css_set() is in progress.
4441 : : */
4442 : 2 : read_lock(&css_set_lock);
4443 : 2 : empty = list_empty(&cgrp->cset_links);
4444 : : read_unlock(&css_set_lock);
4445 [ + - ]: 2 : if (!empty)
4446 : : return -EBUSY;
4447 : :
4448 : : /*
4449 : : * Make sure there are no live children. We can't test ->children
4450 : : * emptiness as dead children linger on it while being destroyed;
4451 : : * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4452 : : */
4453 : : empty = true;
4454 : : rcu_read_lock();
4455 [ - + ]: 2 : list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4456 : : empty = cgroup_is_dead(child);
4457 [ # # ]: 0 : if (!empty)
4458 : : break;
4459 : : }
4460 : : rcu_read_unlock();
4461 [ + - ]: 2 : if (!empty)
4462 : : return -EBUSY;
4463 : :
4464 : : /*
4465 : : * Initiate massacre of all css's. cgroup_destroy_css_killed()
4466 : : * will be invoked to perform the rest of destruction once the
4467 : : * percpu refs of all css's are confirmed to be killed.
4468 : : */
4469 : : for_each_css(css, ssid, cgrp)
4470 : : kill_css(css);
4471 : :
4472 : : /*
4473 : : * Mark @cgrp dead. This prevents further task migration and child
4474 : : * creation by disabling cgroup_lock_live_group(). Note that
4475 : : * CGRP_DEAD assertion is depended upon by css_next_child() to
4476 : : * resume iteration after dropping RCU read lock. See
4477 : : * css_next_child() for details.
4478 : : */
4479 : 2 : set_bit(CGRP_DEAD, &cgrp->flags);
4480 : :
4481 : : /* CGRP_DEAD is set, remove from ->release_list for the last time */
4482 : 2 : raw_spin_lock(&release_list_lock);
4483 [ - + ]: 2 : if (!list_empty(&cgrp->release_list))
4484 : : list_del_init(&cgrp->release_list);
4485 : : raw_spin_unlock(&release_list_lock);
4486 : :
4487 : : /*
4488 : : * If @cgrp has css's attached, the second stage of cgroup
4489 : : * destruction is kicked off from css_killed_work_fn() after the
4490 : : * refs of all attached css's are killed. If @cgrp doesn't have
4491 : : * any css, we kick it off here.
4492 : : */
4493 [ + - ]: 2 : if (!cgrp->nr_css)
4494 : 2 : cgroup_destroy_css_killed(cgrp);
4495 : :
4496 : : /*
4497 : : * Clear the base files and remove @cgrp directory. The removal
4498 : : * puts the base ref but we aren't quite done with @cgrp yet, so
4499 : : * hold onto it.
4500 : : */
4501 : 2 : cgroup_addrm_files(cgrp, cgroup_base_files, false);
4502 : : dget(d);
4503 : 2 : cgroup_d_remove_dir(d);
4504 : :
4505 : 2 : return 0;
4506 : : };
4507 : :
4508 : : /**
4509 : : * cgroup_destroy_css_killed - the second step of cgroup destruction
4510 : : * @cgrp: the cgroup being destroyed
4511 : : *
4512 : : * This function is invoked from a work item for a cgroup which is being
4513 : : * destroyed after all css's are offlined and performs the rest of
4514 : : * destruction. This is the second step of destruction described in the
4515 : : * comment above cgroup_destroy_locked().
4516 : : */
4517 : 0 : static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4518 : : {
4519 : 2 : struct cgroup *parent = cgrp->parent;
4520 : 2 : struct dentry *d = cgrp->dentry;
4521 : :
4522 : : lockdep_assert_held(&cgroup_mutex);
4523 : :
4524 : : /* delete this cgroup from parent->children */
4525 : : list_del_rcu(&cgrp->sibling);
4526 : :
4527 : 2 : dput(d);
4528 : :
4529 : 2 : set_bit(CGRP_RELEASABLE, &parent->flags);
4530 : 2 : check_for_release(parent);
4531 : 2 : }
4532 : :
4533 : 0 : static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4534 : : {
4535 : : int ret;
4536 : :
4537 : 2 : mutex_lock(&cgroup_mutex);
4538 : 2 : ret = cgroup_destroy_locked(dentry->d_fsdata);
4539 : 2 : mutex_unlock(&cgroup_mutex);
4540 : :
4541 : 2 : return ret;
4542 : : }
4543 : :
4544 : : static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4545 : : {
4546 : 0 : INIT_LIST_HEAD(&ss->cftsets);
4547 : :
4548 : : /*
4549 : : * base_cftset is embedded in subsys itself, no need to worry about
4550 : : * deregistration.
4551 : : */
4552 [ # # ]: 0 : if (ss->base_cftypes) {
4553 : : struct cftype *cft;
4554 : :
4555 [ # # ]: 0 : for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4556 : 0 : cft->ss = ss;
4557 : :
4558 : 0 : ss->base_cftset.cfts = ss->base_cftypes;
4559 : 0 : list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4560 : : }
4561 : : }
4562 : :
4563 : : static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4564 : : {
4565 : : struct cgroup_subsys_state *css;
4566 : :
4567 : : printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4568 : :
4569 : : mutex_lock(&cgroup_mutex);
4570 : :
4571 : : /* init base cftset */
4572 : : cgroup_init_cftsets(ss);
4573 : :
4574 : : /* Create the top cgroup state for this subsystem */
4575 : : ss->root = &cgroup_dummy_root;
4576 : : css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4577 : : /* We don't handle early failures gracefully */
4578 : : BUG_ON(IS_ERR(css));
4579 : : init_css(css, ss, cgroup_dummy_top);
4580 : :
4581 : : /* Update the init_css_set to contain a subsys
4582 : : * pointer to this state - since the subsystem is
4583 : : * newly registered, all tasks and hence the
4584 : : * init_css_set is in the subsystem's top cgroup. */
4585 : : init_css_set.subsys[ss->subsys_id] = css;
4586 : :
4587 : : need_forkexit_callback |= ss->fork || ss->exit;
4588 : :
4589 : : /* At system boot, before all subsystems have been
4590 : : * registered, no tasks have been forked, so we don't
4591 : : * need to invoke fork callbacks here. */
4592 : : BUG_ON(!list_empty(&init_task.tasks));
4593 : :
4594 : : BUG_ON(online_css(css));
4595 : :
4596 : : mutex_unlock(&cgroup_mutex);
4597 : :
4598 : : /* this function shouldn't be used with modular subsystems, since they
4599 : : * need to register a subsys_id, among other things */
4600 : : BUG_ON(ss->module);
4601 : : }
4602 : :
4603 : : /**
4604 : : * cgroup_load_subsys: load and register a modular subsystem at runtime
4605 : : * @ss: the subsystem to load
4606 : : *
4607 : : * This function should be called in a modular subsystem's initcall. If the
4608 : : * subsystem is built as a module, it will be assigned a new subsys_id and set
4609 : : * up for use. If the subsystem is built-in anyway, work is delegated to the
4610 : : * simpler cgroup_init_subsys.
4611 : : */
4612 : 0 : int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4613 : : {
4614 : : struct cgroup_subsys_state *css;
4615 : : int i, ret;
4616 : : struct hlist_node *tmp;
4617 : : struct css_set *cset;
4618 : : unsigned long key;
4619 : :
4620 : : /* check name and function validity */
4621 [ # # ][ # # ]: 0 : if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
[ # # ]
4622 [ # # ]: 0 : ss->css_alloc == NULL || ss->css_free == NULL)
4623 : : return -EINVAL;
4624 : :
4625 : : /*
4626 : : * we don't support callbacks in modular subsystems. this check is
4627 : : * before the ss->module check for consistency; a subsystem that could
4628 : : * be a module should still have no callbacks even if the user isn't
4629 : : * compiling it as one.
4630 : : */
4631 [ # # ][ # # ]: 0 : if (ss->fork || ss->exit)
4632 : : return -EINVAL;
4633 : :
4634 : : /*
4635 : : * an optionally modular subsystem is built-in: we want to do nothing,
4636 : : * since cgroup_init_subsys will have already taken care of it.
4637 : : */
4638 [ # # ]: 0 : if (ss->module == NULL) {
4639 : : /* a sanity check */
4640 [ # # ]: 0 : BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4641 : : return 0;
4642 : : }
4643 : :
4644 : : /* init base cftset */
4645 : : cgroup_init_cftsets(ss);
4646 : :
4647 : 0 : mutex_lock(&cgroup_mutex);
4648 : 0 : mutex_lock(&cgroup_root_mutex);
4649 : 0 : cgroup_subsys[ss->subsys_id] = ss;
4650 : :
4651 : : /*
4652 : : * no ss->css_alloc seems to need anything important in the ss
4653 : : * struct, so this can happen first (i.e. before the dummy root
4654 : : * attachment).
4655 : : */
4656 : 0 : css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4657 [ # # ]: 0 : if (IS_ERR(css)) {
4658 : : /* failure case - need to deassign the cgroup_subsys[] slot. */
4659 : 0 : cgroup_subsys[ss->subsys_id] = NULL;
4660 : 0 : mutex_unlock(&cgroup_root_mutex);
4661 : 0 : mutex_unlock(&cgroup_mutex);
4662 : 0 : return PTR_ERR(css);
4663 : : }
4664 : :
4665 : 0 : ss->root = &cgroup_dummy_root;
4666 : :
4667 : : /* our new subsystem will be attached to the dummy hierarchy. */
4668 : 0 : init_css(css, ss, cgroup_dummy_top);
4669 : :
4670 : : /*
4671 : : * Now we need to entangle the css into the existing css_sets. unlike
4672 : : * in cgroup_init_subsys, there are now multiple css_sets, so each one
4673 : : * will need a new pointer to it; done by iterating the css_set_table.
4674 : : * furthermore, modifying the existing css_sets will corrupt the hash
4675 : : * table state, so each changed css_set will need its hash recomputed.
4676 : : * this is all done under the css_set_lock.
4677 : : */
4678 : 0 : write_lock(&css_set_lock);
4679 [ # # ][ # # ]: 0 : hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
[ # # ][ # # ]
4680 : : /* skip entries that we already rehashed */
4681 [ # # ]: 0 : if (cset->subsys[ss->subsys_id])
4682 : 0 : continue;
4683 : : /* remove existing entry */
4684 : : hash_del(&cset->hlist);
4685 : : /* set new value */
4686 : 0 : cset->subsys[ss->subsys_id] = css;
4687 : : /* recompute hash and restore entry */
4688 : : key = css_set_hash(cset->subsys);
4689 : 0 : hash_add(css_set_table, &cset->hlist, key);
4690 : : }
4691 : : write_unlock(&css_set_lock);
4692 : :
4693 : 0 : ret = online_css(css);
4694 [ # # ]: 0 : if (ret) {
4695 : 0 : ss->css_free(css);
4696 : : goto err_unload;
4697 : : }
4698 : :
4699 : : /* success! */
4700 : 0 : mutex_unlock(&cgroup_root_mutex);
4701 : 0 : mutex_unlock(&cgroup_mutex);
4702 : 0 : return 0;
4703 : :
4704 : : err_unload:
4705 : 0 : mutex_unlock(&cgroup_root_mutex);
4706 : 0 : mutex_unlock(&cgroup_mutex);
4707 : : /* @ss can't be mounted here as try_module_get() would fail */
4708 : 0 : cgroup_unload_subsys(ss);
4709 : 0 : return ret;
4710 : : }
4711 : : EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4712 : :
4713 : : /**
4714 : : * cgroup_unload_subsys: unload a modular subsystem
4715 : : * @ss: the subsystem to unload
4716 : : *
4717 : : * This function should be called in a modular subsystem's exitcall. When this
4718 : : * function is invoked, the refcount on the subsystem's module will be 0, so
4719 : : * the subsystem will not be attached to any hierarchy.
4720 : : */
4721 : 0 : void cgroup_unload_subsys(struct cgroup_subsys *ss)
4722 : : {
4723 : : struct cgrp_cset_link *link;
4724 : : struct cgroup_subsys_state *css;
4725 : :
4726 [ # # ]: 0 : BUG_ON(ss->module == NULL);
4727 : :
4728 : : /*
4729 : : * we shouldn't be called if the subsystem is in use, and the use of
4730 : : * try_module_get() in rebind_subsystems() should ensure that it
4731 : : * doesn't start being used while we're killing it off.
4732 : : */
4733 [ # # ]: 0 : BUG_ON(ss->root != &cgroup_dummy_root);
4734 : :
4735 : 0 : mutex_lock(&cgroup_mutex);
4736 : 0 : mutex_lock(&cgroup_root_mutex);
4737 : :
4738 : : css = cgroup_css(cgroup_dummy_top, ss);
4739 [ # # ]: 0 : if (css)
4740 : 0 : offline_css(css);
4741 : :
4742 : : /* deassign the subsys_id */
4743 : 0 : cgroup_subsys[ss->subsys_id] = NULL;
4744 : :
4745 : : /*
4746 : : * disentangle the css from all css_sets attached to the dummy
4747 : : * top. as in loading, we need to pay our respects to the hashtable
4748 : : * gods.
4749 : : */
4750 : 0 : write_lock(&css_set_lock);
4751 [ # # ]: 0 : list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4752 : 0 : struct css_set *cset = link->cset;
4753 : : unsigned long key;
4754 : :
4755 : : hash_del(&cset->hlist);
4756 : 0 : cset->subsys[ss->subsys_id] = NULL;
4757 : : key = css_set_hash(cset->subsys);
4758 : 0 : hash_add(css_set_table, &cset->hlist, key);
4759 : : }
4760 : : write_unlock(&css_set_lock);
4761 : :
4762 : : /*
4763 : : * remove subsystem's css from the cgroup_dummy_top and free it -
4764 : : * need to free before marking as null because ss->css_free needs
4765 : : * the cgrp->subsys pointer to find their state.
4766 : : */
4767 [ # # ]: 0 : if (css)
4768 : 0 : ss->css_free(css);
4769 : 0 : RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4770 : :
4771 : 0 : mutex_unlock(&cgroup_root_mutex);
4772 : 0 : mutex_unlock(&cgroup_mutex);
4773 : 0 : }
4774 : : EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
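
A minimal sketch of how a modular subsystem might pair the two calls above,
assuming a hypothetical "foo" subsystem with a pre-assigned foo_subsys_id and
a trivial css. It is not part of this file and only illustrates the
load/unload interface (compare the debug subsystem at the end of this file
for the css_alloc/css_free shape):

	#include <linux/cgroup.h>
	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	static struct cgroup_subsys_state *
	foo_css_alloc(struct cgroup_subsys_state *parent_css)
	{
		struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

		return css ?: ERR_PTR(-ENOMEM);
	}

	static void foo_css_free(struct cgroup_subsys_state *css)
	{
		kfree(css);
	}

	static struct cgroup_subsys foo_subsys = {
		.name		= "foo",
		.css_alloc	= foo_css_alloc,
		.css_free	= foo_css_free,
		.subsys_id	= foo_subsys_id,	/* hypothetical id */
		.module		= THIS_MODULE,
	};

	static int __init foo_init(void)
	{
		/* registers the subsystem and attaches its css to the dummy root */
		return cgroup_load_subsys(&foo_subsys);
	}

	static void __exit foo_exit(void)
	{
		/* the module refcount is already zero here, see the comment above */
		cgroup_unload_subsys(&foo_subsys);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");

Note that, as the checks at the top of cgroup_load_subsys() require, such a
subsystem must not define ->fork or ->exit callbacks.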
4775 : :
4776 : : /**
4777 : : * cgroup_init_early - cgroup initialization at system boot
4778 : : *
4779 : : * Initialize cgroups at system boot, and initialize any
4780 : : * subsystems that request early init.
4781 : : */
4782 : 0 : int __init cgroup_init_early(void)
4783 : : {
4784 : : struct cgroup_subsys *ss;
4785 : : int i;
4786 : :
4787 : 0 : atomic_set(&init_css_set.refcount, 1);
4788 : : INIT_LIST_HEAD(&init_css_set.cgrp_links);
4789 : : INIT_LIST_HEAD(&init_css_set.tasks);
4790 : : INIT_HLIST_NODE(&init_css_set.hlist);
4791 : 0 : css_set_count = 1;
4792 : 0 : init_cgroup_root(&cgroup_dummy_root);
4793 : 0 : cgroup_root_count = 1;
4794 : 0 : RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4795 : :
4796 : 0 : init_cgrp_cset_link.cset = &init_css_set;
4797 : 0 : init_cgrp_cset_link.cgrp = cgroup_dummy_top;
4798 : : list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
4799 : : list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
4800 : :
4801 : : /* at bootup time, we don't worry about modular subsystems */
4802 : : for_each_builtin_subsys(ss, i) {
4803 : : BUG_ON(!ss->name);
4804 : : BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4805 : : BUG_ON(!ss->css_alloc);
4806 : : BUG_ON(!ss->css_free);
4807 : : if (ss->subsys_id != i) {
4808 : : printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4809 : : ss->name, ss->subsys_id);
4810 : : BUG();
4811 : : }
4812 : :
4813 : : if (ss->early_init)
4814 : : cgroup_init_subsys(ss);
4815 : : }
4816 : 0 : return 0;
4817 : : }
4818 : :
4819 : : /**
4820 : : * cgroup_init - cgroup initialization
4821 : : *
4822 : : * Register cgroup filesystem and /proc file, and initialize
4823 : : * any subsystems that didn't request early init.
4824 : : */
4825 : 0 : int __init cgroup_init(void)
4826 : : {
4827 : : struct cgroup_subsys *ss;
4828 : : unsigned long key;
4829 : : int i, err;
4830 : :
4831 : 0 : err = bdi_init(&cgroup_backing_dev_info);
4832 [ # # ]: 0 : if (err)
4833 : : return err;
4834 : :
4835 : : for_each_builtin_subsys(ss, i) {
4836 : : if (!ss->early_init)
4837 : : cgroup_init_subsys(ss);
4838 : : }
4839 : :
4840 : : /* allocate id for the dummy hierarchy */
4841 : 0 : mutex_lock(&cgroup_mutex);
4842 : 0 : mutex_lock(&cgroup_root_mutex);
4843 : :
4844 : : /* Add init_css_set to the hash table */
4845 : : key = css_set_hash(init_css_set.subsys);
4846 : : hash_add(css_set_table, &init_css_set.hlist, key);
4847 : :
4848 [ # # ]: 0 : BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4849 : :
4850 : 0 : err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
4851 : : 0, 1, GFP_KERNEL);
4852 [ # # ]: 0 : BUG_ON(err < 0);
4853 : :
4854 : 0 : mutex_unlock(&cgroup_root_mutex);
4855 : 0 : mutex_unlock(&cgroup_mutex);
4856 : :
4857 : 0 : cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4858 [ # # ]: 0 : if (!cgroup_kobj) {
4859 : : err = -ENOMEM;
4860 : : goto out;
4861 : : }
4862 : :
4863 : 0 : err = register_filesystem(&cgroup_fs_type);
4864 [ # # ]: 0 : if (err < 0) {
4865 : 0 : kobject_put(cgroup_kobj);
4866 : 0 : goto out;
4867 : : }
4868 : :
4869 : : proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4870 : :
4871 : : out:
4872 [ # # ]: 0 : if (err)
4873 : 0 : bdi_destroy(&cgroup_backing_dev_info);
4874 : :
4875 : 0 : return err;
4876 : : }
4877 : :
4878 : 0 : static int __init cgroup_wq_init(void)
4879 : : {
4880 : : /*
4881 : : * There isn't much point in executing destruction path in
4882 : : * parallel. Good chunk is serialized with cgroup_mutex anyway.
4883 : : * Use 1 for @max_active.
4884 : : *
4885 : : * We would prefer to do this in cgroup_init() above, but that
4886 : : * is called before init_workqueues(): so leave this until after.
4887 : : */
4888 : 0 : cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4889 [ # # ]: 0 : BUG_ON(!cgroup_destroy_wq);
4890 : :
4891 : : /*
4892 : : * Used to destroy pidlists and separate to serve as flush domain.
4893 : : * Cap @max_active to 1 too.
4894 : : */
4895 : 0 : cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4896 : : 0, 1);
4897 [ # # ]: 0 : BUG_ON(!cgroup_pidlist_destroy_wq);
4898 : :
4899 : 0 : return 0;
4900 : : }
4901 : : core_initcall(cgroup_wq_init);
4902 : :
4903 : : /*
4904 : : * proc_cgroup_show()
4905 : : * - Print task's cgroup paths into seq_file, one line for each hierarchy
4906 : : * - Used for /proc/<pid>/cgroup.
4907 : : * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4908 : : * doesn't really matter if tsk->cgroup changes after we read it,
4909 : : * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4910 : : * anyway. No need to check that tsk->cgroup != NULL, thanks to
4911 : : * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
4912 : : * cgroup to top_cgroup.
4913 : : */
4914 : :
4915 : : /* TODO: Use a proper seq_file iterator */
4916 : 0 : int proc_cgroup_show(struct seq_file *m, void *v)
4917 : : {
4918 : : struct pid *pid;
4919 : : struct task_struct *tsk;
4920 : : char *buf;
4921 : : int retval;
4922 : : struct cgroupfs_root *root;
4923 : :
4924 : : retval = -ENOMEM;
4925 : : buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4926 [ + - ]: 2 : if (!buf)
4927 : : goto out;
4928 : :
4929 : : retval = -ESRCH;
4930 : 2 : pid = m->private;
4931 : 2 : tsk = get_pid_task(pid, PIDTYPE_PID);
4932 [ + - ]: 2 : if (!tsk)
4933 : : goto out_free;
4934 : :
4935 : : retval = 0;
4936 : :
4937 : 2 : mutex_lock(&cgroup_mutex);
4938 : :
4939 [ - + ]: 2 : for_each_active_root(root) {
4940 : : struct cgroup_subsys *ss;
4941 : : struct cgroup *cgrp;
4942 : : int ssid, count = 0;
4943 : :
4944 : 0 : seq_printf(m, "%d:", root->hierarchy_id);
4945 : : for_each_subsys(ss, ssid)
4946 : : if (root->subsys_mask & (1 << ssid))
4947 : : seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4948 [ # # ]: 0 : if (strlen(root->name))
4949 : 0 : seq_printf(m, "%sname=%s", count ? "," : "",
4950 : 0 : root->name);
4951 : 0 : seq_putc(m, ':');
4952 : 0 : cgrp = task_cgroup_from_root(tsk, root);
4953 : 0 : retval = cgroup_path(cgrp, buf, PAGE_SIZE);
4954 [ # # ]: 0 : if (retval < 0)
4955 : : goto out_unlock;
4956 : 0 : seq_puts(m, buf);
4957 : 0 : seq_putc(m, '\n');
4958 : : }
4959 : :
4960 : : out_unlock:
4961 : 2 : mutex_unlock(&cgroup_mutex);
4962 : : put_task_struct(tsk);
4963 : : out_free:
4964 : 2 : kfree(buf);
4965 : : out:
4966 : 2 : return retval;
4967 : : }
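
Each line emitted above has the form

	hierarchy-ID:comma-separated-subsystems[,name=<name>]:cgroup-path

so a task attached to, say, a hierarchy with id 3 carrying the cpu and
cpuacct controllers would produce a line like the following (values are
illustrative only):

	3:cpu,cpuacct:/user/1000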
4968 : :
4969 : : /* Display information about each subsystem and each hierarchy */
4970 : 0 : static int proc_cgroupstats_show(struct seq_file *m, void *v)
4971 : : {
4972 : : struct cgroup_subsys *ss;
4973 : : int i;
4974 : :
4975 : 56 : seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4976 : : /*
4977 : : * ideally we don't want subsystems moving around while we do this.
4978 : : * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4979 : : * subsys/hierarchy state.
4980 : : */
4981 : 56 : mutex_lock(&cgroup_mutex);
4982 : :
4983 : : for_each_subsys(ss, i)
4984 : : seq_printf(m, "%s\t%d\t%d\t%d\n",
4985 : : ss->name, ss->root->hierarchy_id,
4986 : : ss->root->number_of_cgroups, !ss->disabled);
4987 : :
4988 : 56 : mutex_unlock(&cgroup_mutex);
4989 : 56 : return 0;
4990 : : }
4991 : :
4992 : 0 : static int cgroupstats_open(struct inode *inode, struct file *file)
4993 : : {
4994 : 56 : return single_open(file, proc_cgroupstats_show, NULL);
4995 : : }
4996 : :
4997 : : static const struct file_operations proc_cgroupstats_operations = {
4998 : : .open = cgroupstats_open,
4999 : : .read = seq_read,
5000 : : .llseek = seq_lseek,
5001 : : .release = single_release,
5002 : : };
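
Reading /proc/cgroups therefore yields one tab-separated row per registered
subsystem under the header printed above, for example (values illustrative
only):

	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	1	1	1
	cpu	2	4	1
	debug	0	1	1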
5003 : :
5004 : : /**
5005 : : * cgroup_fork - attach newly forked task to its parent's cgroup.
5006 : : * @child: pointer to task_struct of the newly forked child task.
5007 : : *
5008 : : * Description: A task inherits its parent's cgroup at fork().
5009 : : *
5010 : : * A pointer to the shared css_set was automatically copied in
5011 : : * fork.c by dup_task_struct(). However, we ignore that copy, since
5012 : : * it was not made under the protection of RCU or cgroup_mutex, so
5013 : : * might no longer be a valid css_set pointer. cgroup_attach_task() might
5014 : : * have already changed current->cgroups, allowing the previously
5015 : : * referenced css_set to be removed and freed.
5016 : : *
5017 : : * At the point that cgroup_fork() is called, 'current' is the parent
5018 : : * task, and the passed argument 'child' points to the child task.
5019 : : */
5020 : 0 : void cgroup_fork(struct task_struct *child)
5021 : : {
5022 : 1104220 : task_lock(current);
5023 : 1104223 : get_css_set(task_css_set(current));
5024 : 1104228 : child->cgroups = current->cgroups;
5025 : 1104228 : task_unlock(current);
5026 : 1104222 : INIT_LIST_HEAD(&child->cg_list);
5027 : 1104222 : }
5028 : :
5029 : : /**
5030 : : * cgroup_post_fork - called on a new task after adding it to the task list
5031 : : * @child: the task in question
5032 : : *
5033 : : * Adds the task to the list running through its css_set if necessary and
5034 : : * calls the subsystem fork() callbacks. Has to be after the task is
5035 : : * visible on the task list in case we race with the first call to
5036 : : * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5037 : : * list.
5038 : : */
5039 : 0 : void cgroup_post_fork(struct task_struct *child)
5040 : : {
5041 : : struct cgroup_subsys *ss;
5042 : : int i;
5043 : :
5044 : : /*
5045 : : * use_task_css_set_links is set to 1 before we walk the tasklist
5046 : : * under the tasklist_lock and we read it here after we added the child
5047 : : * to the tasklist under the tasklist_lock as well. If the child wasn't
5048 : : * yet in the tasklist when we walked through it from
5049 : : * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
5050 : : * should be visible now due to the paired locking and barriers implied
5051 : : * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
5052 : : * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
5053 : : * lock on fork.
5054 : : */
5055 [ + + ]: 1104223 : if (use_task_css_set_links) {
5056 : 12953 : write_lock(&css_set_lock);
5057 : : task_lock(child);
5058 [ + - ]: 12953 : if (list_empty(&child->cg_list))
5059 : 12953 : list_add(&child->cg_list, &task_css_set(child)->tasks);
5060 : : task_unlock(child);
5061 : : write_unlock(&css_set_lock);
5062 : : }
5063 : :
5064 : : /*
5065 : : * Call ss->fork(). This must happen after @child is linked on
5066 : : * css_set; otherwise, @child might change state between ->fork()
5067 : : * and addition to css_set.
5068 : : */
5069 : : if (need_forkexit_callback) {
5070 : : /*
5071 : : * fork/exit callbacks are supported only for builtin
5072 : : * subsystems, and the builtin section of the subsys
5073 : : * array is immutable, so we don't need to lock the
5074 : : * subsys array here. On the other hand, modular section
5075 : : * of the array can be freed at module unload, so we
5076 : : * can't touch that.
5077 : : */
5078 : : for_each_builtin_subsys(ss, i)
5079 : : if (ss->fork)
5080 : : ss->fork(child);
5081 : : }
5082 : 1104223 : }
5083 : :
5084 : : /**
5085 : : * cgroup_exit - detach cgroup from exiting task
5086 : : * @tsk: pointer to task_struct of exiting process
5087 : : * @run_callbacks: run exit callbacks?
5088 : : *
5089 : : * Description: Detach cgroup from @tsk and release it.
5090 : : *
5091 : : * Note that cgroups marked notify_on_release force every task in
5092 : : * them to take the global cgroup_mutex when exiting.
5093 : : * This could impact scaling on very large systems. Be reluctant to
5094 : : * use notify_on_release cgroups where very high task exit scaling
5095 : : * is required on large systems.
5096 : : *
5097 : : * the_top_cgroup_hack:
5098 : : *
5099 : : * Set the exiting task's cgroup to the root cgroup (top_cgroup).
5100 : : *
5101 : : * We call cgroup_exit() while the task is still competent to
5102 : : * handle notify_on_release(), then leave the task attached to the
5103 : : * root cgroup in each hierarchy for the remainder of its exit.
5104 : : *
5105 : : * To do this properly, we would increment the reference count on
5106 : : * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5107 : : * code we would add a second cgroup function call, to drop that
5108 : : * reference. This would just create an unnecessary hot spot on
5109 : : * the top_cgroup reference count, to no avail.
5110 : : *
5111 : : * Normally, holding a reference to a cgroup without bumping its
5112 : : * count is unsafe. The cgroup could go away, or someone could
5113 : : * attach us to a different cgroup, decrementing the count on
5114 : : * the first cgroup that we never incremented. But in this case,
5115 : : * top_cgroup isn't going away, and either the task has PF_EXITING set,
5116 : : * which wards off any cgroup_attach_task() attempts, or the task is a
5117 : : * failed fork, never visible to cgroup_attach_task().
5118 : : */
5119 : 0 : void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5120 : : {
5121 : : struct cgroup_subsys *ss;
5122 : : struct css_set *cset;
5123 : : int i;
5124 : :
5125 : : /*
5126 : : * Unlink from the css_set task list if necessary.
5127 : : * Optimistically check cg_list before taking
5128 : : * css_set_lock
5129 : : */
5130 [ + + ]: 1104218 : if (!list_empty(&tsk->cg_list)) {
5131 : 12965 : write_lock(&css_set_lock);
5132 [ + - ]: 12966 : if (!list_empty(&tsk->cg_list))
5133 : : list_del_init(&tsk->cg_list);
5134 : : write_unlock(&css_set_lock);
5135 : : }
5136 : :
5137 : : /* Reassign the task to the init_css_set. */
5138 : : task_lock(tsk);
5139 : : cset = task_css_set(tsk);
5140 : 1104207 : RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5141 : :
5142 : : if (run_callbacks && need_forkexit_callback) {
5143 : : /*
5144 : : * fork/exit callbacks are supported only for builtin
5145 : : * subsystems, see cgroup_post_fork() for details.
5146 : : */
5147 : : for_each_builtin_subsys(ss, i) {
5148 : : if (ss->exit) {
5149 : : struct cgroup_subsys_state *old_css = cset->subsys[i];
5150 : : struct cgroup_subsys_state *css = task_css(tsk, i);
5151 : :
5152 : : ss->exit(css, old_css, tsk);
5153 : : }
5154 : : }
5155 : : }
5156 : : task_unlock(tsk);
5157 : :
5158 : : put_css_set_taskexit(cset);
5159 : 1104233 : }
5160 : :
5161 : 0 : static void check_for_release(struct cgroup *cgrp)
5162 : : {
5163 [ + - ][ - + ]: 2 : if (cgroup_is_releasable(cgrp) &&
5164 [ # # ]: 0 : list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
5165 : : /*
5166 : : * Control Group is currently removable. If it's not
5167 : : * already queued for a userspace notification, queue
5168 : : * it now.
5169 : : */
5170 : : int need_schedule_work = 0;
5171 : :
5172 : 0 : raw_spin_lock(&release_list_lock);
5173 [ # # ][ # # ]: 0 : if (!cgroup_is_dead(cgrp) &&
5174 : 0 : list_empty(&cgrp->release_list)) {
5175 : : list_add(&cgrp->release_list, &release_list);
5176 : : need_schedule_work = 1;
5177 : : }
5178 : : raw_spin_unlock(&release_list_lock);
5179 [ # # ]: 0 : if (need_schedule_work)
5180 : : schedule_work(&release_agent_work);
5181 : : }
5182 : 2 : }
5183 : :
5184 : : /*
5185 : : * Notify userspace when a cgroup is released, by running the
5186 : : * configured release agent with the name of the cgroup (path
5187 : : * relative to the root of cgroup file system) as the argument.
5188 : : *
5189 : : * Most likely, this user command will try to rmdir this cgroup.
5190 : : *
5191 : : * This races with the possibility that some other task will be
5192 : : * attached to this cgroup before it is removed, or that some other
5193 : : * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
5194 : : * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5195 : : * unused, and this cgroup will be reprieved from its death sentence,
5196 : : * to continue to serve a useful existence. Next time it's released,
5197 : : * we will get notified again, if it still has 'notify_on_release' set.
5198 : : *
5199 : : * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5200 : : * means only wait until the task is successfully execve()'d. The
5201 : : * separate release agent task is forked by call_usermodehelper(),
5202 : : * then control in this thread returns here, without waiting for the
5203 : : * release agent task. We don't bother to wait because the caller of
5204 : : * this routine has no use for the exit status of the release agent
5205 : : * task, so no sense holding our caller up for that.
5206 : : */
5207 : 0 : static void cgroup_release_agent(struct work_struct *work)
5208 : : {
5209 [ # # ]: 0 : BUG_ON(work != &release_agent_work);
5210 : 0 : mutex_lock(&cgroup_mutex);
5211 : 0 : raw_spin_lock(&release_list_lock);
5212 [ # # ]: 0 : while (!list_empty(&release_list)) {
5213 : : char *argv[3], *envp[3];
5214 : : int i;
5215 : : char *pathbuf = NULL, *agentbuf = NULL;
5216 : 0 : struct cgroup *cgrp = list_entry(release_list.next,
5217 : : struct cgroup,
5218 : : release_list);
5219 : 0 : list_del_init(&cgrp->release_list);
5220 : : raw_spin_unlock(&release_list_lock);
5221 : : pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
5222 [ # # ]: 0 : if (!pathbuf)
5223 : : goto continue_free;
5224 [ # # ]: 0 : if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
5225 : : goto continue_free;
5226 : 0 : agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5227 [ # # ]: 0 : if (!agentbuf)
5228 : : goto continue_free;
5229 : :
5230 : : i = 0;
5231 : 0 : argv[i++] = agentbuf;
5232 : 0 : argv[i++] = pathbuf;
5233 : 0 : argv[i] = NULL;
5234 : :
5235 : : i = 0;
5236 : : /* minimal command environment */
5237 : 0 : envp[i++] = "HOME=/";
5238 : 0 : envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5239 : 0 : envp[i] = NULL;
5240 : :
5241 : : /* Drop the lock while we invoke the usermode helper,
5242 : : * since the exec could involve hitting disk and hence
5243 : : * be a slow process */
5244 : 0 : mutex_unlock(&cgroup_mutex);
5245 : 0 : call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5246 : 0 : mutex_lock(&cgroup_mutex);
5247 : : continue_free:
5248 : 0 : kfree(pathbuf);
5249 : 0 : kfree(agentbuf);
5250 : 0 : raw_spin_lock(&release_list_lock);
5251 : : }
5252 : : raw_spin_unlock(&release_list_lock);
5253 : 0 : mutex_unlock(&cgroup_mutex);
5254 : 0 : }
5255 : :
5256 : 0 : static int __init cgroup_disable(char *str)
5257 : : {
5258 : : struct cgroup_subsys *ss;
5259 : : char *token;
5260 : : int i;
5261 : :
5262 [ # # ]: 0 : while ((token = strsep(&str, ",")) != NULL) {
5263 : : if (!*token)
5264 : : continue;
5265 : :
5266 : : /*
5267 : : * cgroup_disable, being at boot time, can't know about
5268 : : * module subsystems, so we don't worry about them.
5269 : : */
5270 : : for_each_builtin_subsys(ss, i) {
5271 : : if (!strcmp(token, ss->name)) {
5272 : : ss->disabled = 1;
5273 : : printk(KERN_INFO "Disabling %s control group"
5274 : : " subsystem\n", ss->name);
5275 : : break;
5276 : : }
5277 : : }
5278 : : }
5279 : 0 : return 1;
5280 : : }
5281 : : __setup("cgroup_disable=", cgroup_disable);
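
For example, booting with "cgroup_disable=memory" (or a comma-separated list
such as "cgroup_disable=memory,blkio") marks those built-in subsystems as
disabled before any hierarchy can be mounted; the names here are
illustrative, and only subsystems actually compiled into the kernel are
recognised.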
5282 : :
5283 : : /**
5284 : : * css_from_dir - get corresponding css from the dentry of a cgroup dir
5285 : : * @dentry: directory dentry of interest
5286 : : * @ss: subsystem of interest
5287 : : *
5288 : : * Must be called under cgroup_mutex or RCU read lock. The caller is
5289 : : * responsible for pinning the returned css if it needs to be accessed
5290 : : * outside the critical section.
5291 : : */
5292 : 0 : struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5293 : : struct cgroup_subsys *ss)
5294 : : {
5295 : : struct cgroup *cgrp;
5296 : :
5297 : : cgroup_assert_mutex_or_rcu_locked();
5298 : :
5299 : : /* is @dentry a cgroup dir? */
5300 [ # # ][ # # ]: 0 : if (!dentry->d_inode ||
5301 : 0 : dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5302 : : return ERR_PTR(-EBADF);
5303 : :
5304 : : cgrp = __d_cgrp(dentry);
5305 [ # # ]: 0 : return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5306 : : }
5307 : :
5308 : : /**
5309 : : * css_from_id - lookup css by id
5310 : : * @id: the cgroup id
5311 : : * @ss: cgroup subsys to be looked into
5312 : : *
5313 : : * Returns the css if there's a valid one with @id, otherwise returns NULL.
5314 : : * Should be called under rcu_read_lock().
5315 : : */
5316 : 0 : struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5317 : : {
5318 : : struct cgroup *cgrp;
5319 : :
5320 : : cgroup_assert_mutex_or_rcu_locked();
5321 : :
5322 : 0 : cgrp = idr_find(&ss->root->cgroup_idr, id);
5323 [ # # ]: 0 : if (cgrp)
5324 : 0 : return cgroup_css(cgrp, ss);
5325 : : return NULL;
5326 : : }
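
A minimal sketch of a caller honouring the locking rule above; the subsystem
(&foo_subsys) and the id are hypothetical, and css_tryget() pins the css in
case it must outlive the RCU read-side critical section:

	static struct cgroup_subsys_state *foo_lookup_css(int id)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		css = css_from_id(id, &foo_subsys);
		/* pin the css so it can be used after rcu_read_unlock() */
		if (css && !css_tryget(css))
			css = NULL;
		rcu_read_unlock();
		return css;		/* caller must css_put() when done */
	}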
5327 : :
5328 : : #ifdef CONFIG_CGROUP_DEBUG
5329 : : static struct cgroup_subsys_state *
5330 : : debug_css_alloc(struct cgroup_subsys_state *parent_css)
5331 : : {
5332 : : struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5333 : :
5334 : : if (!css)
5335 : : return ERR_PTR(-ENOMEM);
5336 : :
5337 : : return css;
5338 : : }
5339 : :
5340 : : static void debug_css_free(struct cgroup_subsys_state *css)
5341 : : {
5342 : : kfree(css);
5343 : : }
5344 : :
5345 : : static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5346 : : struct cftype *cft)
5347 : : {
5348 : : return cgroup_task_count(css->cgroup);
5349 : : }
5350 : :
5351 : : static u64 current_css_set_read(struct cgroup_subsys_state *css,
5352 : : struct cftype *cft)
5353 : : {
5354 : : return (u64)(unsigned long)current->cgroups;
5355 : : }
5356 : :
5357 : : static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5358 : : struct cftype *cft)
5359 : : {
5360 : : u64 count;
5361 : :
5362 : : rcu_read_lock();
5363 : : count = atomic_read(&task_css_set(current)->refcount);
5364 : : rcu_read_unlock();
5365 : : return count;
5366 : : }
5367 : :
5368 : : static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5369 : : {
5370 : : struct cgrp_cset_link *link;
5371 : : struct css_set *cset;
5372 : :
5373 : : read_lock(&css_set_lock);
5374 : : rcu_read_lock();
5375 : : cset = rcu_dereference(current->cgroups);
5376 : : list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5377 : : struct cgroup *c = link->cgrp;
5378 : : const char *name;
5379 : :
5380 : : if (c->dentry)
5381 : : name = c->dentry->d_name.name;
5382 : : else
5383 : : name = "?";
5384 : : seq_printf(seq, "Root %d group %s\n",
5385 : : c->root->hierarchy_id, name);
5386 : : }
5387 : : rcu_read_unlock();
5388 : : read_unlock(&css_set_lock);
5389 : : return 0;
5390 : : }
5391 : :
5392 : : #define MAX_TASKS_SHOWN_PER_CSS 25
5393 : : static int cgroup_css_links_read(struct seq_file *seq, void *v)
5394 : : {
5395 : : struct cgroup_subsys_state *css = seq_css(seq);
5396 : : struct cgrp_cset_link *link;
5397 : :
5398 : : read_lock(&css_set_lock);
5399 : : list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5400 : : struct css_set *cset = link->cset;
5401 : : struct task_struct *task;
5402 : : int count = 0;
5403 : : seq_printf(seq, "css_set %p\n", cset);
5404 : : list_for_each_entry(task, &cset->tasks, cg_list) {
5405 : : if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5406 : : seq_puts(seq, " ...\n");
5407 : : break;
5408 : : } else {
5409 : : seq_printf(seq, " task %d\n",
5410 : : task_pid_vnr(task));
5411 : : }
5412 : : }
5413 : : }
5414 : : read_unlock(&css_set_lock);
5415 : : return 0;
5416 : : }
5417 : :
5418 : : static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5419 : : {
5420 : : return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5421 : : }
5422 : :
5423 : : static struct cftype debug_files[] = {
5424 : : {
5425 : : .name = "taskcount",
5426 : : .read_u64 = debug_taskcount_read,
5427 : : },
5428 : :
5429 : : {
5430 : : .name = "current_css_set",
5431 : : .read_u64 = current_css_set_read,
5432 : : },
5433 : :
5434 : : {
5435 : : .name = "current_css_set_refcount",
5436 : : .read_u64 = current_css_set_refcount_read,
5437 : : },
5438 : :
5439 : : {
5440 : : .name = "current_css_set_cg_links",
5441 : : .seq_show = current_css_set_cg_links_read,
5442 : : },
5443 : :
5444 : : {
5445 : : .name = "cgroup_css_links",
5446 : : .seq_show = cgroup_css_links_read,
5447 : : },
5448 : :
5449 : : {
5450 : : .name = "releasable",
5451 : : .read_u64 = releasable_read,
5452 : : },
5453 : :
5454 : : { } /* terminate */
5455 : : };
5456 : :
5457 : : struct cgroup_subsys debug_subsys = {
5458 : : .name = "debug",
5459 : : .css_alloc = debug_css_alloc,
5460 : : .css_free = debug_css_free,
5461 : : .subsys_id = debug_subsys_id,
5462 : : .base_cftypes = debug_files,
5463 : : };
5464 : : #endif /* CONFIG_CGROUP_DEBUG */