Branch data Line data Source code
1 : : /*
2 : : * Generic process-grouping system.
3 : : *
4 : : * Based originally on the cpuset system, extracted by Paul Menage
5 : : * Copyright (C) 2006 Google, Inc
6 : : *
7 : : * Notifications support
8 : : * Copyright (C) 2009 Nokia Corporation
9 : : * Author: Kirill A. Shutemov
10 : : *
11 : : * Copyright notices from the original cpuset code:
12 : : * --------------------------------------------------
13 : : * Copyright (C) 2003 BULL SA.
14 : : * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 : : *
16 : : * Portions derived from Patrick Mochel's sysfs code.
17 : : * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 : : *
19 : : * 2003-10-10 Written by Simon Derr.
20 : : * 2003-10-22 Updates by Stephen Hemminger.
21 : : * 2004 May-July Rework by Paul Jackson.
22 : : * ---------------------------------------------------
23 : : *
24 : : * This file is subject to the terms and conditions of the GNU General Public
25 : : * License. See the file COPYING in the main directory of the Linux
26 : : * distribution for more details.
27 : : */
28 : :
29 : : #include <linux/cgroup.h>
30 : : #include <linux/cred.h>
31 : : #include <linux/ctype.h>
32 : : #include <linux/errno.h>
33 : : #include <linux/init_task.h>
34 : : #include <linux/kernel.h>
35 : : #include <linux/list.h>
36 : : #include <linux/mm.h>
37 : : #include <linux/mutex.h>
38 : : #include <linux/mount.h>
39 : : #include <linux/pagemap.h>
40 : : #include <linux/proc_fs.h>
41 : : #include <linux/rcupdate.h>
42 : : #include <linux/sched.h>
43 : : #include <linux/backing-dev.h>
44 : : #include <linux/seq_file.h>
45 : : #include <linux/slab.h>
46 : : #include <linux/magic.h>
47 : : #include <linux/spinlock.h>
48 : : #include <linux/string.h>
49 : : #include <linux/sort.h>
50 : : #include <linux/kmod.h>
51 : : #include <linux/module.h>
52 : : #include <linux/delayacct.h>
53 : : #include <linux/cgroupstats.h>
54 : : #include <linux/hashtable.h>
55 : : #include <linux/namei.h>
56 : : #include <linux/pid_namespace.h>
57 : : #include <linux/idr.h>
58 : : #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59 : : #include <linux/eventfd.h>
60 : : #include <linux/poll.h>
61 : : #include <linux/flex_array.h> /* used in cgroup_attach_task */
62 : : #include <linux/kthread.h>
63 : : #include <linux/file.h>
64 : :
65 : : #include <linux/atomic.h>
66 : :
67 : : /*
68 : : * cgroup_mutex is the master lock. Any modification to cgroup or its
69 : : * hierarchy must be performed while holding it.
70 : : *
71 : : * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
72 : : * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
73 : : * release_agent_path and so on. Modifying requires both cgroup_mutex and
74 : : * cgroup_root_mutex. Readers can acquire either of the two. This is to
75 : : * break the following locking order cycle.
76 : : *
77 : : * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
78 : : * B. namespace_sem -> cgroup_mutex
79 : : *
80 : : * B happens only through cgroup_show_options() and using cgroup_root_mutex
81 : : * breaks it.
82 : : */
83 : : #ifdef CONFIG_PROVE_RCU
84 : : DEFINE_MUTEX(cgroup_mutex);
85 : : EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
86 : : #else
87 : : static DEFINE_MUTEX(cgroup_mutex);
88 : : #endif
89 : :
90 : : static DEFINE_MUTEX(cgroup_root_mutex);
91 : :
92 : : /*
93 : : * cgroup destruction makes heavy use of work items and there can be a lot
94 : : * of concurrent destructions. Use a separate workqueue so that cgroup
95 : : * destruction work items don't end up filling up max_active of system_wq
96 : : * which may lead to deadlock.
97 : : */
98 : : static struct workqueue_struct *cgroup_destroy_wq;
99 : :
100 : : /*
101 : : * Generate an array of cgroup subsystem pointers. At boot time, this is
102 : : * populated with the built in subsystems, and modular subsystems are
103 : : * registered after that. The mutable section of this array is protected by
104 : : * cgroup_mutex.
105 : : */
106 : : #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
107 : : #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
108 : : static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
109 : : #include <linux/cgroup_subsys.h>
110 : : };
111 : :
112 : : /*
113 : : * The dummy hierarchy, reserved for the subsystems that are otherwise
114 : : * unattached - it never has more than a single cgroup, and all tasks are
115 : : * part of that cgroup.
116 : : */
117 : : static struct cgroupfs_root cgroup_dummy_root;
118 : :
119 : : /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
120 : : static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
121 : :
122 : : /*
123 : : * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
: : * __d_cfe() recovers a cfent from a dentry and __d_cft() follows ->type
: : * to reach the file's cftype.
124 : : */
125 : : struct cfent {
126 : : struct list_head node; /* list linkage; the owning list head is not visible in this chunk */
127 : : struct dentry *dentry; /* presumably the leaf dentry backing this file - confirm */
128 : : struct cftype *type; /* file type descriptor; this is what __d_cft() returns */
129 : : struct cgroup_subsys_state *css; /* presumably the css this file operates on - confirm */
130 : :
131 : : /* file xattrs */
132 : : struct simple_xattrs xattrs;
133 : : };
134 : :
135 : : /*
136 : : * cgroup_event represents an event which userspace wants to receive.
137 : : */
138 : : struct cgroup_event {
139 : : /*
140 : : * css which the event belongs to.
141 : : */
142 : : struct cgroup_subsys_state *css;
143 : : /*
144 : : * Control file which the event is associated with.
145 : : */
146 : : struct cftype *cft;
147 : : /*
148 : : * eventfd to signal userspace about the event.
149 : : */
150 : : struct eventfd_ctx *eventfd;
151 : : /*
152 : : * Each of these is stored in a list by the cgroup.
153 : : */
154 : : struct list_head list;
155 : : /*
156 : : * All fields below are needed to unregister the event when
157 : : * userspace closes the eventfd.
158 : : */
159 : : poll_table pt;
160 : : wait_queue_head_t *wqh;
161 : : wait_queue_t wait;
162 : : struct work_struct remove;
163 : : };
164 : :
165 : : /* The list of hierarchy roots */
166 : :
167 : : static LIST_HEAD(cgroup_roots);
168 : : static int cgroup_root_count;
169 : :
170 : : /*
171 : : * Hierarchy ID allocation and mapping. It follows the same exclusion
172 : : * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
173 : : * writes, either for reads.
174 : : */
175 : : static DEFINE_IDR(cgroup_hierarchy_idr);
176 : :
177 : : static struct cgroup_name root_cgroup_name = { .name = "/" };
178 : :
179 : : /*
180 : : * Assign a monotonically increasing serial number to cgroups. It
181 : : * guarantees cgroups with bigger numbers are newer than those with smaller
182 : : * numbers. Also, as cgroups are always appended to the parent's
183 : : * ->children list, it guarantees that sibling cgroups are always sorted in
184 : : * the ascending serial number order on the list. Protected by
185 : : * cgroup_mutex.
186 : : */
187 : : static u64 cgroup_serial_nr_next = 1;
188 : :
189 : : /* This flag indicates whether tasks in the fork and exit paths should
190 : : * check for fork/exit handlers to call. This avoids us having to do
191 : : * extra work in the fork/exit path if none of the subsystems need to
192 : : * be called.
193 : : */
194 : : static int need_forkexit_callback __read_mostly;
195 : :
196 : : static struct cftype cgroup_base_files[];
197 : :
198 : : static void cgroup_destroy_css_killed(struct cgroup *cgrp);
199 : : static int cgroup_destroy_locked(struct cgroup *cgrp);
200 : : static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
201 : : bool is_add);
202 : : static int cgroup_file_release(struct inode *inode, struct file *file);
203 : :
204 : : /**
205 : : * cgroup_css - obtain a cgroup's css for the specified subsystem
206 : : * @cgrp: the cgroup of interest
207 : : * @ss: the subsystem of interest (%NULL returns the dummy_css)
208 : : *
209 : : * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
210 : : * function must be called either under cgroup_mutex or rcu_read_lock() and
211 : : * the caller is responsible for pinning the returned css if it wants to
212 : : * keep accessing it outside the said locks. This function may return
213 : : * %NULL if @cgrp doesn't have @ss enabled.
214 : : */
215 : : static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
216 : : struct cgroup_subsys *ss)
217 : : {
218 [ # # ][ # # : 5 : if (ss)
# # # # #
# ][ # # ]
[ # # ][ # # ]
[ # # # # ]
[ # # ]
[ # # # # ]
[ # # - + ]
[ # # ][ # # ]
[ # # ][ # # ]
219 : 0 : return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
220 : : lockdep_is_held(&cgroup_mutex)); /* RCU-protected slot; holding cgroup_mutex also satisfies the check */
221 : : else
222 : 5 : return &cgrp->dummy_css; /* no subsystem given: hand back the cgroup's embedded dummy css */
223 : : }
224 : :
225 : : /* convenient tests for these bits: cgroup_is_dead() reports whether CGRP_DEAD is set in @cgrp->flags */
226 : : static inline bool cgroup_is_dead(const struct cgroup *cgrp)
227 : : {
228 : 0 : return test_bit(CGRP_DEAD, &cgrp->flags);
229 : : }
230 : :
231 : : /**
232 : : * cgroup_is_descendant - test ancestry
233 : : * @cgrp: the cgroup to be tested
234 : : * @ancestor: possible ancestor of @cgrp
235 : : *
236 : : * Test whether @cgrp is a descendant of @ancestor. It also returns %true
237 : : * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
238 : : * and @ancestor are accessible.
: : *
: : * The test walks the ->parent chain starting at @cgrp and returns %false
: : * once the chain ends (->parent is %NULL) without meeting @ancestor. A
: : * %NULL @cgrp therefore yields %false.
239 : : */
240 : 0 : bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
241 : : {
242 [ # # ]: 0 : while (cgrp) {
243 [ # # ]: 0 : if (cgrp == ancestor)
244 : : return true;
245 : 0 : cgrp = cgrp->parent; /* step one level up the hierarchy */
246 : : }
247 : : return false;
248 : : }
249 : : EXPORT_SYMBOL_GPL(cgroup_is_descendant);
250 : :
/*
 * Return nonzero only when both the CGRP_RELEASABLE and the
 * CGRP_NOTIFY_ON_RELEASE bits are set in @cgrp->flags.
 */
251 : : static int cgroup_is_releasable(const struct cgroup *cgrp)
252 : : {
253 : : const int bits =
254 : : (1 << CGRP_RELEASABLE) |
255 : : (1 << CGRP_NOTIFY_ON_RELEASE);
256 : 2 : return (cgrp->flags & bits) == bits; /* "== bits" requires both flags, not just either one */
257 : : }
258 : :
/* Return nonzero if the CGRP_NOTIFY_ON_RELEASE bit is set in @cgrp->flags. */
259 : : static int notify_on_release(const struct cgroup *cgrp)
260 : : {
261 : : return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
262 : : }
263 : :
264 : : /**
265 : : * for_each_subsys - iterate all loaded cgroup subsystems
266 : : * @ss: the iteration cursor
267 : : * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
268 : : *
269 : : * Should be called under cgroup_mutex.
: : *
: : * Every iteration asserts through lockdep that cgroup_mutex is held and
: : * loads cgroup_subsys[@i] into @ss. NULL slots are skipped: the statement
: : * that follows the macro binds to the trailing "else", so it runs only
: : * for non-NULL entries.
270 : : */
271 : : #define for_each_subsys(ss, i) \
272 : : for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
273 : : if (({ lockdep_assert_held(&cgroup_mutex); \
274 : : !((ss) = cgroup_subsys[i]); })) { } \
275 : : else
276 : :
277 : : /**
278 : : * for_each_builtin_subsys - iterate all built-in cgroup subsystems
279 : : * @ss: the iteration cursor
280 : : * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
281 : : *
282 : : * Built-in subsystems are always present and iteration itself doesn't
283 : : * require any synchronization.
: : *
: : * The assignment to @ss is done inside the loop condition; the "|| true"
: : * makes that part of the condition always succeed, so termination depends
: : * on @i alone.
284 : : */
285 : : #define for_each_builtin_subsys(ss, i) \
286 : : for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
287 : : (((ss) = cgroup_subsys[i]) || true); (i)++)
288 : :
289 : : /* iterate each subsystem attached to a hierarchy */
290 : : #define for_each_root_subsys(root, ss) \
291 : : list_for_each_entry((ss), &(root)->subsys_list, sibling)
292 : :
293 : : /* iterate across the active hierarchies */
294 : : #define for_each_active_root(root) \
295 : : list_for_each_entry((root), &cgroup_roots, root_list)
296 : :
/*
 * Return @dentry->d_fsdata as a struct cgroup pointer. cgroup_diput() reads
 * d_fsdata this way for directory dentries.
 */
297 : : static inline struct cgroup *__d_cgrp(struct dentry *dentry)
298 : : {
299 : : return dentry->d_fsdata;
300 : : }
301 : :
/*
 * Return @dentry->d_fsdata as a struct cfent pointer; a cfent is what a
 * leaf (file) dentry's d_fsdata points to.
 */
302 : : static inline struct cfent *__d_cfe(struct dentry *dentry)
303 : : {
304 : : return dentry->d_fsdata;
305 : : }
306 : :
/* Return the cftype of the cgroupfs file behind @dentry, i.e. its cfent's ->type. */
307 : 0 : static inline struct cftype *__d_cft(struct dentry *dentry)
308 : : {
309 : 0 : return __d_cfe(dentry)->type;
310 : : }
311 : :
312 : : /**
313 : : * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
314 : : * @cgrp: the cgroup to be checked for liveness
315 : : *
316 : : * On success, returns true; the mutex should be later unlocked. On
317 : : * failure returns false with no lock held.
: : *
: : * "Alive" means cgroup_is_dead() is false for @cgrp, checked after
: : * cgroup_mutex has been acquired.
318 : : */
319 : 0 : static bool cgroup_lock_live_group(struct cgroup *cgrp)
320 : : {
321 : 2 : mutex_lock(&cgroup_mutex);
322 [ - + ]: 2 : if (cgroup_is_dead(cgrp)) {
323 : 0 : mutex_unlock(&cgroup_mutex); /* dead cgroup: drop the lock before reporting failure */
324 : 0 : return false;
325 : : }
326 : : return true; /* cgroup_mutex stays held for the caller */
327 : : }
328 : :
329 : : /* the list of cgroups eligible for automatic release. Protected by
330 : : * release_list_lock */
331 : : static LIST_HEAD(release_list);
332 : : static DEFINE_RAW_SPINLOCK(release_list_lock);
333 : : static void cgroup_release_agent(struct work_struct *work);
334 : : static DECLARE_WORK(release_agent_work, cgroup_release_agent);
335 : : static void check_for_release(struct cgroup *cgrp);
336 : :
337 : : /*
338 : : * A cgroup can be associated with multiple css_sets as different tasks may
339 : : * belong to different cgroups on different hierarchies. In the other
340 : : * direction, a css_set is naturally associated with multiple cgroups.
341 : : * This M:N relationship is represented by the following link structure
342 : : * which exists for each association and allows traversing the associations
343 : : * from both sides.
344 : : */
345 : : struct cgrp_cset_link {
346 : : /* the cgroup and css_set this link associates */
347 : : struct cgroup *cgrp;
348 : : struct css_set *cset;
349 : :
350 : : /* list of cgrp_cset_links anchored at cgrp->cset_links */
351 : : struct list_head cset_link;
352 : :
353 : : /* list of cgrp_cset_links anchored at css_set->cgrp_links */
354 : : struct list_head cgrp_link;
355 : : };
356 : :
357 : : /* The default css_set - used by init and its children prior to any
358 : : * hierarchies being mounted. It contains a pointer to the root state
359 : : * for each subsystem. Also used to anchor the list of css_sets. Not
360 : : * reference-counted, to improve performance when child cgroups
361 : : * haven't been created.
362 : : */
363 : :
364 : : static struct css_set init_css_set;
365 : : static struct cgrp_cset_link init_cgrp_cset_link;
366 : :
367 : : /*
368 : : * css_set_lock protects the list of css_set objects, and the chain of
369 : : * tasks off each css_set. Nests outside task->alloc_lock due to
370 : : * css_task_iter_start().
371 : : */
372 : : static DEFINE_RWLOCK(css_set_lock);
373 : : static int css_set_count;
374 : :
375 : : /*
376 : : * hash table for cgroup groups. This improves the performance to find
377 : : * an existing css_set. This hash doesn't (currently) take into
378 : : * account cgroups in empty hierarchies.
379 : : */
380 : : #define CSS_SET_HASH_BITS 7
381 : : static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
382 : :
/*
 * Compute the css_set_table hash key for the css pointer array @css[].
 * The pointer values in the slots of all loaded subsystems are summed
 * (for_each_subsys() skips empty cgroup_subsys[] entries) and the upper
 * bits of the sum are then folded into the lower ones.
 */
383 : : static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
384 : : {
385 : : unsigned long key = 0UL;
386 : : struct cgroup_subsys *ss;
387 : : int i;
388 : :
389 : : for_each_subsys(ss, i)
390 : : key += (unsigned long)css[i];
391 : : key = (key >> 16) ^ key; /* runs once, after the loop: mix high bits into low bits */
392 : :
393 : : return key;
394 : : }
395 : :
396 : : /*
397 : : * We don't maintain the lists running through each css_set to its task
398 : : * until after the first call to css_task_iter_start(). This reduces the
399 : : * fork()/exit() overhead for people who have cgroups compiled into their
400 : : * kernel but not actually in use.
401 : : */
402 : : static int use_task_css_set_links __read_mostly;
403 : :
/*
 * Drop one reference on @cset. When the last reference goes away, @cset is
 * removed from the hash table, every cgrp_cset_link on ->cgrp_links is
 * unlinked and freed, and @cset itself is freed through kfree_rcu().
 *
 * For each linked cgroup whose ->cset_links list becomes empty and which
 * has notify_on_release set, check_for_release() is called; a nonzero
 * @taskexit additionally sets CGRP_RELEASABLE on that cgroup first.
 */
404 : 0 : static void __put_css_set(struct css_set *cset, int taskexit)
405 : : {
406 : : struct cgrp_cset_link *link, *tmp_link;
407 : :
408 : : /*
409 : : * Ensure that the refcount doesn't hit zero while any readers
410 : : * can see it. Similar to atomic_dec_and_lock(), but for an
411 : : * rwlock
412 : : */
413 [ - + ]: 1122957 : if (atomic_add_unless(&cset->refcount, -1, 1)) /* lockless fast path unless this is the last reference */
414 : : return;
415 : 0 : write_lock(&css_set_lock);
416 [ # # ]: 1122967 : if (!atomic_dec_and_test(&cset->refcount)) { /* someone took a new reference meanwhile */
417 : : write_unlock(&css_set_lock);
418 : : return;
419 : : }
420 : :
421 : : /* This css_set is dead. unlink it and release cgroup refcounts */
422 : : hash_del(&cset->hlist);
423 : 0 : css_set_count--;
424 : :
425 [ # # ]: 0 : list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
426 : 0 : struct cgroup *cgrp = link->cgrp;
427 : :
428 : : list_del(&link->cset_link);
429 : : list_del(&link->cgrp_link);
430 : :
431 : : /* @cgrp can't go away while we're holding css_set_lock */
432 [ # # ][ # # ]: 0 : if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
433 [ # # ]: 0 : if (taskexit)
434 : 0 : set_bit(CGRP_RELEASABLE, &cgrp->flags);
435 : 0 : check_for_release(cgrp);
436 : : }
437 : :
438 : 0 : kfree(link);
439 : : }
440 : :
441 : : write_unlock(&css_set_lock);
442 : 0 : kfree_rcu(cset, rcu_head); /* free only after an RCU grace period */
443 : : }
444 : :
445 : : /*
446 : : * refcounted get/put for css_set objects
: : *
: : * get_css_set() takes one additional reference on @cset by atomically
: : * incrementing ->refcount.
447 : : */
448 : : static inline void get_css_set(struct css_set *cset)
449 : : {
450 : 1122974 : atomic_inc(&cset->refcount);
451 : : }
452 : :
/* Drop a reference on @cset; calls __put_css_set() with taskexit == 0. */
453 : : static inline void put_css_set(struct css_set *cset)
454 : : {
455 : 0 : __put_css_set(cset, 0);
456 : : }
457 : :
/*
 * Drop a reference on @cset; calls __put_css_set() with taskexit == 1, which
 * lets it set CGRP_RELEASABLE on cgroups that lose their last css_set link.
 */
458 : : static inline void put_css_set_taskexit(struct css_set *cset)
459 : : {
460 : 1122970 : __put_css_set(cset, 1);
461 : : }
462 : :
463 : : /**
464 : : * compare_css_sets - helper function for find_existing_css_set().
465 : : * @cset: candidate css_set being tested
466 : : * @old_cset: existing css_set for a task
467 : : * @new_cgrp: cgroup that's being entered by the task
468 : : * @template: desired set of css pointers in css_set (pre-calculated)
469 : : *
470 : : * Returns true if "cset" matches "old_cset" except for the hierarchy
471 : : * which "new_cgrp" belongs to, for which it should match "new_cgrp".
: : *
: : * The ->cgrp_links lists of @cset and @old_cset are walked in lockstep.
: : * They are expected to have equal length and to list hierarchies in the
: : * same order; BUG_ON() fires if either expectation is violated.
472 : : */
473 : 0 : static bool compare_css_sets(struct css_set *cset,
474 : : struct css_set *old_cset,
475 : : struct cgroup *new_cgrp,
476 : : struct cgroup_subsys_state *template[])
477 : : {
478 : : struct list_head *l1, *l2;
479 : :
480 : : if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
481 : : /* Not all subsystems matched */
482 : : return false;
483 : : }
484 : :
485 : : /*
486 : : * Compare cgroup pointers in order to distinguish between
487 : : * different cgroups in hierarchies with no subsystems. We
488 : : * could get by with just this check alone (and skip the
489 : : * memcmp above) but on most setups the memcmp check will
490 : : * avoid the need for this more expensive check on almost all
491 : : * candidates.
492 : : */
493 : :
494 : 0 : l1 = &cset->cgrp_links;
495 : 0 : l2 = &old_cset->cgrp_links;
496 : : while (1) {
497 : : struct cgrp_cset_link *link1, *link2;
498 : : struct cgroup *cgrp1, *cgrp2;
499 : :
500 : 0 : l1 = l1->next;
501 : 0 : l2 = l2->next;
502 : : /* See if we reached the end - both lists are equal length. */
503 [ # # ]: 0 : if (l1 == &cset->cgrp_links) {
504 [ # # ]: 0 : BUG_ON(l2 != &old_cset->cgrp_links);
505 : : break;
506 : : } else {
507 [ # # ]: 0 : BUG_ON(l2 == &old_cset->cgrp_links);
508 : : }
509 : : /* Locate the cgroups associated with these links. */
510 : : link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
511 : : link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
512 : 0 : cgrp1 = link1->cgrp;
513 : 0 : cgrp2 = link2->cgrp;
514 : : /* Hierarchies should be linked in the same order. */
515 [ # # ]: 0 : BUG_ON(cgrp1->root != cgrp2->root);
516 : :
517 : : /*
518 : : * If this hierarchy is the hierarchy of the cgroup
519 : : * that's changing, then we need to check that this
520 : : * css_set points to the new cgroup; if it's any other
521 : : * hierarchy, then this css_set should point to the
522 : : * same cgroup as the old css_set.
523 : : */
524 [ # # ]: 0 : if (cgrp1->root == new_cgrp->root) {
525 [ # # ]: 0 : if (cgrp1 != new_cgrp)
526 : : return false;
527 : : } else {
528 [ # # ]: 0 : if (cgrp1 != cgrp2)
529 : : return false;
530 : : }
531 : : }
532 : : return true;
533 : : }
534 : :
535 : : /**
536 : : * find_existing_css_set - init css array and find the matching css_set
537 : : * @old_cset: the css_set that we're using before the cgroup transition
538 : : * @cgrp: the cgroup that we're moving into
539 : : * @template: out param for the new set of csses, should be clear on entry
: : *
: : * Fills @template with the css pointers the target css_set must have,
: : * then looks for a css_set in css_set_table that compare_css_sets()
: : * accepts. Returns that css_set, or %NULL if none matches. No
: : * reference is taken on the returned css_set here; find_css_set() takes
: : * one with get_css_set() while still holding css_set_lock.
540 : : */
541 : 0 : static struct css_set *find_existing_css_set(struct css_set *old_cset,
542 : : struct cgroup *cgrp,
543 : : struct cgroup_subsys_state *template[])
544 : : {
545 : : struct cgroupfs_root *root = cgrp->root;
546 : : struct cgroup_subsys *ss;
547 : : struct css_set *cset;
548 : : unsigned long key;
549 : : int i;
550 : :
551 : : /*
552 : : * Build the set of subsystem state objects that we want to see in the
553 : : * new css_set. while subsystems can change globally, the entries here
554 : : * won't change, so no need for locking.
: : *
: : * NOTE(review): for_each_subsys() nevertheless asserts through
: : * lockdep that cgroup_mutex is held, so callers must hold it.
555 : : */
556 : : for_each_subsys(ss, i) {
557 : : if (root->subsys_mask & (1UL << i)) {
558 : : /* Subsystem is in this hierarchy. So we want
559 : : * the subsystem state from the new
560 : : * cgroup */
561 : : template[i] = cgroup_css(cgrp, ss);
562 : : } else {
563 : : /* Subsystem is not in this hierarchy, so we
564 : : * don't want to change the subsystem state */
565 : : template[i] = old_cset->subsys[i];
566 : : }
567 : : }
568 : :
569 : : key = css_set_hash(template);
570 [ # # ][ # # ]: 0 : hash_for_each_possible(css_set_table, cset, hlist, key) {
[ # # ]
571 [ # # ]: 0 : if (!compare_css_sets(cset, old_cset, cgrp, template))
572 : 0 : continue;
573 : :
574 : : /* This css_set matches what we need */
575 : : return cset;
576 : : }
577 : :
578 : : /* No existing cgroup group matched */
579 : : return NULL;
580 : : }
581 : :
/*
 * Unlink and kfree() every cgrp_cset_link chained on @links_to_free through
 * ->cset_link. The list head itself is left in place, empty.
 */
582 : 0 : static void free_cgrp_cset_links(struct list_head *links_to_free)
583 : : {
584 : : struct cgrp_cset_link *link, *tmp_link;
585 : :
586 [ - + ]: 3 : list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
587 : : list_del(&link->cset_link);
588 : 0 : kfree(link);
589 : : }
590 : 3 : }
591 : :
592 : : /**
593 : : * allocate_cgrp_cset_links - allocate cgrp_cset_links
594 : : * @count: the number of links to allocate
595 : : * @tmp_links: list_head the allocated links are put on
596 : : *
597 : : * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
598 : : * through ->cset_link. Returns 0 on success or -errno.
: : *
: : * @tmp_links is (re)initialised on entry. If any allocation fails, the
: : * links allocated so far are released with free_cgrp_cset_links() and
: : * -ENOMEM is returned.
599 : : */
600 : 0 : static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
601 : : {
602 : : struct cgrp_cset_link *link;
603 : : int i;
604 : :
605 : : INIT_LIST_HEAD(tmp_links);
606 : :
607 [ + + ]: 6 : for (i = 0; i < count; i++) {
608 : : link = kzalloc(sizeof(*link), GFP_KERNEL);
609 [ - + ]: 3 : if (!link) {
610 : 0 : free_cgrp_cset_links(tmp_links); /* undo the partial allocation */
611 : 0 : return -ENOMEM;
612 : : }
613 : 3 : list_add(&link->cset_link, tmp_links);
614 : : }
615 : : return 0;
616 : : }
617 : :
618 : : /**
619 : : * link_css_set - a helper function to link a css_set to a cgroup
620 : : * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
621 : : * @cset: the css_set to be linked
622 : : * @cgrp: the destination cgroup
: : *
: : * Consumes the first link on @tmp_links, fills in its ->cset and ->cgrp
: : * and threads it onto both @cgrp->cset_links and @cset->cgrp_links.
: : * BUG()s if @tmp_links is empty.
623 : : */
624 : 0 : static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
625 : : struct cgroup *cgrp)
626 : : {
627 : : struct cgrp_cset_link *link;
628 : :
629 [ - + ]: 3 : BUG_ON(list_empty(tmp_links));
630 : : link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
631 : 3 : link->cset = cset;
632 : 3 : link->cgrp = cgrp;
633 : 3 : list_move(&link->cset_link, &cgrp->cset_links); /* off the temporary list, onto the cgroup's list */
634 : : /*
635 : : * Always add links to the tail of the list so that the list
636 : : * is sorted by order of hierarchy creation
637 : : */
638 : 3 : list_add_tail(&link->cgrp_link, &cset->cgrp_links);
639 : 3 : }
640 : :
641 : : /**
642 : : * find_css_set - return a new css_set with one cgroup updated
643 : : * @old_cset: the baseline css_set
644 : : * @cgrp: the cgroup to be updated
645 : : *
646 : : * Return a new css_set that's equivalent to @old_cset, but with @cgrp
647 : : * substituted into the appropriate hierarchy.
: : *
: : * An existing matching css_set is reused when one is found, and a
: : * reference is taken on it. Otherwise a fresh css_set is allocated with
: : * a refcount of 1, linked to one cgroup per entry of
: : * @old_cset->cgrp_links and added to css_set_table. Either way the
: : * caller owns one reference on the result. Returns %NULL if memory
: : * allocation fails. Must be called with cgroup_mutex held.
648 : : */
649 : 0 : static struct css_set *find_css_set(struct css_set *old_cset,
650 : : struct cgroup *cgrp)
651 : : {
652 : : struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
653 : : struct css_set *cset;
654 : : struct list_head tmp_links;
655 : : struct cgrp_cset_link *link;
656 : : unsigned long key;
657 : :
658 : : lockdep_assert_held(&cgroup_mutex);
659 : :
660 : : /* First see if we already have a cgroup group that matches
661 : : * the desired set */
662 : 0 : read_lock(&css_set_lock);
663 : 0 : cset = find_existing_css_set(old_cset, cgrp, template);
664 [ # # ]: 0 : if (cset)
665 : : get_css_set(cset);
666 : : read_unlock(&css_set_lock);
667 : :
668 [ # # ]: 0 : if (cset)
669 : : return cset;
670 : :
671 : : cset = kzalloc(sizeof(*cset), GFP_KERNEL);
672 [ # # ]: 0 : if (!cset)
673 : : return NULL;
674 : :
675 : : /* Allocate all the cgrp_cset_link objects that we'll need */
676 [ # # ]: 0 : if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
677 : 0 : kfree(cset);
678 : 0 : return NULL;
679 : : }
680 : :
681 : 0 : atomic_set(&cset->refcount, 1);
682 : 0 : INIT_LIST_HEAD(&cset->cgrp_links);
683 : 0 : INIT_LIST_HEAD(&cset->tasks);
684 : : INIT_HLIST_NODE(&cset->hlist);
685 : :
686 : : /* Copy the set of subsystem state objects generated in
687 : : * find_existing_css_set() */
688 : : memcpy(cset->subsys, template, sizeof(cset->subsys));
689 : :
690 : 0 : write_lock(&css_set_lock);
691 : : /* Add reference counts and links from the new css_set. */
692 [ # # ]: 0 : list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
693 : 0 : struct cgroup *c = link->cgrp;
694 : :
695 [ # # ]: 0 : if (c->root == cgrp->root)
696 : : c = cgrp; /* in @cgrp's own hierarchy, link to @cgrp instead of the old cgroup */
697 : 0 : link_css_set(&tmp_links, cset, c);
698 : : }
699 : :
700 [ # # ]: 0 : BUG_ON(!list_empty(&tmp_links)); /* every pre-allocated link must have been consumed */
701 : :
702 : 0 : css_set_count++;
703 : :
704 : : /* Add this cgroup group to the hash table */
705 : : key = css_set_hash(cset->subsys);
706 : 0 : hash_add(css_set_table, &cset->hlist, key);
707 : :
708 : : write_unlock(&css_set_lock);
709 : :
710 : 0 : return cset;
711 : : }
712 : :
713 : : /*
714 : : * Return the cgroup for "task" from the given hierarchy. Must be
715 : : * called with cgroup_mutex held.
: : *
: : * If the task uses init_css_set the result is @root's top_cgroup.
: : * Otherwise it is the cgroup on the task's css_set ->cgrp_links whose
: : * ->root is @root. BUG()s if cgroup_mutex is not locked or if no
: : * cgroup is found; never returns NULL.
716 : : */
717 : 0 : static struct cgroup *task_cgroup_from_root(struct task_struct *task,
718 : : struct cgroupfs_root *root)
719 : : {
720 : : struct css_set *cset;
721 : : struct cgroup *res = NULL;
722 : :
723 [ # # ]: 0 : BUG_ON(!mutex_is_locked(&cgroup_mutex));
724 : 0 : read_lock(&css_set_lock);
725 : : /*
726 : : * No need to lock the task - since we hold cgroup_mutex the
727 : : * task can't change groups, so the only thing that can happen
728 : : * is that it exits and its css is set back to init_css_set.
729 : : */
730 : : cset = task_css_set(task);
731 [ # # ]: 0 : if (cset == &init_css_set) {
732 : 0 : res = &root->top_cgroup;
733 : : } else {
734 : : struct cgrp_cset_link *link;
735 : :
736 [ # # ]: 0 : list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
737 : 0 : struct cgroup *c = link->cgrp;
738 : :
739 [ # # ]: 0 : if (c->root == root) {
740 : : res = c;
741 : : break;
742 : : }
743 : : }
744 : : }
745 : : read_unlock(&css_set_lock);
746 [ # # ]: 0 : BUG_ON(!res);
747 : 0 : return res;
748 : : }
749 : :
750 : : /*
751 : : * There is one global cgroup mutex. We also require taking
752 : : * task_lock() when dereferencing a task's cgroup subsys pointers.
753 : : * See "The task_lock() exception", at the end of this comment.
754 : : *
755 : : * A task must hold cgroup_mutex to modify cgroups.
756 : : *
757 : : * Any task can increment and decrement the count field without lock.
758 : : * So in general, code holding cgroup_mutex can't rely on the count
759 : : * field not changing. However, if the count goes to zero, then only
760 : : * cgroup_attach_task() can increment it again. Because a count of zero
761 : : * means that no tasks are currently attached, therefore there is no
762 : : * way a task attached to that cgroup can fork (the other way to
763 : : * increment the count). So code holding cgroup_mutex can safely
764 : : * assume that if the count is zero, it will stay zero. Similarly, if
765 : : * a task holds cgroup_mutex on a cgroup with zero count, it
766 : : * knows that the cgroup won't be removed, as cgroup_rmdir()
767 : : * needs that mutex.
768 : : *
769 : : * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
770 : : * (usually) take cgroup_mutex. These are the two most performance
771 : : * critical pieces of code here. The exception occurs on cgroup_exit(),
772 : : * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
773 : : * is taken, and if the cgroup count is zero, a usermode call is made
774 : : * to the release agent with the name of the cgroup (path relative to
775 : : * the root of the cgroup file system) as the argument.
776 : : *
777 : : * A cgroup can only be deleted if both its 'count' of using tasks
778 : : * is zero, and its list of 'children' cgroups is empty. Since all
779 : : * tasks in the system use _some_ cgroup, and since there is always at
780 : : * least one task in the system (init, pid == 1), therefore, top_cgroup
781 : : * always has either children cgroups and/or using tasks. So we don't
782 : : * need a special hack to ensure that top_cgroup cannot be deleted.
783 : : *
784 : : * The task_lock() exception
785 : : *
786 : : * The need for this exception arises from the action of
787 : : * cgroup_attach_task(), which overwrites one task's cgroup pointer with
788 : : * another. It does so using cgroup_mutex, however there are
789 : : * several performance critical places that need to reference
790 : : * task->cgroup without the expense of grabbing a system global
791 : : * mutex. Therefore except as noted below, when dereferencing or, as
792 : : * in cgroup_attach_task(), modifying a task's cgroup pointer we use
793 : : * task_lock(), which acts on a spinlock (task->alloc_lock) already in
794 : : * the task_struct routinely used for such matters.
795 : : *
796 : : * P.S. One more locking exception. RCU is used to guard the
797 : : * update of a task's cgroup pointer by cgroup_attach_task().
798 : : */
799 : :
800 : : /*
801 : : * A couple of forward declarations required, due to cyclic reference loop:
802 : : * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
803 : : * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
804 : : * -> cgroup_mkdir.
805 : : */
806 : :
807 : : static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
808 : : static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
809 : : static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
810 : : static const struct inode_operations cgroup_dir_inode_operations;
811 : : static const struct file_operations proc_cgroupstats_operations;
812 : :
813 : : static struct backing_dev_info cgroup_backing_dev_info = {
814 : : .name = "cgroup",
815 : : .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
816 : : };
817 : :
/*
 * Allocate a new inode on @sb and initialise it for cgroupfs: inode number
 * from get_next_ino(), mode from @mode, owner from the current fsuid/fsgid,
 * all three timestamps set to CURRENT_TIME, and cgroup_backing_dev_info as
 * the mapping's backing device. Returns the inode, or NULL if new_inode()
 * failed.
 */
818 : 0 : static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
819 : : {
820 : 36 : struct inode *inode = new_inode(sb);
821 : :
822 [ + - ]: 36 : if (inode) {
823 : 36 : inode->i_ino = get_next_ino();
824 : 36 : inode->i_mode = mode;
825 : 36 : inode->i_uid = current_fsuid();
826 : 36 : inode->i_gid = current_fsgid();
827 : 36 : inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
828 : 36 : inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
829 : : }
830 : 0 : return inode;
831 : : }
832 : :
/*
 * Allocate a cgroup_name sized for @dentry's name (d_name.len bytes plus a
 * terminating NUL) and copy the name into it. Returns the new object, or
 * NULL if the allocation failed.
 *
 * NOTE(review): the strcpy() assumes d_name.name is NUL-terminated within
 * d_name.len + 1 bytes - confirm against the dcache contract.
 */
833 : 0 : static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
834 : : {
835 : : struct cgroup_name *name;
836 : :
837 : 2 : name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
838 [ + - ]: 2 : if (!name)
839 : : return NULL;
840 : 2 : strcpy(name->name, dentry->d_name.name);
841 : : return name;
842 : : }
843 : :
/*
 * Work function that performs the final teardown of a cgroup; queued on
 * cgroup_destroy_wq by cgroup_free_rcu(). @work is the ->destroy_work
 * member embedded in the cgroup being freed.
 *
 * It decrements the root's cgroup count under cgroup_mutex, drops the
 * parent dentry and superblock references, frees the xattrs and the name,
 * and finally frees the cgroup itself.
 */
844 : 0 : static void cgroup_free_fn(struct work_struct *work)
845 : : {
846 : 2 : struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
847 : :
848 : 2 : mutex_lock(&cgroup_mutex);
849 : 2 : cgrp->root->number_of_cgroups--;
850 : 2 : mutex_unlock(&cgroup_mutex);
851 : :
852 : : /*
853 : : * We get a ref to the parent's dentry, and put the ref when
854 : : * this cgroup is being freed, so it's guaranteed that the
855 : : * parent won't be destroyed before its children.
856 : : */
857 : 2 : dput(cgrp->parent->dentry);
858 : :
859 : : /*
860 : : * Drop the active superblock reference that we took when we
861 : : * created the cgroup. This will free cgrp->root, if we are
862 : : * holding the last reference to @sb.
: : * cgrp->root is not dereferenced again below.
863 : : */
864 : 2 : deactivate_super(cgrp->root->sb);
865 : :
866 : : /*
867 : : * if we're getting rid of the cgroup, refcount should ensure
868 : : * that there are no pidlists left.
869 : : */
870 [ - + ]: 4 : BUG_ON(!list_empty(&cgrp->pidlists));
871 : :
872 : 2 : simple_xattrs_free(&cgrp->xattrs);
873 : :
874 : 2 : kfree(rcu_dereference_raw(cgrp->name));
875 : 2 : kfree(cgrp);
876 : 2 : }
877 : :
/*
 * RCU callback registered by cgroup_diput() through call_rcu(). It does not
 * free anything itself; it hands the cgroup to cgroup_free_fn() by queueing
 * cgrp->destroy_work on cgroup_destroy_wq (presumably because
 * cgroup_free_fn() takes cgroup_mutex, which cannot be done from here -
 * confirm).
 */
878 : 0 : static void cgroup_free_rcu(struct rcu_head *head)
879 : : {
880 : : struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
881 : :
882 : 4 : INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
883 : 2 : queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
884 : 2 : }
885 : :
/*
 * ->d_iput callback (installed through cgroup_dops in cgroup_get_rootdir()).
 *
 * For a directory, the associated cgroup must already be dead: its id is
 * removed from the root's idr and the cgroup is scheduled for freeing after
 * an RCU grace period. For a regular file, the cfent attached to the dentry
 * is freed together with its xattrs. In both cases the inode reference is
 * dropped with iput() at the end.
 */
886 : 0 : static void cgroup_diput(struct dentry *dentry, struct inode *inode)
887 : : {
888 : : /* is dentry a directory ? if so, kfree() associated cgroup */
889 [ + + ]: 33 : if (S_ISDIR(inode->i_mode)) {
890 : 2 : struct cgroup *cgrp = dentry->d_fsdata;
891 : :
892 [ - + ]: 2 : BUG_ON(!(cgroup_is_dead(cgrp)));
893 : :
894 : : /*
895 : : * XXX: cgrp->id is only used to look up css's. As cgroup
896 : : * and css's lifetimes will be decoupled, it should be made
897 : : * per-subsystem and moved to css->id so that lookups are
898 : : * successful until the target css is released.
899 : : */
900 : 2 : idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
901 : 2 : cgrp->id = -1;
902 : :
903 : 2 : call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
904 : : } else {
905 : : struct cfent *cfe = __d_cfe(dentry);
906 : 31 : struct cgroup *cgrp = dentry->d_parent->d_fsdata;
907 : :
/* A still-linked cfent is only tolerated for files of the top cgroup. */
908 [ + + ][ + - ]: 31 : WARN_ONCE(!list_empty(&cfe->node) &&
[ - + ][ # # ]
[ - - ]
909 : : cgrp != &cgrp->root->top_cgroup,
910 : : "cfe still linked for %s\n", cfe->type->name);
911 : 31 : simple_xattrs_free(&cfe->xattrs);
912 : 31 : kfree(cfe);
913 : : }
914 : 33 : iput(inode);
915 : 33 : }
916 : :
/*
 * Delete directory dentry @d: d_delete() it and then simple_rmdir() it from
 * its parent's inode. A reference on the parent dentry is held across both
 * calls and dropped afterwards.
 */
917 : 0 : static void remove_dir(struct dentry *d)
918 : : {
919 : 2 : struct dentry *parent = dget(d->d_parent);
920 : :
921 : 2 : d_delete(d);
922 : 2 : simple_rmdir(parent->d_inode, d);
923 : 2 : dput(parent);
924 : 2 : }
925 : :
/*
 * Remove the file backed by @cft from @cgrp's directory.
 *
 * Walks cgrp->files and unlinks the first entry whose type is @cft; if @cft
 * is NULL, the first entry in the list is removed. At most one entry is
 * removed per call. The caller must hold both the directory inode's i_mutex
 * and cgroup_mutex (asserted via lockdep).
 */
926 : 0 : static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
927 : : {
928 : : struct cfent *cfe;
929 : :
930 : : lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
931 : : lockdep_assert_held(&cgroup_mutex);
932 : :
933 : : /*
934 : : * If we're doing cleanup due to failure of cgroup_create(),
935 : : * the corresponding @cfe may not exist.
936 : : */
937 [ + - ]: 10 : list_for_each_entry(cfe, &cgrp->files, node) {
938 : 10 : struct dentry *d = cfe->dentry;
939 : :
940 [ + - ][ - + ]: 10 : if (cft && cfe->type != cft)
941 : 0 : continue;
942 : :
/* Hold a reference on the dentry while it is unlinked and unhashed. */
943 : : dget(d);
944 : 10 : d_delete(d);
945 : 10 : simple_unlink(cgrp->dentry->d_inode, d);
946 : : list_del_init(&cfe->node);
947 : 10 : dput(d);
948 : :
949 : 10 : break;
950 : : }
951 : 10 : }
952 : :
953 : : /**
954 : : * cgroup_clear_dir - remove subsys files in a cgroup directory
955 : : * @cgrp: target cgroup
956 : : * @subsys_mask: mask of the subsystem ids whose files should be removed
957 : : */
958 : : static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
959 : : {
960 : : struct cgroup_subsys *ss;
961 : : int i;
962 : :
963 : : for_each_subsys(ss, i) {
964 : : struct cftype_set *set;
965 : :
/* Skip subsystems whose bit is not set in @subsys_mask. */
966 : : if (!test_bit(i, &subsys_mask))
967 : : continue;
/* Final argument false selects removal (cgroup_mount() passes true to add). */
968 : : list_for_each_entry(set, &ss->cftsets, node)
969 : : cgroup_addrm_files(cgrp, set->cfts, false);
970 : : }
971 : : }
972 : :
973 : : /*
974 : : * NOTE : the dentry must have been dget()'ed
975 : : */
/*
 * Unlink @dentry from its parent's child list under both d_locks (parent
 * first, then the child with the nested lock class), then delete the
 * directory through remove_dir().
 */
976 : 0 : static void cgroup_d_remove_dir(struct dentry *dentry)
977 : : {
978 : : struct dentry *parent;
979 : :
980 : 2 : parent = dentry->d_parent;
981 : : spin_lock(&parent->d_lock);
982 : 2 : spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
983 : 2 : list_del_init(&dentry->d_u.d_child);
984 : : spin_unlock(&dentry->d_lock);
985 : : spin_unlock(&parent->d_lock);
986 : 2 : remove_dir(dentry);
987 : 2 : }
988 : :
989 : : /*
990 : : * Call with cgroup_mutex held. Drops reference counts on modules, including
991 : : * any duplicate ones that parse_cgroupfs_options took. If this function
992 : : * returns an error, no reference counts are touched.
993 : : */
994 : 0 : static int rebind_subsystems(struct cgroupfs_root *root,
995 : : unsigned long added_mask, unsigned removed_mask)
996 : : {
997 : : struct cgroup *cgrp = &root->top_cgroup;
998 : : struct cgroup_subsys *ss;
999 : : unsigned long pinned = 0;
1000 : : int i, ret;
1001 : :
1002 [ - + ]: 6 : BUG_ON(!mutex_is_locked(&cgroup_mutex));
1003 [ - + ]: 6 : BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1004 : :
1005 : : /* Check that any added subsystems are currently free */
1006 : : for_each_subsys(ss, i) {
1007 : : if (!(added_mask & (1 << i)))
1008 : : continue;
1009 : :
1010 : : /* is the subsystem mounted elsewhere? */
1011 : : if (ss->root != &cgroup_dummy_root) {
1012 : : ret = -EBUSY;
1013 : : goto out_put;
1014 : : }
1015 : :
1016 : : /* pin the module */
1017 : : if (!try_module_get(ss->module)) {
1018 : : ret = -ENOENT;
1019 : : goto out_put;
1020 : : }
1021 : : pinned |= 1 << i;
1022 : : }
1023 : :
1024 : : /* subsys could be missing if unloaded between parsing and here */
1025 [ + - ]: 6 : if (added_mask != pinned) {
1026 : : ret = -ENOENT;
1027 : : goto out_put;
1028 : : }
1029 : :
1030 : : ret = cgroup_populate_dir(cgrp, added_mask);
1031 : : if (ret)
1032 : : goto out_put;
1033 : :
1034 : : /*
1035 : : * Nothing can fail from this point on. Remove files for the
1036 : : * removed subsystems and rebind each subsystem.
1037 : : */
1038 : : cgroup_clear_dir(cgrp, removed_mask);
1039 : :
1040 : : for_each_subsys(ss, i) {
1041 : : unsigned long bit = 1UL << i;
1042 : :
1043 : : if (bit & added_mask) {
1044 : : /* We're binding this subsystem to this hierarchy */
1045 : : BUG_ON(cgroup_css(cgrp, ss));
1046 : : BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1047 : : BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1048 : :
1049 : : rcu_assign_pointer(cgrp->subsys[i],
1050 : : cgroup_css(cgroup_dummy_top, ss));
1051 : : cgroup_css(cgrp, ss)->cgroup = cgrp;
1052 : :
1053 : : list_move(&ss->sibling, &root->subsys_list);
1054 : : ss->root = root;
1055 : : if (ss->bind)
1056 : : ss->bind(cgroup_css(cgrp, ss));
1057 : :
1058 : : /* refcount was already taken, and we're keeping it */
1059 : : root->subsys_mask |= bit;
1060 : : } else if (bit & removed_mask) {
1061 : : /* We're removing this subsystem */
1062 : : BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1063 : : BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1064 : :
1065 : : if (ss->bind)
1066 : : ss->bind(cgroup_css(cgroup_dummy_top, ss));
1067 : :
1068 : : cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1069 : : RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1070 : :
1071 : : cgroup_subsys[i]->root = &cgroup_dummy_root;
1072 : : list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1073 : :
1074 : : /* subsystem is now free - drop reference on module */
1075 : : module_put(ss->module);
1076 : : root->subsys_mask &= ~bit;
1077 : : }
1078 : : }
1079 : :
1080 : : /*
1081 : : * Mark @root has finished binding subsystems. @root->subsys_mask
1082 : : * now matches the bound subsystems.
1083 : : */
1084 : 6 : root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1085 : :
1086 : : return 0;
1087 : :
1088 : : out_put:
1089 : : for_each_subsys(ss, i)
1090 : : if (pinned & (1 << i))
1091 : : module_put(ss->module);
1092 : : return ret;
1093 : : }
1094 : :
/*
 * ->show_options callback (see cgroup_ops). Under cgroup_root_mutex, prints
 * the hierarchy's mount options to @seq as a comma-prefixed list: the names
 * of the bound subsystems, then the flag options, the release agent path,
 * clone_children and the hierarchy name where set. Always returns 0.
 */
1095 : 0 : static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1096 : : {
1097 : 0 : struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1098 : : struct cgroup_subsys *ss;
1099 : :
1100 : 0 : mutex_lock(&cgroup_root_mutex);
1101 [ # # ]: 0 : for_each_root_subsys(root, ss)
1102 : 0 : seq_printf(seq, ",%s", ss->name);
1103 [ # # ]: 0 : if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1104 : 0 : seq_puts(seq, ",sane_behavior");
1105 [ # # ]: 0 : if (root->flags & CGRP_ROOT_NOPREFIX)
1106 : 0 : seq_puts(seq, ",noprefix");
1107 [ # # ]: 0 : if (root->flags & CGRP_ROOT_XATTR)
1108 : 0 : seq_puts(seq, ",xattr");
1109 [ # # ]: 0 : if (strlen(root->release_agent_path))
1110 : 0 : seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1111 [ # # ]: 0 : if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1112 : 0 : seq_puts(seq, ",clone_children");
1113 [ # # ]: 0 : if (strlen(root->name))
1114 : 0 : seq_printf(seq, ",name=%s", root->name);
1115 : 0 : mutex_unlock(&cgroup_root_mutex);
1116 : 0 : return 0;
1117 : : }
1118 : :
/*
 * Parsed mount options, filled in by parse_cgroupfs_options().
 *
 * @subsys_mask: one bit per requested subsystem
 * @flags: CGRP_ROOT_* option flags
 * @release_agent: kstrndup()ed "release_agent=" value, or NULL
 * @cpuset_clone_children: "clone_children" was given
 * @name: kstrndup()ed "name=" value, or NULL
 * @none: "none" was given
 * @new_root: candidate root set by cgroup_mount() and consumed by
 *            cgroup_set_super()
 *
 * @release_agent and @name are owned by whoever called the parser and are
 * released with kfree() by cgroup_mount() / cgroup_remount().
 */
1119 : : struct cgroup_sb_opts {
1120 : : unsigned long subsys_mask;
1121 : : unsigned long flags;
1122 : : char *release_agent;
1123 : : bool cpuset_clone_children;
1124 : : char *name;
1125 : : /* User explicitly requested empty subsystem */
1126 : : bool none;
1127 : :
1128 : : struct cgroupfs_root *new_root;
1129 : :
1130 : : };
1131 : :
1132 : : /*
1133 : : * Convert a hierarchy specifier into a bitmask of subsystems and
1134 : : * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1135 : : * array. NOTE(review): the body below takes no subsystem or module
1136 : : * references; it only fills in @opts. Pinning is done by rebind_subsystems().
1137 : : */
/*
 * @data is split in place on ',' by strsep(). Returns 0 on success, -EINVAL
 * for malformed or conflicting options, -ENOENT for an unknown or disabled
 * subsystem name and -ENOMEM if a string copy fails.
 *
 * On an error return @opts->release_agent and @opts->name may already have
 * been allocated; both callers in this file kfree() them on every path.
 */
1138 : 0 : static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1139 : : {
1140 : 3193 : char *token, *o = data;
1141 : : bool all_ss = false, one_ss = false;
1142 : : unsigned long mask = (unsigned long)-1;
1143 : : struct cgroup_subsys *ss;
1144 : : int i;
1145 : :
1146 [ - + ]: 3193 : BUG_ON(!mutex_is_locked(&cgroup_mutex));
1147 : :
/* @mask: every subsystem bit except cpuset; used for the noprefix check below. */
1148 : : #ifdef CONFIG_CPUSETS
1149 : : mask = ~(1UL << cpuset_subsys_id);
1150 : : #endif
1151 : :
1152 : 3193 : memset(opts, 0, sizeof(*opts));
1153 : :
1154 [ + + ]: 3199 : while ((token = strsep(&o, ",")) != NULL) {
/* An empty token (e.g. ",," or an empty option string) is rejected. */
1155 [ + ]: 6 : if (!*token)
1156 : : return -EINVAL;
1157 [ + + ]: 3199 : if (!strcmp(token, "none")) {
1158 : : /* Explicitly have no subsystems */
1159 : 3 : opts->none = true;
1160 : 3 : continue;
1161 : : }
1162 [ - + ]: 3196 : if (!strcmp(token, "all")) {
1163 : : /* Mutually exclusive option 'all' + subsystem name */
1164 : : if (one_ss)
1165 : : return -EINVAL;
1166 : : all_ss = true;
1167 : 0 : continue;
1168 : : }
1169 [ - + ]: 3 : if (!strcmp(token, "__DEVEL__sane_behavior")) {
1170 : 0 : opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1171 : 0 : continue;
1172 : : }
1173 [ - + ]: 3 : if (!strcmp(token, "noprefix")) {
1174 : 0 : opts->flags |= CGRP_ROOT_NOPREFIX;
1175 : 0 : continue;
1176 : : }
1177 [ - + ]: 3 : if (!strcmp(token, "clone_children")) {
1178 : 0 : opts->cpuset_clone_children = true;
1179 : 0 : continue;
1180 : : }
1181 [ - + ]: 3 : if (!strcmp(token, "xattr")) {
1182 : 0 : opts->flags |= CGRP_ROOT_XATTR;
1183 : 0 : continue;
1184 : : }
/* 14 == strlen("release_agent="); the copy is capped at PATH_MAX - 1 bytes. */
1185 [ - + ]: 3 : if (!strncmp(token, "release_agent=", 14)) {
1186 : : /* Specifying two release agents is forbidden */
1187 [ # # ]: 0 : if (opts->release_agent)
1188 : : return -EINVAL;
1189 : 0 : opts->release_agent =
1190 : 0 : kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1191 [ # # ]: 0 : if (!opts->release_agent)
1192 : : return -ENOMEM;
1193 : 0 : continue;
1194 : : }
/* 5 == strlen("name="). */
1195 [ + - ]: 3 : if (!strncmp(token, "name=", 5)) {
1196 : 3 : const char *name = token + 5;
1197 : : /* Can't specify an empty name */
1198 [ + - ]: 3 : if (!strlen(name))
1199 : : return -EINVAL;
1200 : : /* Must match [\w.-]+ */
/* NOTE(review): strlen(name) is re-evaluated on every iteration. */
1201 [ + + ]: 12 : for (i = 0; i < strlen(name); i++) {
1202 : 9 : char c = name[i];
1203 [ + - ]: 9 : if (isalnum(c))
1204 : 9 : continue;
1205 [ # # ]: 0 : if ((c == '.') || (c == '-') || (c == '_'))
1206 : 0 : continue;
1207 : : return -EINVAL;
1208 : : }
1209 : : /* Specifying two names is forbidden */
1210 [ + - ]: 3 : if (opts->name)
1211 : : return -EINVAL;
/* Names longer than MAX_CGROUP_ROOT_NAMELEN - 1 are silently truncated. */
1212 : 3 : opts->name = kstrndup(name,
1213 : : MAX_CGROUP_ROOT_NAMELEN - 1,
1214 : : GFP_KERNEL);
1215 [ + - ]: 3 : if (!opts->name)
1216 : : return -ENOMEM;
1217 : :
1218 : 6 : continue;
1219 : : }
1220 : :
/* Anything else must be the name of an enabled subsystem. */
1221 : : for_each_subsys(ss, i) {
1222 : : if (strcmp(token, ss->name))
1223 : : continue;
1224 : : if (ss->disabled)
1225 : : continue;
1226 : :
1227 : : /* Mutually exclusive option 'all' + subsystem name */
1228 : : if (all_ss)
1229 : : return -EINVAL;
1230 : : set_bit(i, &opts->subsys_mask);
1231 : : one_ss = true;
1232 : :
1233 : : break;
1234 : : }
/*
 * No break was taken above, i.e. no enabled subsystem matched
 * (assumes for_each_subsys() leaves i == CGROUP_SUBSYS_COUNT when it
 * runs to completion - confirm against the macro).
 */
1235 : : if (i == CGROUP_SUBSYS_COUNT)
1236 : : return -ENOENT;
1237 : : }
1238 : :
1239 : : /*
1240 : : * If the 'all' option was specified select all the subsystems,
1241 : : * otherwise if 'none', 'name=' and a subsystem name options
1242 : : * were not specified, let's default to 'all'
1243 : : */
1244 : : if (all_ss || (!one_ss && !opts->none && !opts->name))
1245 : : for_each_subsys(ss, i)
1246 : : if (!ss->disabled)
1247 : : set_bit(i, &opts->subsys_mask);
1248 : :
1249 : : /* Consistency checks */
1250 : :
1251 [ - + ]: 3193 : if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1252 : 0 : pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1253 : :
1254 [ # # ]: 0 : if (opts->flags & CGRP_ROOT_NOPREFIX) {
1255 : 0 : pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1256 : 0 : return -EINVAL;
1257 : : }
1258 : :
1259 [ # # ]: 0 : if (opts->cpuset_clone_children) {
1260 : 0 : pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1261 : 0 : return -EINVAL;
1262 : : }
1263 : : }
1264 : :
1265 : : /*
1266 : : * Option noprefix was introduced just for backward compatibility
1267 : : * with the old cpuset, so we allow noprefix only if mounting just
1268 : : * the cpuset subsystem.
1269 : : */
1270 [ - + ][ # # ]: 3193 : if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1271 : : return -EINVAL;
1272 : :
1273 : :
1274 : : /* Can't specify "none" and some subsystems */
1275 [ - + ][ # # ]: 3193 : if (opts->subsys_mask && opts->none)
1276 : : return -EINVAL;
1277 : :
1278 : : /*
1279 : : * We either have to specify by name or by subsystems. (So all
1280 : : * empty hierarchies must have a name).
1281 : : */
1282 [ + - ][ + + ]: 3193 : if (!opts->subsys_mask && !opts->name)
1283 : : return -EINVAL;
1284 : :
1285 : 3 : return 0;
1286 : : }
1287 : :
/*
 * ->remount_fs callback (see cgroup_ops).
 *
 * Re-parses @data and rebinds subsystems so that the hierarchy matches the
 * new subsystem mask; optionally replaces the release agent path. Remount is
 * refused with -EINVAL for sane_behavior hierarchies and when flags or name
 * differ from the existing root, and with -EBUSY when the hierarchy contains
 * more than the top cgroup. Lock order: top directory i_mutex, cgroup_mutex,
 * cgroup_root_mutex.
 */
1288 : 0 : static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1289 : : {
1290 : : int ret = 0;
1291 : 0 : struct cgroupfs_root *root = sb->s_fs_info;
1292 : : struct cgroup *cgrp = &root->top_cgroup;
1293 : : struct cgroup_sb_opts opts;
1294 : : unsigned long added_mask, removed_mask;
1295 : :
1296 [ # # ]: 0 : if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1297 : 0 : pr_err("cgroup: sane_behavior: remount is not allowed\n");
1298 : 0 : return -EINVAL;
1299 : : }
1300 : :
1301 : 0 : mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1302 : 0 : mutex_lock(&cgroup_mutex);
1303 : 0 : mutex_lock(&cgroup_root_mutex);
1304 : :
1305 : : /* See what subsystems are wanted */
1306 : 0 : ret = parse_cgroupfs_options(data, &opts);
1307 [ # # ]: 0 : if (ret)
1308 : : goto out_unlock;
1309 : :
1310 [ # # ][ # # ]: 0 : if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1311 : 0 : pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1312 : : task_tgid_nr(current), current->comm);
1313 : :
/* Bits newly requested vs. bits currently bound but no longer requested. */
1314 : 0 : added_mask = opts.subsys_mask & ~root->subsys_mask;
1315 : : removed_mask = root->subsys_mask & ~opts.subsys_mask;
1316 : :
1317 : : /* Don't allow flags or name to change at remount */
1318 [ # # ][ # # ]: 0 : if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1319 [ # # ]: 0 : (opts.name && strcmp(opts.name, root->name))) {
1320 [ # # ]: 0 : pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1321 : : opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1322 : : root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1323 : : ret = -EINVAL;
1324 : 0 : goto out_unlock;
1325 : : }
1326 : :
1327 : : /* remounting is not allowed for populated hierarchies */
1328 [ # # ]: 0 : if (root->number_of_cgroups > 1) {
1329 : : ret = -EBUSY;
1330 : : goto out_unlock;
1331 : : }
1332 : :
1333 : 0 : ret = rebind_subsystems(root, added_mask, removed_mask);
1334 [ # # ]: 0 : if (ret)
1335 : : goto out_unlock;
1336 : :
/*
 * opts.release_agent was copied with a PATH_MAX - 1 cap by the parser;
 * presumably release_agent_path is PATH_MAX bytes - confirm.
 */
1337 [ # # ]: 0 : if (opts.release_agent)
1338 : 0 : strcpy(root->release_agent_path, opts.release_agent);
1339 : : out_unlock:
/* The parser's string copies are released on success and failure alike. */
1340 : 0 : kfree(opts.release_agent);
1341 : 0 : kfree(opts.name);
1342 : 0 : mutex_unlock(&cgroup_root_mutex);
1343 : 0 : mutex_unlock(&cgroup_mutex);
1344 : 0 : mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1345 : 0 : return ret;
1346 : : }
1347 : :
/* Superblock operations installed on every cgroup superblock by cgroup_set_super(). */
1348 : : static const struct super_operations cgroup_ops = {
1349 : : .statfs = simple_statfs,
1350 : : .drop_inode = generic_delete_inode,
1351 : : .show_options = cgroup_show_options,
1352 : : .remount_fs = cgroup_remount,
1353 : : };
1354 : :
/*
 * Initialise the list heads, locks and xattr store embedded in @cgrp and
 * point its dummy_css back at the cgroup. No other field of @cgrp is
 * touched here.
 */
1355 : 0 : static void init_cgroup_housekeeping(struct cgroup *cgrp)
1356 : : {
1357 : 5 : INIT_LIST_HEAD(&cgrp->sibling);
1358 : 5 : INIT_LIST_HEAD(&cgrp->children);
1359 : 5 : INIT_LIST_HEAD(&cgrp->files);
1360 : 5 : INIT_LIST_HEAD(&cgrp->cset_links);
1361 : 5 : INIT_LIST_HEAD(&cgrp->release_list);
1362 : 5 : INIT_LIST_HEAD(&cgrp->pidlists);
1363 : 5 : mutex_init(&cgrp->pidlist_mutex);
1364 : 5 : cgrp->dummy_css.cgroup = cgrp;
1365 : 5 : INIT_LIST_HEAD(&cgrp->event_list);
1366 : 5 : spin_lock_init(&cgrp->event_list_lock);
1367 : : simple_xattrs_init(&cgrp->xattrs);
1368 : 5 : }
1369 : :
/*
 * Initialise a freshly allocated hierarchy root: empty subsystem and root
 * lists, a cgroup count of 1 (the embedded top cgroup), the top cgroup's
 * back-pointer, name and housekeeping fields, and the per-root cgroup idr.
 */
1370 : 0 : static void init_cgroup_root(struct cgroupfs_root *root)
1371 : : {
1372 : 3 : struct cgroup *cgrp = &root->top_cgroup;
1373 : :
1374 : 3 : INIT_LIST_HEAD(&root->subsys_list);
1375 : 3 : INIT_LIST_HEAD(&root->root_list);
1376 : 3 : root->number_of_cgroups = 1;
1377 : 3 : cgrp->root = root;
1378 : 3 : RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1379 : 3 : init_cgroup_housekeeping(cgrp);
1380 : 3 : idr_init(&root->cgroup_idr);
1381 : 3 : }
1382 : :
/*
 * Allocate a hierarchy id for @root from cgroup_hierarchy_idr, passing
 * @start and @end through to idr_alloc_cyclic(), and store it in
 * root->hierarchy_id. Requires cgroup_mutex and cgroup_root_mutex.
 *
 * Returns 0 on success or the negative error from idr_alloc_cyclic().
 */
1383 : : static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1384 : : {
1385 : : int id;
1386 : :
1387 : : lockdep_assert_held(&cgroup_mutex);
1388 : : lockdep_assert_held(&cgroup_root_mutex);
1389 : :
1390 : 3 : id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1391 : : GFP_KERNEL);
1392 [ # # + - ]: 3 : if (id < 0)
1393 : : return id;
1394 : :
1395 : 3 : root->hierarchy_id = id;
1396 : : return 0;
1397 : : }
1398 : :
/*
 * Release @root's hierarchy id, if it has one, and reset it to 0 so that a
 * second call is a no-op. Requires cgroup_mutex and cgroup_root_mutex.
 */
1399 : : static void cgroup_exit_root_id(struct cgroupfs_root *root)
1400 : : {
1401 : : lockdep_assert_held(&cgroup_mutex);
1402 : : lockdep_assert_held(&cgroup_root_mutex);
1403 : :
1404 [ + - ][ # # ]: 3 : if (root->hierarchy_id) {
1405 : 3 : idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1406 : 3 : root->hierarchy_id = 0;
1407 : : }
1408 : : }
1409 : :
/*
 * Test callback handed to sget() by cgroup_mount(): returns 1 if the
 * existing superblock @sb satisfies the mount options in @data (a
 * struct cgroup_sb_opts), 0 otherwise.
 */
1410 : 0 : static int cgroup_test_super(struct super_block *sb, void *data)
1411 : : {
1412 : : struct cgroup_sb_opts *opts = data;
1413 : 0 : struct cgroupfs_root *root = sb->s_fs_info;
1414 : :
1415 : : /* If we asked for a name then it must match */
1416 [ # # ][ # # ]: 0 : if (opts->name && strcmp(opts->name, root->name))
1417 : : return 0;
1418 : :
1419 : : /*
1420 : : * If we asked for subsystems (or explicitly for no
1421 : : * subsystems) then they must match
1422 : : */
1423 [ # # ][ # # ]: 0 : if ((opts->subsys_mask || opts->none)
1424 [ # # ]: 0 : && (opts->subsys_mask != root->subsys_mask))
1425 : : return 0;
1426 : :
1427 : 0 : return 1;
1428 : : }
1429 : :
/*
 * Build a new hierarchy root from parsed mount options.
 *
 * Returns NULL when @opts requests neither subsystems nor "none" (such a
 * mount can only match an existing superblock, since cgroup_set_super()
 * refuses to set up a new one without a root), ERR_PTR(-ENOMEM) if the
 * allocation fails, or the initialised root otherwise.
 */
1430 : 0 : static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1431 : : {
1432 : : struct cgroupfs_root *root;
1433 : :
1434 [ + - ][ + - ]: 3 : if (!opts->subsys_mask && !opts->none)
1435 : : return NULL;
1436 : :
1437 : : root = kzalloc(sizeof(*root), GFP_KERNEL);
1438 [ + - ]: 3 : if (!root)
1439 : : return ERR_PTR(-ENOMEM);
1440 : :
1441 : 3 : init_cgroup_root(root);
1442 : :
1443 : : /*
1444 : : * We need to set @root->subsys_mask now so that @root can be
1445 : : * matched by cgroup_test_super() before it finishes
1446 : : * initialization; otherwise, competing mounts with the same
1447 : : * options may try to bind the same subsystems instead of waiting
1448 : : * for the first one leading to unexpected mount errors.
1449 : : * SUBSYS_BOUND will be set once actual binding is complete.
1450 : : */
1451 : 3 : root->subsys_mask = opts->subsys_mask;
1452 : 3 : root->flags = opts->flags;
/*
 * The parser caps these strings at PATH_MAX - 1 and
 * MAX_CGROUP_ROOT_NAMELEN - 1 bytes; presumably the destination arrays are
 * PATH_MAX and MAX_CGROUP_ROOT_NAMELEN bytes - confirm.
 */
1453 [ - + ]: 3 : if (opts->release_agent)
1454 : 0 : strcpy(root->release_agent_path, opts->release_agent);
1455 [ + - ]: 3 : if (opts->name)
1456 : 3 : strcpy(root->name, opts->name);
1457 [ # # ]: 3 : if (opts->cpuset_clone_children)
1458 : 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1459 : 3 : return root;
1460 : : }
1461 : :
/*
 * Free a hierarchy root allocated by cgroup_root_from_opts(). A NULL @root
 * is accepted and ignored. Warns once if the hierarchy id is still set.
 */
1462 : 0 : static void cgroup_free_root(struct cgroupfs_root *root)
1463 : : {
1464 [ + - ]: 3 : if (root) {
1465 : : /* hierarchy ID should already have been released */
1466 [ - + ][ # # ]: 3 : WARN_ON_ONCE(root->hierarchy_id);
[ # # ]
1467 : :
1468 : 3 : idr_destroy(&root->cgroup_idr);
1469 : 3 : kfree(root);
1470 : : }
1471 : 3 : }
1472 : :
/*
 * Set callback handed to sget() by cgroup_mount(): initialises a brand-new
 * superblock @sb for the root carried in @data (a struct cgroup_sb_opts).
 *
 * Returns -EINVAL if no new root was prepared, the error from
 * set_anon_super() if that fails, and 0 once @sb and the root point at each
 * other and the block size, magic and super operations are set.
 */
1473 : 0 : static int cgroup_set_super(struct super_block *sb, void *data)
1474 : : {
1475 : : int ret;
1476 : : struct cgroup_sb_opts *opts = data;
1477 : :
1478 : : /* If we don't have a new root, we can't set up a new sb */
1479 [ + - ]: 3 : if (!opts->new_root)
1480 : : return -EINVAL;
1481 : :
1482 [ + - ][ - + ]: 3 : BUG_ON(!opts->subsys_mask && !opts->none);
1483 : :
1484 : 3 : ret = set_anon_super(sb, NULL);
1485 [ + - ]: 3 : if (ret)
1486 : : return ret;
1487 : :
1488 : 3 : sb->s_fs_info = opts->new_root;
1489 : 3 : opts->new_root->sb = sb;
1490 : :
1491 : 3 : sb->s_blocksize = PAGE_CACHE_SIZE;
1492 : 3 : sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1493 : 3 : sb->s_magic = CGROUP_SUPER_MAGIC;
1494 : 3 : sb->s_op = &cgroup_ops;
1495 : :
1496 : 3 : return 0;
1497 : : }
1498 : :
/*
 * Create the root directory inode and dentry for @sb and install the cgroup
 * dentry operations as the superblock default.
 *
 * Returns 0 on success, -ENOMEM if the inode or the root dentry cannot be
 * allocated.
 */
1499 : 0 : static int cgroup_get_rootdir(struct super_block *sb)
1500 : : {
1501 : : static const struct dentry_operations cgroup_dops = {
1502 : : .d_iput = cgroup_diput,
1503 : : .d_delete = always_delete_dentry,
1504 : : };
1505 : :
1506 : 3 : struct inode *inode =
1507 : : cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1508 : :
1509 [ + - ]: 3 : if (!inode)
1510 : : return -ENOMEM;
1511 : :
1512 : 3 : inode->i_fop = &simple_dir_operations;
1513 : 3 : inode->i_op = &cgroup_dir_inode_operations;
1514 : : /* directories start off with i_nlink == 2 (for "." entry) */
1515 : 3 : inc_nlink(inode);
1516 : 3 : sb->s_root = d_make_root(inode);
1517 [ + - ]: 3 : if (!sb->s_root)
1518 : : return -ENOMEM;
/* s_d_op is assigned only after the root dentry has been created. */
1519 : : /* for everything else we want ->d_op set */
1520 : 3 : sb->s_d_op = &cgroup_dops;
1521 : 3 : return 0;
1522 : : }
1523 : :
1524 : 0 : static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1525 : : int flags, const char *unused_dev_name,
1526 : : void *data)
1527 : : {
1528 : : struct cgroup_sb_opts opts;
1529 : : struct cgroupfs_root *root;
1530 : : int ret = 0;
1531 : : struct super_block *sb;
1532 : : struct cgroupfs_root *new_root;
1533 : : struct list_head tmp_links;
1534 : : struct inode *inode;
1535 : : const struct cred *cred;
1536 : :
1537 : : /* First find the desired set of subsystems */
1538 : 3193 : mutex_lock(&cgroup_mutex);
1539 : 3193 : ret = parse_cgroupfs_options(data, &opts);
1540 : 3193 : mutex_unlock(&cgroup_mutex);
1541 [ + + ]: 3193 : if (ret)
1542 : : goto out_err;
1543 : :
1544 : : /*
1545 : : * Allocate a new cgroup root. We may not need it if we're
1546 : : * reusing an existing hierarchy.
1547 : : */
1548 : 3 : new_root = cgroup_root_from_opts(&opts);
1549 [ - + ]: 3 : if (IS_ERR(new_root)) {
1550 : : ret = PTR_ERR(new_root);
1551 : 0 : goto out_err;
1552 : : }
1553 : 3 : opts.new_root = new_root;
1554 : :
1555 : : /* Locate an existing or new sb for this hierarchy */
1556 : 3 : sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1557 [ - + ]: 3 : if (IS_ERR(sb)) {
1558 : : ret = PTR_ERR(sb);
1559 : 0 : cgroup_free_root(opts.new_root);
1560 : 0 : goto out_err;
1561 : : }
1562 : :
1563 : 3 : root = sb->s_fs_info;
1564 [ - + ]: 3 : BUG_ON(!root);
1565 [ + - ]: 3 : if (root == opts.new_root) {
1566 : : /* We used the new root structure, so this is a new hierarchy */
1567 : 3 : struct cgroup *root_cgrp = &root->top_cgroup;
1568 : : struct cgroupfs_root *existing_root;
1569 : : int i;
1570 : : struct css_set *cset;
1571 : :
1572 [ - + ]: 3 : BUG_ON(sb->s_root != NULL);
1573 : :
1574 : 3 : ret = cgroup_get_rootdir(sb);
1575 [ + - ]: 3 : if (ret)
1576 : : goto drop_new_super;
1577 : 3 : inode = sb->s_root->d_inode;
1578 : :
1579 : 3 : mutex_lock(&inode->i_mutex);
1580 : 3 : mutex_lock(&cgroup_mutex);
1581 : 3 : mutex_lock(&cgroup_root_mutex);
1582 : :
1583 : 3 : root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1584 : : 0, 1, GFP_KERNEL);
1585 [ + - ]: 3 : if (root_cgrp->id < 0)
1586 : : goto unlock_drop;
1587 : :
1588 : : /* Check for name clashes with existing mounts */
1589 : : ret = -EBUSY;
1590 [ + - ]: 3 : if (strlen(root->name))
1591 [ - + ]: 3 : for_each_active_root(existing_root)
1592 [ # # ]: 0 : if (!strcmp(existing_root->name, root->name))
1593 : : goto unlock_drop;
1594 : :
1595 : : /*
1596 : : * We're accessing css_set_count without locking
1597 : : * css_set_lock here, but that's OK - it can only be
1598 : : * increased by someone holding cgroup_lock, and
1599 : : * that's us. The worst that can happen is that we
1600 : : * have some link structures left over
1601 : : */
1602 : 3 : ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1603 [ + - ]: 3 : if (ret)
1604 : : goto unlock_drop;
1605 : :
1606 : : /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1607 : : ret = cgroup_init_root_id(root, 2, 0);
1608 [ + - ]: 3 : if (ret)
1609 : : goto unlock_drop;
1610 : :
1611 : 3 : sb->s_root->d_fsdata = root_cgrp;
1612 : 3 : root_cgrp->dentry = sb->s_root;
1613 : :
1614 : : /*
1615 : : * We're inside get_sb() and will call lookup_one_len() to
1616 : : * create the root files, which doesn't work if SELinux is
1617 : : * in use. The following cred dancing somehow works around
1618 : : * it. See 2ce9738ba ("cgroupfs: use init_cred when
1619 : : * populating new cgroupfs mount") for more details.
1620 : : */
1621 : 3 : cred = override_creds(&init_cred);
1622 : :
1623 : 3 : ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1624 [ + - ]: 3 : if (ret)
1625 : : goto rm_base_files;
1626 : :
1627 : 3 : ret = rebind_subsystems(root, root->subsys_mask, 0);
1628 [ + - ]: 3 : if (ret)
1629 : : goto rm_base_files;
1630 : :
1631 : 3 : revert_creds(cred);
1632 : :
1633 : : /*
1634 : : * There must be no failure case after here, since rebinding
1635 : : * takes care of subsystems' refcounts, which are explicitly
1636 : : * dropped in the failure exit path.
1637 : : */
1638 : :
1639 : 3 : list_add(&root->root_list, &cgroup_roots);
1640 : 3 : cgroup_root_count++;
1641 : :
1642 : : /* Link the top cgroup in this hierarchy into all
1643 : : * the css_set objects */
1644 : 3 : write_lock(&css_set_lock);
1645 [ + + ][ - + ]: 3583 : hash_for_each(css_set_table, i, cset, hlist)
[ + + ][ + + ]
1646 : 3 : link_css_set(&tmp_links, cset, root_cgrp);
1647 : : write_unlock(&css_set_lock);
1648 : :
1649 : 3 : free_cgrp_cset_links(&tmp_links);
1650 : :
1651 [ - + ]: 3 : BUG_ON(!list_empty(&root_cgrp->children));
1652 [ - + ]: 3 : BUG_ON(root->number_of_cgroups != 1);
1653 : :
1654 : 3 : mutex_unlock(&cgroup_root_mutex);
1655 : 3 : mutex_unlock(&cgroup_mutex);
1656 : 3 : mutex_unlock(&inode->i_mutex);
1657 : : } else {
1658 : : /*
1659 : : * We re-used an existing hierarchy - the new root (if
1660 : : * any) is not needed
1661 : : */
1662 : 0 : cgroup_free_root(opts.new_root);
1663 : :
1664 [ # # ]: 0 : if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1665 [ # # ]: 0 : if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1666 : 0 : pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1667 : : ret = -EINVAL;
1668 : 0 : goto drop_new_super;
1669 : : } else {
1670 : 0 : pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1671 : : }
1672 : : }
1673 : : }
1674 : :
1675 : 3 : kfree(opts.release_agent);
1676 : 3 : kfree(opts.name);
1677 : 3 : return dget(sb->s_root);
1678 : :
1679 : : rm_base_files:
1680 : 0 : free_cgrp_cset_links(&tmp_links);
1681 : 0 : cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1682 : 0 : revert_creds(cred);
1683 : : unlock_drop:
1684 : : cgroup_exit_root_id(root);
1685 : 0 : mutex_unlock(&cgroup_root_mutex);
1686 : 0 : mutex_unlock(&cgroup_mutex);
1687 : 0 : mutex_unlock(&inode->i_mutex);
1688 : : drop_new_super:
1689 : 0 : deactivate_locked_super(sb);
1690 : : out_err:
1691 : 3190 : kfree(opts.release_agent);
1692 : 3190 : kfree(opts.name);
1693 : 3190 : return ERR_PTR(ret);
1694 : : }
1695 : :
/*
 * ->kill_sb callback of cgroup_fs_type: tear down a hierarchy when its
 * superblock goes away. Unbinds all subsystems (if binding had completed),
 * frees every css_set link to the top cgroup, takes the root off the root
 * list, releases the hierarchy id, and finally frees the top cgroup's
 * xattrs, the superblock contents and the root itself.
 *
 * NOTE(review): &root->top_cgroup is computed before BUG_ON(!root), so that
 * check comes after @root has already been used in an address computation.
 *
 * NOTE(review): cgrp->dentry is dereferenced unconditionally below, but
 * cgroup_mount() assigns root_cgrp->dentry only after several steps that can
 * fail, and its failure path calls deactivate_locked_super(). Confirm this
 * callback cannot run for a new root whose dentry is still NULL.
 */
1696 : 0 : static void cgroup_kill_sb(struct super_block *sb) {
1697 : 3 : struct cgroupfs_root *root = sb->s_fs_info;
1698 : : struct cgroup *cgrp = &root->top_cgroup;
1699 : : struct cgrp_cset_link *link, *tmp_link;
1700 : : int ret;
1701 : :
1702 [ - + ]: 3 : BUG_ON(!root);
1703 : :
/* Only the top cgroup may remain in the hierarchy at this point. */
1704 [ - + ]: 3 : BUG_ON(root->number_of_cgroups != 1);
1705 [ - + ]: 3 : BUG_ON(!list_empty(&cgrp->children));
1706 : :
1707 : 3 : mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1708 : 3 : mutex_lock(&cgroup_mutex);
1709 : 3 : mutex_lock(&cgroup_root_mutex);
1710 : :
1711 : : /* Rebind all subsystems back to the default hierarchy */
1712 [ + - ]: 3 : if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1713 : 3 : ret = rebind_subsystems(root, 0, root->subsys_mask);
1714 : : /* Shouldn't be able to fail ... */
1715 [ - + ]: 3 : BUG_ON(ret);
1716 : : }
1717 : :
1718 : : /*
1719 : : * Release all the links from cset_links to this hierarchy's
1720 : : * root cgroup
1721 : : */
1722 : 3 : write_lock(&css_set_lock);
1723 : :
1724 [ + + ]: 6 : list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1725 : : list_del(&link->cset_link);
1726 : : list_del(&link->cgrp_link);
1727 : 3 : kfree(link);
1728 : : }
1729 : : write_unlock(&css_set_lock);
1730 : :
/* root_list is empty if the mount failed before the root was listed. */
1731 [ + - ]: 3 : if (!list_empty(&root->root_list)) {
1732 : : list_del(&root->root_list);
1733 : 3 : cgroup_root_count--;
1734 : : }
1735 : :
1736 : : cgroup_exit_root_id(root);
1737 : :
1738 : 3 : mutex_unlock(&cgroup_root_mutex);
1739 : 3 : mutex_unlock(&cgroup_mutex);
1740 : 3 : mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1741 : :
1742 : 3 : simple_xattrs_free(&cgrp->xattrs);
1743 : :
1744 : 3 : kill_litter_super(sb);
1745 : 3 : cgroup_free_root(root);
1746 : 3 : }
1747 : :
/* Filesystem type "cgroup": mounted by cgroup_mount(), torn down by cgroup_kill_sb(). */
1748 : : static struct file_system_type cgroup_fs_type = {
1749 : : .name = "cgroup",
1750 : : .mount = cgroup_mount,
1751 : : .kill_sb = cgroup_kill_sb,
1752 : : };
1753 : :
/* NOTE(review): not used within this part of the file; presumably the kobject backing the cgroup sysfs directory - confirm. */
1754 : : static struct kobject *cgroup_kobj;
1755 : :
1756 : : /**
1757 : : * cgroup_path - generate the path of a cgroup
1758 : : * @cgrp: the cgroup in question
1759 : : * @buf: the buffer to write the path into
1760 : : * @buflen: the length of the buffer
1761 : : *
1762 : : * Writes path of cgroup into buf. Returns 0 on success, -ENAMETOOLONG if
1763 : : * the path does not fit into @buflen bytes.
1764 : : * We can't generate cgroup path using dentry->d_name, as accessing
1765 : : * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1766 : : * inode's i_mutex, while on the other hand cgroup_path() can be called
1767 : : * with some irq-safe spinlocks held.
1768 : : */
/*
 * The path is assembled backwards from the end of @buf, one "/name"
 * component per ancestor, and moved to the front of the buffer at the end.
 *
 * NOTE(review): for a non-root cgroup a @buflen of 0 or less makes the
 * initial NUL store land before @buf; callers must pass a positive length.
 */
1769 : 0 : int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1770 : : {
1771 : : int ret = -ENAMETOOLONG;
1772 : : char *start;
1773 : :
/* A cgroup without a parent is the top of its hierarchy: its path is "/". */
1774 [ # # ]: 0 : if (!cgrp->parent) {
1775 [ # # ]: 0 : if (strlcpy(buf, "/", buflen) >= buflen)
1776 : : return -ENAMETOOLONG;
1777 : 0 : return 0;
1778 : : }
1779 : :
1780 : 0 : start = buf + buflen - 1;
1781 : 0 : *start = '\0';
1782 : :
1783 : : rcu_read_lock();
1784 : : do {
1785 : 0 : const char *name = cgroup_name(cgrp);
1786 : : int len;
1787 : :
/* Prepend this component's name, then the '/' separator in front of it. */
1788 : 0 : len = strlen(name);
1789 [ # # ]: 0 : if ((start -= len) < buf)
1790 : : goto out;
1791 : 0 : memcpy(start, name, len);
1792 : :
1793 [ # # ]: 0 : if (--start < buf)
1794 : : goto out;
1795 : 0 : *start = '/';
1796 : :
1797 : 0 : cgrp = cgrp->parent;
1798 [ # # ]: 0 : } while (cgrp->parent);
1799 : : ret = 0;
/* Source and destination may overlap, hence memmove(); the NUL is included. */
1800 : 0 : memmove(buf, start, buf + buflen - start);
1801 : : out:
1802 : : rcu_read_unlock();
1803 : 0 : return ret;
1804 : : }
1805 : : EXPORT_SYMBOL_GPL(cgroup_path);
1806 : :
1807 : : /**
1808 : : * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1809 : : * @task: target task
1810 : : * @buf: the buffer to write the path into
1811 : : * @buflen: the length of the buffer
1812 : : *
1813 : : * Determine @task's cgroup on the first (the one with the lowest non-zero
1814 : : * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1815 : : * function grabs cgroup_mutex and shouldn't be used inside locks used by
1816 : : * cgroup controller callbacks.
1817 : : *
1818 : : * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
1819 : : */
1820 : 0 : int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1821 : : {
1822 : : struct cgroupfs_root *root;
1823 : : struct cgroup *cgrp;
1824 : 0 : int hierarchy_id = 1, ret = 0;
1825 : :
1826 [ # # ]: 0 : if (buflen < 2)
1827 : : return -ENAMETOOLONG;
1828 : :
1829 : 0 : mutex_lock(&cgroup_mutex);
1830 : :
1831 : 0 : root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1832 : :
1833 [ # # ]: 0 : if (root) {
1834 : 0 : cgrp = task_cgroup_from_root(task, root);
1835 : 0 : ret = cgroup_path(cgrp, buf, buflen);
1836 : : } else {
1837 : : /* if no hierarchy exists, everyone is in "/" */
1838 : 0 : memcpy(buf, "/", 2);
1839 : : }
1840 : :
1841 : 0 : mutex_unlock(&cgroup_mutex);
1842 : 0 : return ret;
1843 : : }
1844 : : EXPORT_SYMBOL_GPL(task_cgroup_path);
1845 : :
/*
 * Control Group taskset
 */

/* One migrating task together with its source cgroup and target css_set. */
struct task_and_cgroup {
	struct task_struct *task;	/* the task being moved */
	struct cgroup *cgrp;		/* cgroup the task is being moved out of */
	struct css_set *cset;		/* css_set the task will be switched to */
};

/*
 * Set of tasks handed to the subsystem attach callbacks.  When @tc_array
 * is NULL the set consists of @single only; otherwise the first
 * @tc_array_len entries of @tc_array make up the set.
 */
struct cgroup_taskset {
	struct task_and_cgroup single;	/* used when tc_array is NULL */
	struct flex_array *tc_array;	/* array of struct task_and_cgroup */
	int tc_array_len;		/* number of valid entries in tc_array */
	int idx;			/* iteration cursor into tc_array */
	struct cgroup *cur_cgrp;	/* cgroup of the most recently returned task */
};
1862 : :
1863 : : /**
1864 : : * cgroup_taskset_first - reset taskset and return the first task
1865 : : * @tset: taskset of interest
1866 : : *
1867 : : * @tset iteration is initialized and the first task is returned.
1868 : : */
1869 : 0 : struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1870 : : {
1871 [ # # ]: 0 : if (tset->tc_array) {
1872 : 0 : tset->idx = 0;
1873 : 0 : return cgroup_taskset_next(tset);
1874 : : } else {
1875 : 0 : tset->cur_cgrp = tset->single.cgrp;
1876 : 0 : return tset->single.task;
1877 : : }
1878 : : }
1879 : : EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1880 : :
1881 : : /**
1882 : : * cgroup_taskset_next - iterate to the next task in taskset
1883 : : * @tset: taskset of interest
1884 : : *
1885 : : * Return the next task in @tset. Iteration must have been initialized
1886 : : * with cgroup_taskset_first().
1887 : : */
1888 : 0 : struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1889 : : {
1890 : : struct task_and_cgroup *tc;
1891 : :
1892 [ # # ][ # # ]: 0 : if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1893 : : return NULL;
1894 : :
1895 : 0 : tc = flex_array_get(tset->tc_array, tset->idx++);
1896 : 0 : tset->cur_cgrp = tc->cgrp;
1897 : 0 : return tc->task;
1898 : : }
1899 : : EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1900 : :
1901 : : /**
1902 : : * cgroup_taskset_cur_css - return the matching css for the current task
1903 : : * @tset: taskset of interest
1904 : : * @subsys_id: the ID of the target subsystem
1905 : : *
1906 : : * Return the css for the current (last returned) task of @tset for
1907 : : * subsystem specified by @subsys_id. This function must be preceded by
1908 : : * either cgroup_taskset_first() or cgroup_taskset_next().
1909 : : */
1910 : 0 : struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1911 : : int subsys_id)
1912 : : {
1913 : 0 : return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1914 : : }
1915 : : EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1916 : :
1917 : : /**
1918 : : * cgroup_taskset_size - return the number of tasks in taskset
1919 : : * @tset: taskset of interest
1920 : : */
1921 : 0 : int cgroup_taskset_size(struct cgroup_taskset *tset)
1922 : : {
1923 [ # # ]: 0 : return tset->tc_array ? tset->tc_array_len : 1;
1924 : : }
1925 : : EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1926 : :
1927 : :
/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 * @old_cgrp: cgroup the task is leaving; gets CGRP_RELEASABLE set
 * @tsk: the task being moved
 * @new_cset: css_set to install in @tsk
 *
 * Switches @tsk->cgroups to @new_cset, re-links the task onto
 * @new_cset's task list if the task is on a css_set list at all, and
 * drops the reference on the css_set the task used before.
 *
 * Must be called with cgroup_mutex and threadgroup locked.
 */
static void cgroup_task_migrate(struct cgroup *old_cgrp,
				struct task_struct *tsk,
				struct css_set *new_cset)
{
	struct css_set *old_cset;

	/*
	 * We are synchronized through threadgroup_lock() against PF_EXITING
	 * setting such that we can't race against cgroup_exit() changing the
	 * css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	/* sample the old css_set before it gets replaced below */
	old_cset = task_css_set(tsk);

	/* publish the new css_set under task_lock */
	task_lock(tsk);
	rcu_assign_pointer(tsk->cgroups, new_cset);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list))
		list_move(&tsk->cg_list, &new_cset->tasks);
	write_unlock(&css_set_lock);

	/*
	 * We just gained a reference on old_cset by taking it from the
	 * task. As trading it for new_cset is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
	put_css_set(old_cset);
}
1965 : :
/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @tsk: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
 * task_lock of @tsk or each thread in the threadgroup individually in turn.
 *
 * The work is split into a snapshot phase (step 0), a veto phase
 * (->can_attach, step 1), css_set preparation (step 2), the actual
 * migration (step 3) and the ->attach notifications (step 4).  Failures
 * in steps 1 and 2 are rolled back through the labels at the bottom.
 *
 * Returns 0 on success, including the case where no task needed to be
 * moved; -ENOMEM if the task array or a css_set could not be allocated;
 * otherwise the error reported by flex_array_prealloc() or by a
 * subsystem's ->can_attach() callback.
 */
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
			      bool threadgroup)
{
	int retval, i, group_size;
	struct cgroup_subsys *ss, *failed_ss = NULL;
	struct cgroupfs_root *root = cgrp->root;
	/* threadgroup list cursor and array */
	struct task_struct *leader = tsk;
	struct task_and_cgroup *tc;
	struct flex_array *group;
	struct cgroup_taskset tset = { };

	/*
	 * step 0: in order to do expensive, possibly blocking operations for
	 * every thread, we cannot iterate the thread group list, since it needs
	 * rcu or tasklist locked. instead, build an array of all threads in the
	 * group - group_rwsem prevents new threads from appearing, and if
	 * threads exit, this will just be an over-estimate.
	 */
	if (threadgroup)
		group_size = get_nr_threads(tsk);
	else
		group_size = 1;
	/* flex_array supports very large thread-groups better than kmalloc. */
	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
	if (!group)
		return -ENOMEM;
	/* pre-allocate to guarantee space while iterating in rcu read-side. */
	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
	if (retval)
		goto out_free_group_list;

	i = 0;
	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	rcu_read_lock();
	do {
		struct task_and_cgroup ent;

		/* @tsk either already exited or can't exit until the end */
		if (tsk->flags & PF_EXITING)
			goto next;

		/* as per above, nr_threads may decrease, but not increase. */
		BUG_ON(i >= group_size);
		ent.task = tsk;
		ent.cgrp = task_cgroup_from_root(tsk, root);
		/* nothing to do if this task is already in the cgroup */
		if (ent.cgrp == cgrp)
			goto next;
		/*
		 * saying GFP_ATOMIC has no effect here because we did prealloc
		 * earlier, but it's good form to communicate our expectations.
		 */
		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
		BUG_ON(retval != 0);
		i++;
	next:
		/* single-task attach: only the first iteration is wanted */
		if (!threadgroup)
			break;
	} while_each_thread(leader, tsk);
	rcu_read_unlock();
	/* remember the number of threads in the array for later. */
	group_size = i;
	tset.tc_array = group;
	tset.tc_array_len = group_size;

	/* methods shouldn't be called if no task is actually migrating */
	retval = 0;
	if (!group_size)
		goto out_free_group_list;

	/*
	 * step 1: check that we can legitimately attach to the cgroup.
	 */
	for_each_root_subsys(root, ss) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);

		if (ss->can_attach) {
			retval = ss->can_attach(css, &tset);
			if (retval) {
				/* remember who vetoed so rollback stops there */
				failed_ss = ss;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * step 2: make sure css_sets exist for all threads to be migrated.
	 * we use find_css_set, which allocates a new one if necessary.
	 */
	for (i = 0; i < group_size; i++) {
		struct css_set *old_cset;

		tc = flex_array_get(group, i);
		old_cset = task_css_set(tc->task);
		tc->cset = find_css_set(old_cset, cgrp);
		if (!tc->cset) {
			retval = -ENOMEM;
			goto out_put_css_set_refs;
		}
	}

	/*
	 * step 3: now that we're guaranteed success wrt the css_sets,
	 * proceed to move all tasks to the new cgroup.  There are no
	 * failure cases after here, so this is the commit point.
	 */
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
	}
	/* nothing is sensitive to fork() after this point. */

	/*
	 * step 4: do subsystem attach callbacks.
	 */
	for_each_root_subsys(root, ss) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);

		if (ss->attach)
			ss->attach(css, &tset);
	}

	/*
	 * step 5: success! and cleanup
	 */
	retval = 0;
out_put_css_set_refs:
	if (retval) {
		/*
		 * Only reached with an error from step 2.  The entry whose
		 * find_css_set() failed holds a NULL cset, which ends the
		 * walk before any entry step 2 never got to.
		 */
		for (i = 0; i < group_size; i++) {
			tc = flex_array_get(group, i);
			if (!tc->cset)
				break;
			put_css_set(tc->cset);
		}
	}
out_cancel_attach:
	if (retval) {
		/*
		 * Undo ->can_attach() for every subsystem visited before the
		 * one that failed.  failed_ss is only set in step 1; it is
		 * still NULL when the failure came from step 2.
		 */
		for_each_root_subsys(root, ss) {
			struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);

			if (ss == failed_ss)
				break;
			if (ss->cancel_attach)
				ss->cancel_attach(css, &tset);
		}
	}
out_free_group_list:
	flex_array_free(group);
	return retval;
}
2130 : :
2131 : : /*
2132 : : * Find the task_struct of the task to attach by vpid and pass it along to the
2133 : : * function to attach either it or all tasks in its threadgroup. Will lock
2134 : : * cgroup_mutex and threadgroup; may take task_lock of task.
2135 : : */
2136 : 0 : static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2137 : : {
2138 : : struct task_struct *tsk;
2139 : 0 : const struct cred *cred = current_cred(), *tcred;
2140 : : int ret;
2141 : :
2142 [ # # ]: 0 : if (!cgroup_lock_live_group(cgrp))
2143 : : return -ENODEV;
2144 : :
2145 : : retry_find_task:
2146 : : rcu_read_lock();
2147 [ # # ]: 0 : if (pid) {
2148 : 0 : tsk = find_task_by_vpid(pid);
2149 [ # # ]: 0 : if (!tsk) {
2150 : : rcu_read_unlock();
2151 : : ret= -ESRCH;
2152 : 0 : goto out_unlock_cgroup;
2153 : : }
2154 : : /*
2155 : : * even if we're attaching all tasks in the thread group, we
2156 : : * only need to check permissions on one of them.
2157 : : */
2158 : 0 : tcred = __task_cred(tsk);
2159 [ # # ][ # # ]: 0 : if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2160 [ # # ]: 0 : !uid_eq(cred->euid, tcred->uid) &&
2161 : 0 : !uid_eq(cred->euid, tcred->suid)) {
2162 : : rcu_read_unlock();
2163 : : ret = -EACCES;
2164 : 0 : goto out_unlock_cgroup;
2165 : : }
2166 : : } else
2167 : 0 : tsk = current;
2168 : :
2169 [ # # ]: 0 : if (threadgroup)
2170 : 0 : tsk = tsk->group_leader;
2171 : :
2172 : : /*
2173 : : * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2174 : : * trapped in a cpuset, or RT worker may be born in a cgroup
2175 : : * with no rt_runtime allocated. Just say no.
2176 : : */
2177 [ # # ][ # # ]: 0 : if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2178 : : ret = -EINVAL;
2179 : : rcu_read_unlock();
2180 : : goto out_unlock_cgroup;
2181 : : }
2182 : :
2183 : 0 : get_task_struct(tsk);
2184 : : rcu_read_unlock();
2185 : :
2186 : : threadgroup_lock(tsk);
2187 [ # # ]: 0 : if (threadgroup) {
2188 [ # # ]: 0 : if (!thread_group_leader(tsk)) {
2189 : : /*
2190 : : * a race with de_thread from another thread's exec()
2191 : : * may strip us of our leadership, if this happens,
2192 : : * there is no choice but to throw this task away and
2193 : : * try again; this is
2194 : : * "double-double-toil-and-trouble-check locking".
2195 : : */
2196 : : threadgroup_unlock(tsk);
2197 : : put_task_struct(tsk);
2198 : : goto retry_find_task;
2199 : : }
2200 : : }
2201 : :
2202 : 0 : ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2203 : :
2204 : : threadgroup_unlock(tsk);
2205 : :
2206 : : put_task_struct(tsk);
2207 : : out_unlock_cgroup:
2208 : 0 : mutex_unlock(&cgroup_mutex);
2209 : 0 : return ret;
2210 : : }
2211 : :
2212 : : /**
2213 : : * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2214 : : * @from: attach to all cgroups of a given task
2215 : : * @tsk: the task to be attached
2216 : : */
2217 : 0 : int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2218 : : {
2219 : : struct cgroupfs_root *root;
2220 : : int retval = 0;
2221 : :
2222 : 0 : mutex_lock(&cgroup_mutex);
2223 [ # # ]: 0 : for_each_active_root(root) {
2224 : 0 : struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2225 : :
2226 : 0 : retval = cgroup_attach_task(from_cgrp, tsk, false);
2227 [ # # ]: 0 : if (retval)
2228 : : break;
2229 : : }
2230 : 0 : mutex_unlock(&cgroup_mutex);
2231 : :
2232 : 0 : return retval;
2233 : : }
2234 : : EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2235 : :
2236 : 0 : static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2237 : : struct cftype *cft, u64 pid)
2238 : : {
2239 : 0 : return attach_task_by_pid(css->cgroup, pid, false);
2240 : : }
2241 : :
2242 : 0 : static int cgroup_procs_write(struct cgroup_subsys_state *css,
2243 : : struct cftype *cft, u64 tgid)
2244 : : {
2245 : 0 : return attach_task_by_pid(css->cgroup, tgid, true);
2246 : : }
2247 : :
2248 : 0 : static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2249 : : struct cftype *cft, const char *buffer)
2250 : : {
2251 : : BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2252 [ # # ]: 0 : if (strlen(buffer) >= PATH_MAX)
2253 : : return -EINVAL;
2254 [ # # ]: 0 : if (!cgroup_lock_live_group(css->cgroup))
2255 : : return -ENODEV;
2256 : 0 : mutex_lock(&cgroup_root_mutex);
2257 : 0 : strcpy(css->cgroup->root->release_agent_path, buffer);
2258 : 0 : mutex_unlock(&cgroup_root_mutex);
2259 : 0 : mutex_unlock(&cgroup_mutex);
2260 : 0 : return 0;
2261 : : }
2262 : :
2263 : 0 : static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2264 : : struct cftype *cft, struct seq_file *seq)
2265 : : {
2266 : 0 : struct cgroup *cgrp = css->cgroup;
2267 : :
2268 [ # # ]: 0 : if (!cgroup_lock_live_group(cgrp))
2269 : : return -ENODEV;
2270 : 0 : seq_puts(seq, cgrp->root->release_agent_path);
2271 : 0 : seq_putc(seq, '\n');
2272 : 0 : mutex_unlock(&cgroup_mutex);
2273 : 0 : return 0;
2274 : : }
2275 : :
2276 : 0 : static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2277 : : struct cftype *cft, struct seq_file *seq)
2278 : : {
2279 : 0 : seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2280 : 0 : return 0;
2281 : : }
2282 : :
2283 : : /* A buffer size big enough for numbers or short strings */
2284 : : #define CGROUP_LOCAL_BUFFER_SIZE 64
2285 : :
2286 : 2 : static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2287 : : struct cftype *cft, struct file *file,
2288 : : const char __user *userbuf, size_t nbytes,
2289 : : loff_t *unused_ppos)
2290 : : {
2291 : : char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2292 : : int retval = 0;
2293 : : char *end;
2294 : :
2295 [ + - ]: 2 : if (!nbytes)
2296 : : return -EINVAL;
2297 [ + - ]: 2 : if (nbytes >= sizeof(buffer))
2298 : : return -E2BIG;
2299 [ + - ]: 2 : if (copy_from_user(buffer, userbuf, nbytes))
2300 : : return -EFAULT;
2301 : :
2302 : 2 : buffer[nbytes] = 0; /* nul-terminate */
2303 [ + - ]: 2 : if (cft->write_u64) {
2304 : 2 : u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2305 [ + - ]: 2 : if (*end)
2306 : : return -EINVAL;
2307 : 2 : retval = cft->write_u64(css, cft, val);
2308 : : } else {
2309 : 0 : s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2310 [ # # ]: 0 : if (*end)
2311 : : return -EINVAL;
2312 : 0 : retval = cft->write_s64(css, cft, val);
2313 : : }
2314 [ + - ]: 4 : if (!retval)
2315 : 2 : retval = nbytes;
2316 : : return retval;
2317 : : }
2318 : :
2319 : 0 : static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2320 : : struct cftype *cft, struct file *file,
2321 : : const char __user *userbuf, size_t nbytes,
2322 : : loff_t *unused_ppos)
2323 : : {
2324 : : char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2325 : : int retval = 0;
2326 : 0 : size_t max_bytes = cft->max_write_len;
2327 : : char *buffer = local_buffer;
2328 : :
2329 [ # # ]: 0 : if (!max_bytes)
2330 : : max_bytes = sizeof(local_buffer) - 1;
2331 [ # # ]: 0 : if (nbytes >= max_bytes)
2332 : : return -E2BIG;
2333 : : /* Allocate a dynamic buffer if we need one */
2334 [ # # ]: 0 : if (nbytes >= sizeof(local_buffer)) {
2335 : 0 : buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2336 [ # # ]: 0 : if (buffer == NULL)
2337 : : return -ENOMEM;
2338 : : }
2339 [ # # ][ # # ]: 0 : if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2340 : : retval = -EFAULT;
2341 : : goto out;
2342 : : }
2343 : :
2344 : 0 : buffer[nbytes] = 0; /* nul-terminate */
2345 : 0 : retval = cft->write_string(css, cft, strstrip(buffer));
2346 [ # # ]: 0 : if (!retval)
2347 : 0 : retval = nbytes;
2348 : : out:
2349 [ # # ]: 0 : if (buffer != local_buffer)
2350 : 0 : kfree(buffer);
2351 : : return retval;
2352 : : }
2353 : :
2354 : 0 : static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2355 : : size_t nbytes, loff_t *ppos)
2356 : : {
2357 : 2 : struct cfent *cfe = __d_cfe(file->f_dentry);
2358 : : struct cftype *cft = __d_cft(file->f_dentry);
2359 : 2 : struct cgroup_subsys_state *css = cfe->css;
2360 : :
2361 [ - + ]: 2 : if (cft->write)
2362 : 0 : return cft->write(css, cft, file, buf, nbytes, ppos);
2363 [ - + ][ # # ]: 2 : if (cft->write_u64 || cft->write_s64)
2364 : 2 : return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2365 [ # # ]: 0 : if (cft->write_string)
2366 : 0 : return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2367 [ # # ]: 0 : if (cft->trigger) {
2368 : 0 : int ret = cft->trigger(css, (unsigned int)cft->private);
2369 [ # # ]: 2 : return ret ? ret : nbytes;
2370 : : }
2371 : : return -EINVAL;
2372 : : }
2373 : :
2374 : 4 : static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2375 : : struct cftype *cft, struct file *file,
2376 : : char __user *buf, size_t nbytes, loff_t *ppos)
2377 : : {
2378 : : char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2379 : 4 : u64 val = cft->read_u64(css, cft);
2380 : 4 : int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2381 : :
2382 : 4 : return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2383 : : }
2384 : :
2385 : 0 : static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2386 : : struct cftype *cft, struct file *file,
2387 : : char __user *buf, size_t nbytes, loff_t *ppos)
2388 : : {
2389 : : char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2390 : 0 : s64 val = cft->read_s64(css, cft);
2391 : 0 : int len = sprintf(tmp, "%lld\n", (long long) val);
2392 : :
2393 : 0 : return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2394 : : }
2395 : :
2396 : 0 : static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2397 : : size_t nbytes, loff_t *ppos)
2398 : : {
2399 : 4 : struct cfent *cfe = __d_cfe(file->f_dentry);
2400 : : struct cftype *cft = __d_cft(file->f_dentry);
2401 : 4 : struct cgroup_subsys_state *css = cfe->css;
2402 : :
2403 [ - + ]: 4 : if (cft->read)
2404 : 0 : return cft->read(css, cft, file, buf, nbytes, ppos);
2405 [ + - ]: 4 : if (cft->read_u64)
2406 : 4 : return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2407 [ # # ]: 0 : if (cft->read_s64)
2408 : 0 : return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2409 : : return -EINVAL;
2410 : : }
2411 : :
2412 : : /*
2413 : : * seqfile ops/methods for returning structured data. Currently just
2414 : : * supports string->u64 maps, but can be extended in future.
2415 : : */
2416 : :
2417 : 0 : static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2418 : : {
2419 : 0 : struct seq_file *sf = cb->state;
2420 : 0 : return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2421 : : }
2422 : :
2423 : 0 : static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2424 : : {
2425 : 0 : struct cfent *cfe = m->private;
2426 : 0 : struct cftype *cft = cfe->type;
2427 : 0 : struct cgroup_subsys_state *css = cfe->css;
2428 : :
2429 [ # # ]: 0 : if (cft->read_map) {
2430 : 0 : struct cgroup_map_cb cb = {
2431 : : .fill = cgroup_map_add,
2432 : : .state = m,
2433 : : };
2434 : 0 : return cft->read_map(css, cft, &cb);
2435 : : }
2436 : 0 : return cft->read_seq_string(css, cft, m);
2437 : : }
2438 : :
2439 : : static const struct file_operations cgroup_seqfile_operations = {
2440 : : .read = seq_read,
2441 : : .write = cgroup_file_write,
2442 : : .llseek = seq_lseek,
2443 : : .release = cgroup_file_release,
2444 : : };
2445 : :
2446 : 0 : static int cgroup_file_open(struct inode *inode, struct file *file)
2447 : : {
2448 : 5 : struct cfent *cfe = __d_cfe(file->f_dentry);
2449 : : struct cftype *cft = __d_cft(file->f_dentry);
2450 : 5 : struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2451 : : struct cgroup_subsys_state *css;
2452 : : int err;
2453 : :
2454 : 5 : err = generic_file_open(inode, file);
2455 [ + - ]: 5 : if (err)
2456 : : return err;
2457 : :
2458 : : /*
2459 : : * If the file belongs to a subsystem, pin the css. Will be
2460 : : * unpinned either on open failure or release. This ensures that
2461 : : * @css stays alive for all file operations.
2462 : : */
2463 : : rcu_read_lock();
2464 : 5 : css = cgroup_css(cgrp, cft->ss);
2465 [ - + ][ # # ]: 5 : if (cft->ss && !css_tryget(css))
2466 : : css = NULL;
2467 : : rcu_read_unlock();
2468 : :
2469 [ + - ]: 5 : if (!css)
2470 : : return -ENODEV;
2471 : :
2472 : : /*
2473 : : * @cfe->css is used by read/write/close to determine the
2474 : : * associated css. @file->private_data would be a better place but
2475 : : * that's already used by seqfile. Multiple accessors may use it
2476 : : * simultaneously which is okay as the association never changes.
2477 : : */
2478 [ + + ][ + - ]: 5 : WARN_ON_ONCE(cfe->css && cfe->css != css);
[ - + ][ # # ]
[ # # ]
2479 : 5 : cfe->css = css;
2480 : :
2481 [ + - ][ - + ]: 5 : if (cft->read_map || cft->read_seq_string) {
2482 : 0 : file->f_op = &cgroup_seqfile_operations;
2483 : 0 : err = single_open(file, cgroup_seqfile_show, cfe);
2484 [ + + ]: 5 : } else if (cft->open) {
2485 : 1 : err = cft->open(inode, file);
2486 : : }
2487 : :
2488 [ - + ][ # # ]: 5 : if (css->ss && err)
2489 : : css_put(css);
2490 : 5 : return err;
2491 : : }
2492 : :
2493 : 0 : static int cgroup_file_release(struct inode *inode, struct file *file)
2494 : : {
2495 : 4 : struct cfent *cfe = __d_cfe(file->f_dentry);
2496 : : struct cftype *cft = __d_cft(file->f_dentry);
2497 : 4 : struct cgroup_subsys_state *css = cfe->css;
2498 : : int ret = 0;
2499 : :
2500 [ - + ]: 4 : if (cft->release)
2501 : 0 : ret = cft->release(inode, file);
2502 [ - + ]: 8 : if (css->ss)
2503 : : css_put(css);
2504 [ - + ]: 4 : if (file->f_op == &cgroup_seqfile_operations)
2505 : 0 : single_release(inode, file);
2506 : 4 : return ret;
2507 : : }
2508 : :
2509 : : /*
2510 : : * cgroup_rename - Only allow simple rename of directories in place.
2511 : : */
2512 : 0 : static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2513 : 0 : struct inode *new_dir, struct dentry *new_dentry)
2514 : : {
2515 : : int ret;
2516 : : struct cgroup_name *name, *old_name;
2517 : 0 : struct cgroup *cgrp;
2518 : :
2519 : : /*
2520 : : * It's convinient to use parent dir's i_mutex to protected
2521 : : * cgrp->name.
2522 : : */
2523 : : lockdep_assert_held(&old_dir->i_mutex);
2524 : :
2525 [ # # ]: 0 : if (!S_ISDIR(old_dentry->d_inode->i_mode))
2526 : : return -ENOTDIR;
2527 [ # # ]: 0 : if (new_dentry->d_inode)
2528 : : return -EEXIST;
2529 [ # # ]: 0 : if (old_dir != new_dir)
2530 : : return -EIO;
2531 : :
2532 : : cgrp = __d_cgrp(old_dentry);
2533 : :
2534 : : /*
2535 : : * This isn't a proper migration and its usefulness is very
2536 : : * limited. Disallow if sane_behavior.
2537 : : */
2538 [ # # ]: 0 : if (cgroup_sane_behavior(cgrp))
2539 : : return -EPERM;
2540 : :
2541 : 0 : name = cgroup_alloc_name(new_dentry);
2542 [ # # ]: 0 : if (!name)
2543 : : return -ENOMEM;
2544 : :
2545 : 0 : ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2546 [ # # ]: 0 : if (ret) {
2547 : 0 : kfree(name);
2548 : 0 : return ret;
2549 : : }
2550 : :
2551 : 0 : old_name = rcu_dereference_protected(cgrp->name, true);
2552 : 0 : rcu_assign_pointer(cgrp->name, name);
2553 : :
2554 : 0 : kfree_rcu(old_name, rcu_head);
2555 : 0 : return 0;
2556 : : }
2557 : :
2558 : 0 : static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2559 : : {
2560 [ # # ][ # # ]: 0 : if (S_ISDIR(dentry->d_inode->i_mode))
[ # # ][ # # ]
2561 : 0 : return &__d_cgrp(dentry)->xattrs;
2562 : : else
2563 : 0 : return &__d_cfe(dentry)->xattrs;
2564 : : }
2565 : :
2566 : : static inline int xattr_enabled(struct dentry *dentry)
2567 : : {
2568 : 0 : struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2569 : 0 : return root->flags & CGRP_ROOT_XATTR;
2570 : : }
2571 : :
2572 : 0 : static bool is_valid_xattr(const char *name)
2573 : : {
2574 [ # # ][ # # ]: 0 : if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2575 : 0 : !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2576 : : return true;
2577 : 0 : return false;
2578 : : }
2579 : :
2580 : 0 : static int cgroup_setxattr(struct dentry *dentry, const char *name,
2581 : : const void *val, size_t size, int flags)
2582 : : {
2583 [ # # ]: 0 : if (!xattr_enabled(dentry))
2584 : : return -EOPNOTSUPP;
2585 [ # # ]: 0 : if (!is_valid_xattr(name))
2586 : : return -EINVAL;
2587 : 0 : return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2588 : : }
2589 : :
2590 : 0 : static int cgroup_removexattr(struct dentry *dentry, const char *name)
2591 : : {
2592 [ # # ]: 0 : if (!xattr_enabled(dentry))
2593 : : return -EOPNOTSUPP;
2594 [ # # ]: 0 : if (!is_valid_xattr(name))
2595 : : return -EINVAL;
2596 : 0 : return simple_xattr_remove(__d_xattrs(dentry), name);
2597 : : }
2598 : :
2599 : 0 : static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2600 : : void *buf, size_t size)
2601 : : {
2602 [ # # ]: 0 : if (!xattr_enabled(dentry))
2603 : : return -EOPNOTSUPP;
2604 [ # # ]: 0 : if (!is_valid_xattr(name))
2605 : : return -EINVAL;
2606 : 0 : return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2607 : : }
2608 : :
2609 : 0 : static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2610 : : {
2611 [ # # ]: 0 : if (!xattr_enabled(dentry))
2612 : : return -EOPNOTSUPP;
2613 : 0 : return simple_xattr_list(__d_xattrs(dentry), buf, size);
2614 : : }
2615 : :
2616 : : static const struct file_operations cgroup_file_operations = {
2617 : : .read = cgroup_file_read,
2618 : : .write = cgroup_file_write,
2619 : : .llseek = generic_file_llseek,
2620 : : .open = cgroup_file_open,
2621 : : .release = cgroup_file_release,
2622 : : };
2623 : :
2624 : : static const struct inode_operations cgroup_file_inode_operations = {
2625 : : .setxattr = cgroup_setxattr,
2626 : : .getxattr = cgroup_getxattr,
2627 : : .listxattr = cgroup_listxattr,
2628 : : .removexattr = cgroup_removexattr,
2629 : : };
2630 : :
2631 : : static const struct inode_operations cgroup_dir_inode_operations = {
2632 : : .lookup = simple_lookup,
2633 : : .mkdir = cgroup_mkdir,
2634 : : .rmdir = cgroup_rmdir,
2635 : : .rename = cgroup_rename,
2636 : : .setxattr = cgroup_setxattr,
2637 : : .getxattr = cgroup_getxattr,
2638 : : .listxattr = cgroup_listxattr,
2639 : : .removexattr = cgroup_removexattr,
2640 : : };
2641 : :
2642 : : /*
2643 : : * Check if a file is a control file
2644 : : */
2645 : 0 : static inline struct cftype *__file_cft(struct file *file)
2646 : : {
2647 [ # # ]: 0 : if (file_inode(file)->i_fop != &cgroup_file_operations)
2648 : : return ERR_PTR(-EINVAL);
2649 : 0 : return __d_cft(file->f_dentry);
2650 : : }
2651 : :
2652 : 0 : static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2653 : : struct super_block *sb)
2654 : : {
2655 : : struct inode *inode;
2656 : :
2657 [ + - ]: 33 : if (!dentry)
2658 : : return -ENOENT;
2659 [ + - ]: 33 : if (dentry->d_inode)
2660 : : return -EEXIST;
2661 : :
2662 : 33 : inode = cgroup_new_inode(mode, sb);
2663 [ + - ]: 33 : if (!inode)
2664 : : return -ENOMEM;
2665 : :
2666 [ + + ]: 33 : if (S_ISDIR(mode)) {
2667 : 2 : inode->i_op = &cgroup_dir_inode_operations;
2668 : 2 : inode->i_fop = &simple_dir_operations;
2669 : :
2670 : : /* start off with i_nlink == 2 (for "." entry) */
2671 : 2 : inc_nlink(inode);
2672 : 2 : inc_nlink(dentry->d_parent->d_inode);
2673 : :
2674 : : /*
2675 : : * Control reaches here with cgroup_mutex held.
2676 : : * @inode->i_mutex should nest outside cgroup_mutex but we
2677 : : * want to populate it immediately without releasing
2678 : : * cgroup_mutex. As @inode isn't visible to anyone else
2679 : : * yet, trylock will always succeed without affecting
2680 : : * lockdep checks.
2681 : : */
2682 [ - + ][ # # ]: 2 : WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
[ # # ]
2683 [ + - ]: 31 : } else if (S_ISREG(mode)) {
2684 : 31 : inode->i_size = 0;
2685 : 31 : inode->i_fop = &cgroup_file_operations;
2686 : 31 : inode->i_op = &cgroup_file_inode_operations;
2687 : : }
2688 : 33 : d_instantiate(dentry, inode);
2689 : : dget(dentry); /* Extra count - pin the dentry in core */
2690 : : return 0;
2691 : : }
2692 : :
2693 : : /**
2694 : : * cgroup_file_mode - deduce file mode of a control file
2695 : : * @cft: the control file in question
2696 : : *
2697 : : * returns cft->mode if ->mode is not 0
2698 : : * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2699 : : * returns S_IRUGO if it has only a read handler
2700 : : * returns S_IWUSR if it has only a write hander
2701 : : */
2702 : 0 : static umode_t cgroup_file_mode(const struct cftype *cft)
2703 : : {
2704 : : umode_t mode = 0;
2705 : :
2706 [ + ]: 31 : if (cft->mode)
2707 : : return cft->mode;
2708 : :
2709 [ + - ][ + + ]: 47 : if (cft->read || cft->read_u64 || cft->read_s64 ||
[ + - ][ + - ]
2710 [ + - ]: 16 : cft->read_map || cft->read_seq_string)
2711 : : mode |= S_IRUGO;
2712 : :
2713 [ + - ][ + + ]: 16 : if (cft->write || cft->write_u64 || cft->write_s64 ||
[ + - ][ + + ]
2714 [ - + ]: 3 : cft->write_string || cft->trigger)
2715 : 13 : mode |= S_IWUSR;
2716 : :
2717 : 16 : return mode;
2718 : : }
2719 : :
2720 : 0 : static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2721 : : {
2722 : 31 : struct dentry *dir = cgrp->dentry;
2723 : : struct cgroup *parent = __d_cgrp(dir);
2724 : : struct dentry *dentry;
2725 : : struct cfent *cfe;
2726 : : int error;
2727 : : umode_t mode;
2728 : 31 : char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2729 : :
2730 [ - + ][ # # ]: 31 : if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
[ # # ]
2731 : 0 : !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2732 : 0 : strcpy(name, cft->ss->name);
2733 : 0 : strcat(name, ".");
2734 : : }
2735 : 31 : strcat(name, cft->name);
2736 : :
2737 [ - + ]: 31 : BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2738 : :
2739 : : cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2740 [ + - ]: 31 : if (!cfe)
2741 : : return -ENOMEM;
2742 : :
2743 : 31 : dentry = lookup_one_len(name, dir, strlen(name));
2744 [ - + ]: 31 : if (IS_ERR(dentry)) {
2745 : : error = PTR_ERR(dentry);
2746 : : goto out;
2747 : : }
2748 : :
2749 : 31 : cfe->type = (void *)cft;
2750 : 31 : cfe->dentry = dentry;
2751 : 31 : dentry->d_fsdata = cfe;
2752 : : simple_xattrs_init(&cfe->xattrs);
2753 : :
2754 : 31 : mode = cgroup_file_mode(cft);
2755 : 31 : error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2756 [ + - ]: 31 : if (!error) {
2757 : 31 : list_add_tail(&cfe->node, &parent->files);
2758 : : cfe = NULL;
2759 : : }
2760 : 31 : dput(dentry);
2761 : : out:
2762 : 31 : kfree(cfe);
2763 : : return error;
2764 : : }
2765 : :
2766 : : /**
2767 : : * cgroup_addrm_files - add or remove files to a cgroup directory
2768 : : * @cgrp: the target cgroup
2769 : : * @cfts: array of cftypes to be added
2770 : : * @is_add: whether to add or remove
2771 : : *
2772 : : * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2773 : : * For removals, this function never fails. If addition fails, this
2774 : : * function doesn't remove files already added. The caller is responsible
2775 : : * for cleaning up.
2776 : : */
2777 : 0 : static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2778 : : bool is_add)
2779 : : {
2780 : : struct cftype *cft;
2781 : : int ret;
2782 : :
2783 : : lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2784 : : lockdep_assert_held(&cgroup_mutex);
2785 : :
2786 [ + + ]: 56 : for (cft = cfts; cft->name[0] != '\0'; cft++) {
2787 : : /* does cft->flags tell us to skip this file on @cgrp? */
2788 [ + + ][ - + ]: 49 : if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2789 : 0 : continue;
2790 [ - + ][ # # ]: 49 : if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2791 : 0 : continue;
2792 [ + + ][ + + ]: 49 : if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2793 : 8 : continue;
2794 : :
2795 [ + + ]: 41 : if (is_add) {
2796 : 31 : ret = cgroup_add_file(cgrp, cft);
2797 [ - + ]: 31 : if (ret) {
2798 : 0 : pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2799 : : cft->name, ret);
2800 : 0 : return ret;
2801 : : }
2802 : : } else {
2803 : 10 : cgroup_rm_file(cgrp, cft);
2804 : : }
2805 : : }
2806 : : return 0;
2807 : : }
2808 : :
2809 : : static void cgroup_cfts_prepare(void)
2810 : : __acquires(&cgroup_mutex)
2811 : : {
2812 : : /*
2813 : : * Thanks to the entanglement with vfs inode locking, we can't walk
2814 : : * the existing cgroups under cgroup_mutex and create files.
2815 : : * Instead, we use css_for_each_descendant_pre() and drop RCU read
2816 : : * lock before calling cgroup_addrm_files().
2817 : : */
2818 : 0 : mutex_lock(&cgroup_mutex);
2819 : : }
2820 : :
2821 : 0 : static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2822 : : __releases(&cgroup_mutex)
2823 : : {
2824 : 0 : LIST_HEAD(pending);
2825 : 0 : struct cgroup_subsys *ss = cfts[0].ss;
2826 : 0 : struct cgroup *root = &ss->root->top_cgroup;
2827 : 0 : struct super_block *sb = ss->root->sb;
2828 : : struct dentry *prev = NULL;
2829 : : struct inode *inode;
2830 : : struct cgroup_subsys_state *css;
2831 : : u64 update_before;
2832 : : int ret = 0;
2833 : :
2834 : : /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2835 [ # # ][ # # ]: 0 : if (!cfts || ss->root == &cgroup_dummy_root ||
[ # # ]
2836 : 0 : !atomic_inc_not_zero(&sb->s_active)) {
2837 : 0 : mutex_unlock(&cgroup_mutex);
2838 : 0 : return 0;
2839 : : }
2840 : :
2841 : : /*
2842 : : * All cgroups which are created after we drop cgroup_mutex will
2843 : : * have the updated set of files, so we only need to update the
2844 : : * cgroups created before the current @cgroup_serial_nr_next.
2845 : : */
2846 : 0 : update_before = cgroup_serial_nr_next;
2847 : :
2848 : 0 : mutex_unlock(&cgroup_mutex);
2849 : :
2850 : : /* add/rm files for all cgroups created before */
2851 : : rcu_read_lock();
2852 [ # # ]: 0 : css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2853 : 0 : struct cgroup *cgrp = css->cgroup;
2854 : :
2855 [ # # ]: 0 : if (cgroup_is_dead(cgrp))
2856 : 0 : continue;
2857 : :
2858 : 0 : inode = cgrp->dentry->d_inode;
2859 : : dget(cgrp->dentry);
2860 : : rcu_read_unlock();
2861 : :
2862 : 0 : dput(prev);
2863 : 0 : prev = cgrp->dentry;
2864 : :
2865 : 0 : mutex_lock(&inode->i_mutex);
2866 : 0 : mutex_lock(&cgroup_mutex);
2867 [ # # ][ # # ]: 0 : if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2868 : 0 : ret = cgroup_addrm_files(cgrp, cfts, is_add);
2869 : 0 : mutex_unlock(&cgroup_mutex);
2870 : 0 : mutex_unlock(&inode->i_mutex);
2871 : :
2872 : : rcu_read_lock();
2873 [ # # ]: 0 : if (ret)
2874 : : break;
2875 : : }
2876 : : rcu_read_unlock();
2877 : 0 : dput(prev);
2878 : 0 : deactivate_super(sb);
2879 : 0 : return ret;
2880 : : }
2881 : :
2882 : : /**
2883 : : * cgroup_add_cftypes - add an array of cftypes to a subsystem
2884 : : * @ss: target cgroup subsystem
2885 : : * @cfts: zero-length name terminated array of cftypes
2886 : : *
2887 : : * Register @cfts to @ss. Files described by @cfts are created for all
2888 : : * existing cgroups to which @ss is attached and all future cgroups will
2889 : : * have them too. This function can be called anytime whether @ss is
2890 : : * attached or not.
2891 : : *
2892 : : * Returns 0 on successful registration, -errno on failure. Note that this
2893 : : * function currently returns 0 as long as @cfts registration is successful
2894 : : * even if some file creation attempts on existing cgroups fail.
2895 : : */
2896 : 0 : int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2897 : : {
2898 : : struct cftype_set *set;
2899 : : struct cftype *cft;
2900 : : int ret;
2901 : :
2902 : : set = kzalloc(sizeof(*set), GFP_KERNEL);
2903 [ # # ]: 0 : if (!set)
2904 : : return -ENOMEM;
2905 : :
2906 [ # # ]: 0 : for (cft = cfts; cft->name[0] != '\0'; cft++)
2907 : 0 : cft->ss = ss;
2908 : :
2909 : : cgroup_cfts_prepare();
2910 : 0 : set->cfts = cfts;
2911 : 0 : list_add_tail(&set->node, &ss->cftsets);
2912 : 0 : ret = cgroup_cfts_commit(cfts, true);
2913 [ # # ]: 0 : if (ret)
2914 : 0 : cgroup_rm_cftypes(cfts);
2915 : 0 : return ret;
2916 : : }
2917 : : EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2918 : :
2919 : : /**
2920 : : * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2921 : : * @cfts: zero-length name terminated array of cftypes
2922 : : *
2923 : : * Unregister @cfts. Files described by @cfts are removed from all
2924 : : * existing cgroups and all future cgroups won't have them either. This
2925 : : * function can be called anytime whether @cfts' subsys is attached or not.
2926 : : *
2927 : : * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2928 : : * registered.
2929 : : */
2930 : 0 : int cgroup_rm_cftypes(struct cftype *cfts)
2931 : : {
2932 : : struct cftype_set *set;
2933 : :
2934 [ # # ][ # # ]: 0 : if (!cfts || !cfts[0].ss)
2935 : : return -ENOENT;
2936 : :
2937 : : cgroup_cfts_prepare();
2938 : :
2939 [ # # ]: 0 : list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2940 [ # # ]: 0 : if (set->cfts == cfts) {
2941 : : list_del(&set->node);
2942 : 0 : kfree(set);
2943 : 0 : cgroup_cfts_commit(cfts, false);
2944 : 0 : return 0;
2945 : : }
2946 : : }
2947 : :
2948 : 0 : cgroup_cfts_commit(NULL, false);
2949 : 0 : return -ENOENT;
2950 : : }
2951 : :
2952 : : /**
2953 : : * cgroup_task_count - count the number of tasks in a cgroup.
2954 : : * @cgrp: the cgroup in question
2955 : : *
2956 : : * Return the number of tasks in the cgroup.
2957 : : */
2958 : 0 : int cgroup_task_count(const struct cgroup *cgrp)
2959 : : {
2960 : : int count = 0;
2961 : : struct cgrp_cset_link *link;
2962 : :
2963 : 1 : read_lock(&css_set_lock);
2964 [ + + ]: 2 : list_for_each_entry(link, &cgrp->cset_links, cset_link)
2965 : 1 : count += atomic_read(&link->cset->refcount);
2966 : : read_unlock(&css_set_lock);
2967 : 1 : return count;
2968 : : }
2969 : :
2970 : : /*
2971 : : * To reduce the fork() overhead for systems that are not actually using
2972 : : * their cgroups capability, we don't maintain the lists running through
2973 : : * each css_set to its tasks until we see the list actually used - in other
2974 : : * words after the first call to css_task_iter_start().
2975 : : */
2976 : 0 : static void cgroup_enable_task_cg_lists(void)
2977 : : {
2978 : : struct task_struct *p, *g;
2979 : 1 : write_lock(&css_set_lock);
2980 : 1 : use_task_css_set_links = 1;
2981 : : /*
2982 : : * We need tasklist_lock because RCU is not safe against
2983 : : * while_each_thread(). Besides, a forking task that has passed
2984 : : * cgroup_post_fork() without seeing use_task_css_set_links = 1
2985 : : * is not guaranteed to have its child immediately visible in the
2986 : : * tasklist if we walk through it with RCU.
2987 : : */
2988 : 1 : read_lock(&tasklist_lock);
2989 [ + + ]: 283 : do_each_thread(g, p) {
2990 : : task_lock(p);
2991 : : /*
2992 : : * We should check if the process is exiting, otherwise
2993 : : * it will race with cgroup_exit() in that the list
2994 : : * entry won't be deleted though the process has exited.
2995 : : */
2996 [ + + ][ + - ]: 285 : if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2997 : 92 : list_add(&p->cg_list, &task_css_set(p)->tasks);
2998 : : task_unlock(p);
2999 [ + + ]: 285 : } while_each_thread(g, p);
3000 : : read_unlock(&tasklist_lock);
3001 : : write_unlock(&css_set_lock);
3002 : 1 : }
3003 : :
3004 : : /**
3005 : : * css_next_child - find the next child of a given css
3006 : : * @pos_css: the current position (%NULL to initiate traversal)
3007 : : * @parent_css: css whose children to walk
3008 : : *
3009 : : * This function returns the next child of @parent_css and should be called
3010 : : * under RCU read lock. The only requirement is that @parent_css and
3011 : : * @pos_css are accessible. The next sibling is guaranteed to be returned
3012 : : * regardless of their states.
3013 : : */
3014 : : struct cgroup_subsys_state *
3015 : 0 : css_next_child(struct cgroup_subsys_state *pos_css,
3016 : : struct cgroup_subsys_state *parent_css)
3017 : : {
3018 [ # # ]: 0 : struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3019 : 0 : struct cgroup *cgrp = parent_css->cgroup;
3020 : : struct cgroup *next;
3021 : :
3022 : : WARN_ON_ONCE(!rcu_read_lock_held());
3023 : :
3024 : : /*
3025 : : * @pos could already have been removed. Once a cgroup is removed,
3026 : : * its ->sibling.next is no longer updated when its next sibling
3027 : : * changes. As CGRP_DEAD assertion is serialized and happens
3028 : : * before the cgroup is taken off the ->sibling list, if we see it
3029 : : * unasserted, it's guaranteed that the next sibling hasn't
3030 : : * finished its grace period even if it's already removed, and thus
3031 : : * safe to dereference from this RCU critical section. If
3032 : : * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3033 : : * to be visible as %true here.
3034 : : *
3035 : : * If @pos is dead, its next pointer can't be dereferenced;
3036 : : * however, as each cgroup is given a monotonically increasing
3037 : : * unique serial number and always appended to the sibling list,
3038 : : * the next one can be found by walking the parent's children until
3039 : : * we see a cgroup with higher serial number than @pos's. While
3040 : : * this path can be slower, it's taken only when either the current
3041 : : * cgroup is removed or iteration and removal race.
3042 : : */
3043 [ # # ]: 0 : if (!pos) {
3044 : 0 : next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3045 [ # # ]: 0 : } else if (likely(!cgroup_is_dead(pos))) {
3046 : 0 : next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3047 : : } else {
3048 [ # # ]: 0 : list_for_each_entry_rcu(next, &cgrp->children, sibling)
3049 [ # # ]: 0 : if (next->serial_nr > pos->serial_nr)
3050 : : break;
3051 : : }
3052 : :
3053 [ # # ]: 0 : if (&next->sibling == &cgrp->children)
3054 : : return NULL;
3055 : :
3056 : 0 : return cgroup_css(next, parent_css->ss);
3057 : : }
3058 : : EXPORT_SYMBOL_GPL(css_next_child);
3059 : :
3060 : : /**
3061 : : * css_next_descendant_pre - find the next descendant for pre-order walk
3062 : : * @pos: the current position (%NULL to initiate traversal)
3063 : : * @root: css whose descendants to walk
3064 : : *
3065 : : * To be used by css_for_each_descendant_pre(). Find the next descendant
3066 : : * to visit for pre-order traversal of @root's descendants. @root is
3067 : : * included in the iteration and the first node to be visited.
3068 : : *
3069 : : * While this function requires RCU read locking, it doesn't require the
3070 : : * whole traversal to be contained in a single RCU critical section. This
3071 : : * function will return the correct next descendant as long as both @pos
3072 : : * and @root are accessible and @pos is a descendant of @root.
3073 : : */
3074 : : struct cgroup_subsys_state *
3075 : 0 : css_next_descendant_pre(struct cgroup_subsys_state *pos,
3076 : : struct cgroup_subsys_state *root)
3077 : : {
3078 : : struct cgroup_subsys_state *next;
3079 : :
3080 : : WARN_ON_ONCE(!rcu_read_lock_held());
3081 : :
3082 : : /* if first iteration, visit @root */
3083 [ # # ]: 0 : if (!pos)
3084 : : return root;
3085 : :
3086 : : /* visit the first child if exists */
3087 : 0 : next = css_next_child(NULL, pos);
3088 [ # # ]: 0 : if (next)
3089 : : return next;
3090 : :
3091 : : /* no child, visit my or the closest ancestor's next sibling */
3092 [ # # ]: 0 : while (pos != root) {
3093 : 0 : next = css_next_child(pos, css_parent(pos));
3094 [ # # ]: 0 : if (next)
3095 : : return next;
3096 : : pos = css_parent(pos);
3097 : : }
3098 : :
3099 : : return NULL;
3100 : : }
3101 : : EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3102 : :
3103 : : /**
3104 : : * css_rightmost_descendant - return the rightmost descendant of a css
3105 : : * @pos: css of interest
3106 : : *
3107 : : * Return the rightmost descendant of @pos. If there's no descendant, @pos
3108 : : * is returned. This can be used during pre-order traversal to skip
3109 : : * subtree of @pos.
3110 : : *
3111 : : * While this function requires RCU read locking, it doesn't require the
3112 : : * whole traversal to be contained in a single RCU critical section. This
3113 : : * function will return the correct rightmost descendant as long as @pos is
3114 : : * accessible.
3115 : : */
3116 : : struct cgroup_subsys_state *
3117 : 0 : css_rightmost_descendant(struct cgroup_subsys_state *pos)
3118 : : {
3119 : : struct cgroup_subsys_state *last, *tmp;
3120 : :
3121 : : WARN_ON_ONCE(!rcu_read_lock_held());
3122 : :
3123 : : do {
3124 : : last = pos;
3125 : : /* ->prev isn't RCU safe, walk ->next till the end */
3126 : : pos = NULL;
3127 [ # # ]: 0 : css_for_each_child(tmp, last)
3128 : : pos = tmp;
3129 [ # # ]: 0 : } while (pos);
3130 : :
3131 : 0 : return last;
3132 : : }
3133 : : EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3134 : :
3135 : : static struct cgroup_subsys_state *
3136 : : css_leftmost_descendant(struct cgroup_subsys_state *pos)
3137 : : {
3138 : : struct cgroup_subsys_state *last;
3139 : :
3140 : : do {
3141 : : last = pos;
3142 : 0 : pos = css_next_child(NULL, pos);
3143 [ # # ][ # # ]: 0 : } while (pos);
3144 : :
3145 : : return last;
3146 : : }
3147 : :
3148 : : /**
3149 : : * css_next_descendant_post - find the next descendant for post-order walk
3150 : : * @pos: the current position (%NULL to initiate traversal)
3151 : : * @root: css whose descendants to walk
3152 : : *
3153 : : * To be used by css_for_each_descendant_post(). Find the next descendant
3154 : : * to visit for post-order traversal of @root's descendants. @root is
3155 : : * included in the iteration and the last node to be visited.
3156 : : *
3157 : : * While this function requires RCU read locking, it doesn't require the
3158 : : * whole traversal to be contained in a single RCU critical section. This
3159 : : * function will return the correct next descendant as long as both @pos
3160 : : * and @cgroup are accessible and @pos is a descendant of @cgroup.
3161 : : */
3162 : : struct cgroup_subsys_state *
3163 : 0 : css_next_descendant_post(struct cgroup_subsys_state *pos,
3164 : : struct cgroup_subsys_state *root)
3165 : : {
3166 : : struct cgroup_subsys_state *next;
3167 : :
3168 : : WARN_ON_ONCE(!rcu_read_lock_held());
3169 : :
3170 : : /* if first iteration, visit leftmost descendant which may be @root */
3171 [ # # ]: 0 : if (!pos)
3172 : : return css_leftmost_descendant(root);
3173 : :
3174 : : /* if we visited @root, we're done */
3175 [ # # ]: 0 : if (pos == root)
3176 : : return NULL;
3177 : :
3178 : : /* if there's an unvisited sibling, visit its leftmost descendant */
3179 : 0 : next = css_next_child(pos, css_parent(pos));
3180 [ # # ]: 0 : if (next)
3181 : : return css_leftmost_descendant(next);
3182 : :
3183 : : /* no sibling left, visit parent */
3184 : 0 : return css_parent(pos);
3185 : : }
3186 : : EXPORT_SYMBOL_GPL(css_next_descendant_post);
3187 : :
3188 : : /**
3189 : : * css_advance_task_iter - advance a task itererator to the next css_set
3190 : : * @it: the iterator to advance
3191 : : *
3192 : : * Advance @it to the next css_set to walk.
3193 : : */
3194 : : static void css_advance_task_iter(struct css_task_iter *it)
3195 : : {
3196 : : struct list_head *l = it->cset_link;
3197 : : struct cgrp_cset_link *link;
3198 : : struct css_set *cset;
3199 : :
3200 : : /* Advance to the next non-empty css_set */
3201 : : do {
3202 : 2 : l = l->next;
3203 [ + - ][ - + ]: 3 : if (l == &it->origin_css->cgroup->cset_links) {
3204 : 1 : it->cset_link = NULL;
3205 : : return;
3206 : : }
3207 : : link = list_entry(l, struct cgrp_cset_link, cset_link);
3208 : 1 : cset = link->cset;
3209 [ # # ][ - + ]: 1 : } while (list_empty(&cset->tasks));
3210 : 1 : it->cset_link = l;
3211 : 1 : it->task = cset->tasks.next;
3212 : : }
3213 : :
3214 : : /**
3215 : : * css_task_iter_start - initiate task iteration
3216 : : * @css: the css to walk tasks of
3217 : : * @it: the task iterator to use
3218 : : *
3219 : : * Initiate iteration through the tasks of @css. The caller can call
3220 : : * css_task_iter_next() to walk through the tasks until the function
3221 : : * returns NULL. On completion of iteration, css_task_iter_end() must be
3222 : : * called.
3223 : : *
3224 : : * Note that this function acquires a lock which is released when the
3225 : : * iteration finishes. The caller can't sleep while iteration is in
3226 : : * progress.
3227 : : */
3228 : 0 : void css_task_iter_start(struct cgroup_subsys_state *css,
3229 : : struct css_task_iter *it)
3230 : : __acquires(css_set_lock)
3231 : : {
3232 : : /*
3233 : : * The first time anyone tries to iterate across a css, we need to
3234 : : * enable the list linking each css_set to its tasks, and fix up
3235 : : * all existing tasks.
3236 : : */
3237 [ + - ]: 1 : if (!use_task_css_set_links)
3238 : 1 : cgroup_enable_task_cg_lists();
3239 : :
3240 : 1 : read_lock(&css_set_lock);
3241 : :
3242 : 2 : it->origin_css = css;
3243 : 2 : it->cset_link = &css->cgroup->cset_links;
3244 : :
3245 : : css_advance_task_iter(it);
3246 : 1 : }
3247 : :
3248 : : /**
3249 : : * css_task_iter_next - return the next task for the iterator
3250 : : * @it: the task iterator being iterated
3251 : : *
3252 : : * The "next" function for task iteration. @it should have been
3253 : : * initialized via css_task_iter_start(). Returns NULL when the iteration
3254 : : * reaches the end.
3255 : : */
3256 : 0 : struct task_struct *css_task_iter_next(struct css_task_iter *it)
3257 : : {
3258 : : struct task_struct *res;
3259 : 93 : struct list_head *l = it->task;
3260 : : struct cgrp_cset_link *link;
3261 : :
3262 : : /* If the iterator cg is NULL, we have no tasks */
3263 [ + ]: 93 : if (!it->cset_link)
3264 : : return NULL;
3265 : : res = list_entry(l, struct task_struct, cg_list);
3266 : : /* Advance iterator to find next entry */
3267 : 185 : l = l->next;
3268 : : link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3269 [ + + ]: 185 : if (l == &link->cset->tasks) {
3270 : : /*
3271 : : * We reached the end of this task list - move on to the
3272 : : * next cgrp_cset_link.
3273 : : */
3274 : : css_advance_task_iter(it);
3275 : : } else {
3276 : 91 : it->task = l;
3277 : : }
3278 : 92 : return res;
3279 : : }
3280 : :
3281 : : /**
3282 : : * css_task_iter_end - finish task iteration
3283 : : * @it: the task iterator to finish
3284 : : *
3285 : : * Finish task iteration started by css_task_iter_start().
3286 : : */
3287 : 0 : void css_task_iter_end(struct css_task_iter *it)
3288 : : __releases(css_set_lock)
3289 : : {
3290 : : read_unlock(&css_set_lock);
3291 : 1 : }
3292 : :
3293 : : static inline int started_after_time(struct task_struct *t1,
3294 : : struct timespec *time,
3295 : : struct task_struct *t2)
3296 : : {
3297 : : int start_diff = timespec_compare(&t1->start_time, time);
3298 [ # # ][ # # ]: 0 : if (start_diff > 0) {
3299 : : return 1;
3300 [ # # ][ # # ]: 0 : } else if (start_diff < 0) {
3301 : : return 0;
3302 : : } else {
3303 : : /*
3304 : : * Arbitrarily, if two processes started at the same
3305 : : * time, we'll say that the lower pointer value
3306 : : * started first. Note that t2 may have exited by now
3307 : : * so this may not be a valid pointer any longer, but
3308 : : * that's fine - it still serves to distinguish
3309 : : * between two tasks started (effectively) simultaneously.
3310 : : */
3311 : 0 : return t1 > t2;
3312 : : }
3313 : : }
3314 : :
3315 : : /*
3316 : : * This function is a callback from heap_insert() and is used to order
3317 : : * the heap.
3318 : : * In this case we order the heap in descending task start time.
3319 : : */
3320 : 0 : static inline int started_after(void *p1, void *p2)
3321 : : {
3322 : : struct task_struct *t1 = p1;
3323 : : struct task_struct *t2 = p2;
3324 : 0 : return started_after_time(t1, &t2->start_time, t2);
3325 : : }
3326 : :
3327 : : /**
3328 : : * css_scan_tasks - iterate though all the tasks in a css
3329 : : * @css: the css to iterate tasks of
3330 : : * @test: optional test callback
3331 : : * @process: process callback
3332 : : * @data: data passed to @test and @process
3333 : : * @heap: optional pre-allocated heap used for task iteration
3334 : : *
3335 : : * Iterate through all the tasks in @css, calling @test for each, and if it
3336 : : * returns %true, call @process for it also.
3337 : : *
3338 : : * @test may be NULL, meaning always true (select all tasks), which
3339 : : * effectively duplicates css_task_iter_{start,next,end}() but does not
3340 : : * lock css_set_lock for the call to @process.
3341 : : *
3342 : : * It is guaranteed that @process will act on every task that is a member
3343 : : * of @css for the duration of this call. This function may or may not
3344 : : * call @process for tasks that exit or move to a different css during the
3345 : : * call, or are forked or move into the css during the call.
3346 : : *
3347 : : * Note that @test may be called with locks held, and may in some
3348 : : * situations be called multiple times for the same task, so it should be
3349 : : * cheap.
3350 : : *
3351 : : * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3352 : : * heap operations (and its "gt" member will be overwritten), else a
3353 : : * temporary heap will be used (allocation of which may cause this function
3354 : : * to fail).
3355 : : */
3356 : 0 : int css_scan_tasks(struct cgroup_subsys_state *css,
3357 : : bool (*test)(struct task_struct *, void *),
3358 : : void (*process)(struct task_struct *, void *),
3359 : : void *data, struct ptr_heap *heap)
3360 : : {
3361 : : int retval, i;
3362 : : struct css_task_iter it;
3363 : : struct task_struct *p, *dropped;
3364 : : /* Never dereference latest_task, since it's not refcounted */
3365 : : struct task_struct *latest_task = NULL;
3366 : : struct ptr_heap tmp_heap;
3367 : : struct timespec latest_time = { 0, 0 };
3368 : :
3369 [ # # ]: 0 : if (heap) {
3370 : : /* The caller supplied our heap and pre-allocated its memory */
3371 : 0 : heap->gt = &started_after;
3372 : : } else {
3373 : : /* We need to allocate our own heap memory */
3374 : : heap = &tmp_heap;
3375 : 0 : retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3376 [ # # ]: 0 : if (retval)
3377 : : /* cannot allocate the heap */
3378 : : return retval;
3379 : : }
3380 : :
3381 : : again:
3382 : : /*
3383 : : * Scan tasks in the css, using the @test callback to determine
3384 : : * which are of interest, and invoking @process callback on the
3385 : : * ones which need an update. Since we don't want to hold any
3386 : : * locks during the task updates, gather tasks to be processed in a
3387 : : * heap structure. The heap is sorted by descending task start
3388 : : * time. If the statically-sized heap fills up, we overflow tasks
3389 : : * that started later, and in future iterations only consider tasks
3390 : : * that started after the latest task in the previous pass. This
3391 : : * guarantees forward progress and that we don't miss any tasks.
3392 : : */
3393 : 0 : heap->size = 0;
3394 : 0 : css_task_iter_start(css, &it);
3395 [ # # ]: 0 : while ((p = css_task_iter_next(&it))) {
3396 : : /*
3397 : : * Only affect tasks that qualify per the caller's callback,
3398 : : * if he provided one
3399 : : */
3400 [ # # ][ # # ]: 0 : if (test && !test(p, data))
3401 : 0 : continue;
3402 : : /*
3403 : : * Only process tasks that started after the last task
3404 : : * we processed
3405 : : */
3406 [ # # ]: 0 : if (!started_after_time(p, &latest_time, latest_task))
3407 : 0 : continue;
3408 : 0 : dropped = heap_insert(heap, p);
3409 [ # # ]: 0 : if (dropped == NULL) {
3410 : : /*
3411 : : * The new task was inserted; the heap wasn't
3412 : : * previously full
3413 : : */
3414 : 0 : get_task_struct(p);
3415 [ # # ]: 0 : } else if (dropped != p) {
3416 : : /*
3417 : : * The new task was inserted, and pushed out a
3418 : : * different task
3419 : : */
3420 : 0 : get_task_struct(p);
3421 : : put_task_struct(dropped);
3422 : : }
3423 : : /*
3424 : : * Else the new task was newer than anything already in
3425 : : * the heap and wasn't inserted
3426 : : */
3427 : : }
3428 : 0 : css_task_iter_end(&it);
3429 : :
3430 [ # # ]: 0 : if (heap->size) {
3431 [ # # ]: 0 : for (i = 0; i < heap->size; i++) {
3432 : 0 : struct task_struct *q = heap->ptrs[i];
3433 [ # # ]: 0 : if (i == 0) {
3434 : 0 : latest_time = q->start_time;
3435 : : latest_task = q;
3436 : : }
3437 : : /* Process the task per the caller's callback */
3438 : 0 : process(q, data);
3439 : : put_task_struct(q);
3440 : : }
3441 : : /*
3442 : : * If we had to process any tasks at all, scan again
3443 : : * in case some of them were in the middle of forking
3444 : : * children that didn't get processed.
3445 : : * Not the most efficient way to do it, but it avoids
3446 : : * having to take callback_mutex in the fork path
3447 : : */
3448 : : goto again;
3449 : : }
3450 [ # # ]: 0 : if (heap == &tmp_heap)
3451 : 0 : heap_free(&tmp_heap);
3452 : : return 0;
3453 : : }
3454 : :
3455 : 0 : static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3456 : : {
3457 : : struct cgroup *new_cgroup = data;
3458 : :
3459 : 0 : mutex_lock(&cgroup_mutex);
3460 : 0 : cgroup_attach_task(new_cgroup, task, false);
3461 : 0 : mutex_unlock(&cgroup_mutex);
3462 : 0 : }
3463 : :
3464 : : /**
3465 : : * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3466 : : * @to: cgroup to which the tasks will be moved
3467 : : * @from: cgroup in which the tasks currently reside
3468 : : */
3469 : 0 : int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3470 : : {
3471 : 0 : return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3472 : : to, NULL);
3473 : : }
3474 : :
3475 : : /*
3476 : : * Stuff for reading the 'tasks'/'procs' files.
3477 : : *
3478 : : * Reading this file can return large amounts of data if a cgroup has
3479 : : * *lots* of attached tasks. So it may need several calls to read(),
3480 : : * but we cannot guarantee that the information we produce is correct
3481 : : * unless we produce it entirely atomically.
3482 : : *
3483 : : */
3484 : :
/* selects which of the two pidlist files a pidlist stands for */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,	/* the "procs" file */
	CGROUP_FILE_TASKS,	/* the "tasks" file */
};
3490 : :
3491 : : /*
3492 : : * A pidlist is a list of pids that virtually represents the contents of one
3493 : : * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3494 : : * a pair (one each for procs, tasks) for each pid namespace that's relevant
3495 : : * to the cgroup.
3496 : : */
3497 : : struct cgroup_pidlist {
3498 : : /*
3499 : : * used to find which pidlist is wanted. doesn't change as long as
3500 : : * this particular list stays in the list.
3501 : : */
3502 : : struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3503 : : /* array of xids */
3504 : : pid_t *list;
3505 : : /* how many elements the above list has */
3506 : : int length;
3507 : : /* how many files are using the current array */
3508 : : int use_count;
3509 : : /* each of these stored in a list by its cgroup */
3510 : : struct list_head links;
3511 : : /* pointer to the cgroup we belong to, for list removal purposes */
3512 : : struct cgroup *owner;
3513 : : /* protects the other fields */
3514 : : struct rw_semaphore rwsem;
3515 : : };
3516 : :
3517 : : /*
3518 : : * The following two functions "fix" the issue where there are more pids
3519 : : * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3520 : : * TODO: replace with a kernel-wide solution to this problem
3521 : : */
3522 : : #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3523 : 0 : static void *pidlist_allocate(int count)
3524 : : {
3525 [ - + ]: 1 : if (PIDLIST_TOO_LARGE(count))
3526 : 0 : return vmalloc(count * sizeof(pid_t));
3527 : : else
3528 : 1 : return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3529 : : }
/* Release a pid array with whichever of vfree()/kfree() matches its address. */
static void pidlist_free(void *p)
{
	if (!is_vmalloc_addr(p)) {
		kfree(p);
		return;
	}
	vfree(p);
}
3537 : :
/*
 * pidlist_uniq - collapse runs of equal adjacent entries in place
 * Only neighbouring duplicates are removed, so the caller is expected to
 * pass a sorted list.
 * Returns the number of entries kept at the front of @list.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int kept = 1;	/* element 0 is always retained */
	int i;

	/* nothing to compact in an empty or single-element list */
	if (length == 0 || length == 1)
		return length;

	for (i = 1; i < length; i++) {
		/* same as the entry just before it: skip */
		if (list[i] == list[i - 1])
			continue;
		/* first entry of a new run: append it to the kept prefix */
		list[kept++] = list[i];
	}
	return kept;
}
3567 : :
/*
 * sort() comparison callback for pid_t elements.
 *
 * Returns a negative, zero or positive value when *a is respectively
 * less than, equal to or greater than *b.  The result comes from two
 * comparisons rather than a subtraction, so it cannot overflow whatever
 * the operand values are, and the const qualifier of the arguments is
 * kept.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t lhs = *(const pid_t *)a;
	const pid_t rhs = *(const pid_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
3572 : :
3573 : : /*
3574 : : * find the appropriate pidlist for our purpose (given procs vs tasks)
3575 : : * returns with the lock on that pidlist already held, and takes care
3576 : : * of the use count, or returns NULL with no locks held if we're out of
3577 : : * memory.
3578 : : */
3579 : 0 : static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3580 : : enum cgroup_filetype type)
3581 : : {
3582 : : struct cgroup_pidlist *l;
3583 : : /* don't need task_nsproxy() if we're looking at ourself */
3584 : 1 : struct pid_namespace *ns = task_active_pid_ns(current);
3585 : :
3586 : : /*
3587 : : * We can't drop the pidlist_mutex before taking the l->rwsem in case
3588 : : * the last ref-holder is trying to remove l from the list at the same
3589 : : * time. Holding the pidlist_mutex precludes somebody taking whichever
3590 : : * list we find out from under us - compare release_pid_array().
3591 : : */
3592 : 1 : mutex_lock(&cgrp->pidlist_mutex);
3593 [ - + ]: 2 : list_for_each_entry(l, &cgrp->pidlists, links) {
3594 [ # # ][ # # ]: 0 : if (l->key.type == type && l->key.ns == ns) {
3595 : : /* make sure l doesn't vanish out from under us */
3596 : 0 : down_write(&l->rwsem);
3597 : 0 : mutex_unlock(&cgrp->pidlist_mutex);
3598 : 0 : return l;
3599 : : }
3600 : : }
3601 : : /* entry not found; create a new one */
3602 : : l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3603 [ - + ]: 1 : if (!l) {
3604 : 0 : mutex_unlock(&cgrp->pidlist_mutex);
3605 : 0 : return l;
3606 : : }
3607 : 1 : init_rwsem(&l->rwsem);
3608 : 1 : down_write(&l->rwsem);
3609 : 1 : l->key.type = type;
3610 : 1 : l->key.ns = get_pid_ns(ns);
3611 : 1 : l->owner = cgrp;
3612 : 1 : list_add(&l->links, &cgrp->pidlists);
3613 : 1 : mutex_unlock(&cgrp->pidlist_mutex);
3614 : 1 : return l;
3615 : : }
3616 : :
3617 : : /*
3618 : : * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3619 : : */
3620 : 0 : static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3621 : : struct cgroup_pidlist **lp)
3622 : : {
3623 : : pid_t *array;
3624 : : int length;
3625 : : int pid, n = 0; /* used for populating the array */
3626 : : struct css_task_iter it;
3627 : : struct task_struct *tsk;
3628 : : struct cgroup_pidlist *l;
3629 : :
3630 : : /*
3631 : : * If cgroup gets more users after we read count, we won't have
3632 : : * enough space - tough. This race is indistinguishable to the
3633 : : * caller from the case that the additional cgroup users didn't
3634 : : * show up until sometime later on.
3635 : : */
3636 : 1 : length = cgroup_task_count(cgrp);
3637 : 1 : array = pidlist_allocate(length);
3638 [ + - ]: 1 : if (!array)
3639 : : return -ENOMEM;
3640 : : /* now, populate the array */
3641 : 1 : css_task_iter_start(&cgrp->dummy_css, &it);
3642 [ + + ]: 94 : while ((tsk = css_task_iter_next(&it))) {
3643 [ + - ]: 92 : if (unlikely(n == length))
3644 : : break;
3645 : : /* get tgid or pid for procs or tasks file respectively */
3646 [ - + ]: 92 : if (type == CGROUP_FILE_PROCS)
3647 : : pid = task_tgid_vnr(tsk);
3648 : : else
3649 : : pid = task_pid_vnr(tsk);
3650 [ + - ]: 92 : if (pid > 0) /* make sure to only use valid results */
3651 : 93 : array[n++] = pid;
3652 : : }
3653 : 1 : css_task_iter_end(&it);
3654 : : length = n;
3655 : : /* now sort & (if procs) strip out duplicates */
3656 : 1 : sort(array, length, sizeof(pid_t), cmppid, NULL);
3657 [ - + ]: 1 : if (type == CGROUP_FILE_PROCS)
3658 : 0 : length = pidlist_uniq(array, length);
3659 : 1 : l = cgroup_pidlist_find(cgrp, type);
3660 [ - + ]: 1 : if (!l) {
3661 : 0 : pidlist_free(array);
3662 : 0 : return -ENOMEM;
3663 : : }
3664 : : /* store array, freeing old if necessary - lock already held */
3665 : 1 : pidlist_free(l->list);
3666 : 1 : l->list = array;
3667 : 1 : l->length = length;
3668 : 1 : l->use_count++;
3669 : 1 : up_write(&l->rwsem);
3670 : 1 : *lp = l;
3671 : 1 : return 0;
3672 : : }
3673 : :
3674 : : /**
3675 : : * cgroupstats_build - build and fill cgroupstats
3676 : : * @stats: cgroupstats to fill information into
3677 : : * @dentry: A dentry entry belonging to the cgroup for which stats have
3678 : : * been requested.
3679 : : *
3680 : : * Build and fill cgroupstats so that taskstats can export it to user
3681 : : * space.
3682 : : */
3683 : 0 : int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3684 : : {
3685 : : int ret = -EINVAL;
3686 : : struct cgroup *cgrp;
3687 : : struct css_task_iter it;
3688 : : struct task_struct *tsk;
3689 : :
3690 : : /*
3691 : : * Validate dentry by checking the superblock operations,
3692 : : * and make sure it's a directory.
3693 : : */
3694 [ # # ][ # # ]: 0 : if (dentry->d_sb->s_op != &cgroup_ops ||
3695 : 0 : !S_ISDIR(dentry->d_inode->i_mode))
3696 : : goto err;
3697 : :
3698 : : ret = 0;
3699 : 0 : cgrp = dentry->d_fsdata;
3700 : :
3701 : 0 : css_task_iter_start(&cgrp->dummy_css, &it);
3702 [ # # ]: 0 : while ((tsk = css_task_iter_next(&it))) {
3703 [ # # # # : 0 : switch (tsk->state) {
# ]
3704 : : case TASK_RUNNING:
3705 : 0 : stats->nr_running++;
3706 : 0 : break;
3707 : : case TASK_INTERRUPTIBLE:
3708 : 0 : stats->nr_sleeping++;
3709 : 0 : break;
3710 : : case TASK_UNINTERRUPTIBLE:
3711 : 0 : stats->nr_uninterruptible++;
3712 : 0 : break;
3713 : : case TASK_STOPPED:
3714 : 0 : stats->nr_stopped++;
3715 : 0 : break;
3716 : : default:
3717 : : if (delayacct_is_task_waiting_on_io(tsk))
3718 : : stats->nr_io_wait++;
3719 : : break;
3720 : : }
3721 : : }
3722 : 0 : css_task_iter_end(&it);
3723 : :
3724 : : err:
3725 : 0 : return ret;
3726 : : }
3727 : :
3728 : :
3729 : : /*
3730 : : * seq_file methods for the tasks/procs files. The seq_file position is the
3731 : : * next pid to display; the seq_file iterator is a pointer to the pid
3732 : : * in the cgroup->l->list array.
3733 : : */
3734 : :
3735 : 0 : static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3736 : : {
3737 : : /*
3738 : : * Initially we receive a position value that corresponds to
3739 : : * one more than the last pid shown (or 0 on the first call or
3740 : : * after a seek to the start). Use a binary-search to find the
3741 : : * next pid to display, if any
3742 : : */
3743 : 2 : struct cgroup_pidlist *l = s->private;
3744 : 2 : int index = 0, pid = *pos;
3745 : : int *iter;
3746 : :
3747 : 2 : down_read(&l->rwsem);
3748 [ + + ]: 2 : if (pid) {
3749 : 1 : int end = l->length;
3750 : :
3751 [ + + ]: 7 : while (index < end) {
3752 : 6 : int mid = (index + end) / 2;
3753 [ + - ]: 6 : if (l->list[mid] == pid) {
3754 : : index = mid;
3755 : : break;
3756 [ + - ]: 6 : } else if (l->list[mid] <= pid)
3757 : 6 : index = mid + 1;
3758 : : else
3759 : : end = mid;
3760 : : }
3761 : : }
3762 : : /* If we're off the end of the array, we're done */
3763 [ # # ]: 2 : if (index >= l->length)
3764 : : return NULL;
3765 : : /* Update the abstract position to be the actual pid that we found */
3766 : 1 : iter = l->list + index;
3767 : 1 : *pos = *iter;
3768 : 1 : return iter;
3769 : : }
3770 : :
3771 : 0 : static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3772 : : {
3773 : 2 : struct cgroup_pidlist *l = s->private;
3774 : 2 : up_read(&l->rwsem);
3775 : 2 : }
3776 : :
3777 : 0 : static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3778 : : {
3779 : 92 : struct cgroup_pidlist *l = s->private;
3780 : : pid_t *p = v;
3781 : 92 : pid_t *end = l->list + l->length;
3782 : : /*
3783 : : * Advance to the next pid in the array. If this goes off the
3784 : : * end, we're done
3785 : : */
3786 : 92 : p++;
3787 [ + + ]: 92 : if (p >= end) {
3788 : : return NULL;
3789 : : } else {
3790 : 91 : *pos = *p;
3791 : 91 : return p;
3792 : : }
3793 : : }
3794 : :
/* seq_file ->show(): print the pid that @v points at, one per line. */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	int *pidp = v;

	return seq_printf(s, "%d\n", *pidp);
}
3799 : :
3800 : : /*
3801 : : * seq_operations functions for iterating on pidlists through seq_file -
3802 : : * independent of whether it's tasks or procs
3803 : : */
3804 : : static const struct seq_operations cgroup_pidlist_seq_operations = {
3805 : : .start = cgroup_pidlist_start,
3806 : : .stop = cgroup_pidlist_stop,
3807 : : .next = cgroup_pidlist_next,
3808 : : .show = cgroup_pidlist_show,
3809 : : };
3810 : :
3811 : 0 : static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3812 : : {
3813 : : /*
3814 : : * the case where we're the last user of this particular pidlist will
3815 : : * have us remove it from the cgroup's list, which entails taking the
3816 : : * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3817 : : * pidlist_mutex, we have to take pidlist_mutex first.
3818 : : */
3819 : 1 : mutex_lock(&l->owner->pidlist_mutex);
3820 : 1 : down_write(&l->rwsem);
3821 [ - + ]: 2 : BUG_ON(!l->use_count);
3822 [ + - ]: 1 : if (!--l->use_count) {
3823 : : /* we're the last user if refcount is 0; remove and free */
3824 : : list_del(&l->links);
3825 : 1 : mutex_unlock(&l->owner->pidlist_mutex);
3826 : 1 : pidlist_free(l->list);
3827 : : put_pid_ns(l->key.ns);
3828 : 1 : up_write(&l->rwsem);
3829 : 1 : kfree(l);
3830 : 1 : return;
3831 : : }
3832 : 0 : mutex_unlock(&l->owner->pidlist_mutex);
3833 : 0 : up_write(&l->rwsem);
3834 : : }
3835 : :
3836 : 0 : static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3837 : : {
3838 : : struct cgroup_pidlist *l;
3839 [ + - ]: 1 : if (!(file->f_mode & FMODE_READ))
3840 : : return 0;
3841 : : /*
3842 : : * the seq_file will only be initialized if the file was opened for
3843 : : * reading; hence we check if it's not null only in that case.
3844 : : */
3845 : 1 : l = ((struct seq_file *)file->private_data)->private;
3846 : 1 : cgroup_release_pid_array(l);
3847 : 1 : return seq_release(inode, file);
3848 : : }
3849 : :
3850 : : static const struct file_operations cgroup_pidlist_operations = {
3851 : : .read = seq_read,
3852 : : .llseek = seq_lseek,
3853 : : .write = cgroup_file_write,
3854 : : .release = cgroup_pidlist_release,
3855 : : };
3856 : :
3857 : : /*
3858 : : * The following functions handle opens on a file that displays a pidlist
3859 : : * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3860 : : * in the cgroup.
3861 : : */
3862 : : /* helper function for the two below it */
3863 : 0 : static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3864 : : {
3865 : 1 : struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3866 : : struct cgroup_pidlist *l;
3867 : : int retval;
3868 : :
3869 : : /* Nothing to do for write-only files */
3870 [ + - ]: 1 : if (!(file->f_mode & FMODE_READ))
3871 : : return 0;
3872 : :
3873 : : /* have the array populated */
3874 : 1 : retval = pidlist_array_load(cgrp, type, &l);
3875 [ + - ]: 1 : if (retval)
3876 : : return retval;
3877 : : /* configure file information */
3878 : 1 : file->f_op = &cgroup_pidlist_operations;
3879 : :
3880 : 1 : retval = seq_open(file, &cgroup_pidlist_seq_operations);
3881 [ - + ]: 2 : if (retval) {
3882 : 0 : cgroup_release_pid_array(l);
3883 : 0 : return retval;
3884 : : }
3885 : 1 : ((struct seq_file *)file->private_data)->private = l;
3886 : 1 : return 0;
3887 : : }
3888 : 0 : static int cgroup_tasks_open(struct inode *unused, struct file *file)
3889 : : {
3890 : 1 : return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3891 : : }
3892 : 0 : static int cgroup_procs_open(struct inode *unused, struct file *file)
3893 : : {
3894 : 0 : return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3895 : : }
3896 : :
3897 : 0 : static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3898 : : struct cftype *cft)
3899 : : {
3900 : 8 : return notify_on_release(css->cgroup);
3901 : : }
3902 : :
3903 : 0 : static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3904 : : struct cftype *cft, u64 val)
3905 : : {
3906 : 2 : clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3907 [ + + ]: 2 : if (val)
3908 : 1 : set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3909 : : else
3910 : 1 : clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3911 : 2 : return 0;
3912 : : }
3913 : :
3914 : : /*
3915 : : * When dput() is called asynchronously, if umount has been done and
3916 : : * then deactivate_super() in cgroup_free_fn() kills the superblock,
3917 : : * there's a small window that vfs will see the root dentry with non-zero
3918 : : * refcnt and trigger BUG().
3919 : : *
3920 : : * That's why we hold a reference before dput() and drop it right after.
3921 : : */
3922 : 0 : static void cgroup_dput(struct cgroup *cgrp)
3923 : : {
3924 : 0 : struct super_block *sb = cgrp->root->sb;
3925 : :
3926 : 0 : atomic_inc(&sb->s_active);
3927 : 0 : dput(cgrp->dentry);
3928 : 0 : deactivate_super(sb);
3929 : 0 : }
3930 : :
3931 : : /*
3932 : : * Unregister event and free resources.
3933 : : *
3934 : : * Gets called from workqueue.
3935 : : */
3936 : 0 : static void cgroup_event_remove(struct work_struct *work)
3937 : : {
3938 : 0 : struct cgroup_event *event = container_of(work, struct cgroup_event,
3939 : : remove);
3940 : 0 : struct cgroup_subsys_state *css = event->css;
3941 : :
3942 : 0 : remove_wait_queue(event->wqh, &event->wait);
3943 : :
3944 : 0 : event->cft->unregister_event(css, event->cft, event->eventfd);
3945 : :
3946 : : /* Notify userspace the event is going away. */
3947 : 0 : eventfd_signal(event->eventfd, 1);
3948 : :
3949 : 0 : eventfd_ctx_put(event->eventfd);
3950 : 0 : kfree(event);
3951 : : css_put(css);
3952 : 0 : }
3953 : :
3954 : : /*
3955 : : * Gets called on POLLHUP on eventfd when user closes it.
3956 : : *
3957 : : * Called with wqh->lock held and interrupts disabled.
3958 : : */
3959 : 0 : static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3960 : : int sync, void *key)
3961 : : {
3962 : : struct cgroup_event *event = container_of(wait,
3963 : : struct cgroup_event, wait);
3964 : 0 : struct cgroup *cgrp = event->css->cgroup;
3965 : 0 : unsigned long flags = (unsigned long)key;
3966 : :
3967 [ # # ]: 0 : if (flags & POLLHUP) {
3968 : : /*
3969 : : * If the event has been detached at cgroup removal, we
3970 : : * can simply return knowing the other side will cleanup
3971 : : * for us.
3972 : : *
3973 : : * We can't race against event freeing since the other
3974 : : * side will require wqh->lock via remove_wait_queue(),
3975 : : * which we hold.
3976 : : */
3977 : : spin_lock(&cgrp->event_list_lock);
3978 [ # # ]: 0 : if (!list_empty(&event->list)) {
3979 : : list_del_init(&event->list);
3980 : : /*
3981 : : * We are in atomic context, but cgroup_event_remove()
3982 : : * may sleep, so we have to call it in workqueue.
3983 : : */
3984 : 0 : schedule_work(&event->remove);
3985 : : }
3986 : : spin_unlock(&cgrp->event_list_lock);
3987 : : }
3988 : :
3989 : 0 : return 0;
3990 : : }
3991 : :
3992 : 0 : static void cgroup_event_ptable_queue_proc(struct file *file,
3993 : : wait_queue_head_t *wqh, poll_table *pt)
3994 : : {
3995 : : struct cgroup_event *event = container_of(pt,
3996 : : struct cgroup_event, pt);
3997 : :
3998 : 0 : event->wqh = wqh;
3999 : 0 : add_wait_queue(wqh, &event->wait);
4000 : 0 : }
4001 : :
4002 : : /*
4003 : : * Parse input and register new cgroup event handler.
4004 : : *
4005 : : * Input must be in format '<event_fd> <control_fd> <args>'.
4006 : : * Interpretation of args is defined by control file implementation.
4007 : : */
4008 : 0 : static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4009 : : struct cftype *cft, const char *buffer)
4010 : : {
4011 : 0 : struct cgroup *cgrp = dummy_css->cgroup;
4012 : : struct cgroup_event *event;
4013 : : struct cgroup_subsys_state *cfile_css;
4014 : : unsigned int efd, cfd;
4015 : : struct fd efile;
4016 : : struct fd cfile;
4017 : : char *endp;
4018 : : int ret;
4019 : :
4020 : 0 : efd = simple_strtoul(buffer, &endp, 10);
4021 [ # # ]: 0 : if (*endp != ' ')
4022 : : return -EINVAL;
4023 : 0 : buffer = endp + 1;
4024 : :
4025 : 0 : cfd = simple_strtoul(buffer, &endp, 10);
4026 [ # # ]: 0 : if ((*endp != ' ') && (*endp != '\0'))
4027 : : return -EINVAL;
4028 : 0 : buffer = endp + 1;
4029 : :
4030 : : event = kzalloc(sizeof(*event), GFP_KERNEL);
4031 [ # # ]: 0 : if (!event)
4032 : : return -ENOMEM;
4033 : :
4034 : 0 : INIT_LIST_HEAD(&event->list);
4035 : : init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4036 : : init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4037 : 0 : INIT_WORK(&event->remove, cgroup_event_remove);
4038 : :
4039 : : efile = fdget(efd);
4040 [ # # ]: 0 : if (!efile.file) {
4041 : : ret = -EBADF;
4042 : : goto out_kfree;
4043 : : }
4044 : :
4045 : 0 : event->eventfd = eventfd_ctx_fileget(efile.file);
4046 [ # # ]: 0 : if (IS_ERR(event->eventfd)) {
4047 : : ret = PTR_ERR(event->eventfd);
4048 : 0 : goto out_put_efile;
4049 : : }
4050 : :
4051 : : cfile = fdget(cfd);
4052 [ # # ]: 0 : if (!cfile.file) {
4053 : : ret = -EBADF;
4054 : : goto out_put_eventfd;
4055 : : }
4056 : :
4057 : : /* the process need read permission on control file */
4058 : : /* AV: shouldn't we check that it's been opened for read instead? */
4059 : 0 : ret = inode_permission(file_inode(cfile.file), MAY_READ);
4060 [ # # ]: 0 : if (ret < 0)
4061 : : goto out_put_cfile;
4062 : :
4063 : 0 : event->cft = __file_cft(cfile.file);
4064 [ # # ]: 0 : if (IS_ERR(event->cft)) {
4065 : : ret = PTR_ERR(event->cft);
4066 : 0 : goto out_put_cfile;
4067 : : }
4068 : :
4069 [ # # ]: 0 : if (!event->cft->ss) {
4070 : : ret = -EBADF;
4071 : : goto out_put_cfile;
4072 : : }
4073 : :
4074 : : /*
4075 : : * Determine the css of @cfile, verify it belongs to the same
4076 : : * cgroup as cgroup.event_control, and associate @event with it.
4077 : : * Remaining events are automatically removed on cgroup destruction
4078 : : * but the removal is asynchronous, so take an extra ref.
4079 : : */
4080 : : rcu_read_lock();
4081 : :
4082 : : ret = -EINVAL;
4083 : 0 : event->css = cgroup_css(cgrp, event->cft->ss);
4084 : 0 : cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4085 [ # # ][ # # ]: 0 : if (event->css && event->css == cfile_css && css_tryget(event->css))
[ # # ]
4086 : : ret = 0;
4087 : :
4088 : : rcu_read_unlock();
4089 [ # # ]: 0 : if (ret)
4090 : : goto out_put_cfile;
4091 : :
4092 [ # # ][ # # ]: 0 : if (!event->cft->register_event || !event->cft->unregister_event) {
4093 : : ret = -EINVAL;
4094 : : goto out_put_css;
4095 : : }
4096 : :
4097 : 0 : ret = event->cft->register_event(event->css, event->cft,
4098 : : event->eventfd, buffer);
4099 [ # # ]: 0 : if (ret)
4100 : : goto out_put_css;
4101 : :
4102 : 0 : efile.file->f_op->poll(efile.file, &event->pt);
4103 : :
4104 : : spin_lock(&cgrp->event_list_lock);
4105 : 0 : list_add(&event->list, &cgrp->event_list);
4106 : : spin_unlock(&cgrp->event_list_lock);
4107 : :
4108 : : fdput(cfile);
4109 : : fdput(efile);
4110 : :
4111 : : return 0;
4112 : :
4113 : : out_put_css:
4114 : 0 : css_put(event->css);
4115 : : out_put_cfile:
4116 : : fdput(cfile);
4117 : : out_put_eventfd:
4118 : 0 : eventfd_ctx_put(event->eventfd);
4119 : : out_put_efile:
4120 : : fdput(efile);
4121 : : out_kfree:
4122 : 0 : kfree(event);
4123 : :
4124 : 0 : return ret;
4125 : : }
4126 : :
4127 : 0 : static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4128 : : struct cftype *cft)
4129 : : {
4130 : 0 : return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4131 : : }
4132 : :
4133 : 0 : static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4134 : : struct cftype *cft, u64 val)
4135 : : {
4136 [ # # ]: 0 : if (val)
4137 : 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4138 : : else
4139 : 0 : clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4140 : 0 : return 0;
4141 : : }
4142 : :
4143 : : static struct cftype cgroup_base_files[] = {
4144 : : {
4145 : : .name = "cgroup.procs",
4146 : : .open = cgroup_procs_open,
4147 : : .write_u64 = cgroup_procs_write,
4148 : : .release = cgroup_pidlist_release,
4149 : : .mode = S_IRUGO | S_IWUSR,
4150 : : },
4151 : : {
4152 : : .name = "cgroup.event_control",
4153 : : .write_string = cgroup_write_event_control,
4154 : : .mode = S_IWUGO,
4155 : : },
4156 : : {
4157 : : .name = "cgroup.clone_children",
4158 : : .flags = CFTYPE_INSANE,
4159 : : .read_u64 = cgroup_clone_children_read,
4160 : : .write_u64 = cgroup_clone_children_write,
4161 : : },
4162 : : {
4163 : : .name = "cgroup.sane_behavior",
4164 : : .flags = CFTYPE_ONLY_ON_ROOT,
4165 : : .read_seq_string = cgroup_sane_behavior_show,
4166 : : },
4167 : :
4168 : : /*
4169 : : * Historical crazy stuff. These don't have "cgroup." prefix and
4170 : : * don't exist if sane_behavior. If you're depending on these, be
4171 : : * prepared to be burned.
4172 : : */
4173 : : {
4174 : : .name = "tasks",
4175 : : .flags = CFTYPE_INSANE, /* use "procs" instead */
4176 : : .open = cgroup_tasks_open,
4177 : : .write_u64 = cgroup_tasks_write,
4178 : : .release = cgroup_pidlist_release,
4179 : : .mode = S_IRUGO | S_IWUSR,
4180 : : },
4181 : : {
4182 : : .name = "notify_on_release",
4183 : : .flags = CFTYPE_INSANE,
4184 : : .read_u64 = cgroup_read_notify_on_release,
4185 : : .write_u64 = cgroup_write_notify_on_release,
4186 : : },
4187 : : {
4188 : : .name = "release_agent",
4189 : : .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4190 : : .read_seq_string = cgroup_release_agent_show,
4191 : : .write_string = cgroup_release_agent_write,
4192 : : .max_write_len = PATH_MAX,
4193 : : },
4194 : : { } /* terminate */
4195 : : };
4196 : :
4197 : : /**
4198 : : * cgroup_populate_dir - create subsys files in a cgroup directory
4199 : : * @cgrp: target cgroup
4200 : : * @subsys_mask: mask of the subsystem ids whose files should be added
4201 : : *
4202 : : * On failure, no file is added.
4203 : : */
4204 : : static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4205 : : {
4206 : : struct cgroup_subsys *ss;
4207 : : int i, ret = 0;
4208 : :
4209 : : /* process cftsets of each subsystem */
4210 : : for_each_subsys(ss, i) {
4211 : : struct cftype_set *set;
4212 : :
4213 : : if (!test_bit(i, &subsys_mask))
4214 : : continue;
4215 : :
4216 : : list_for_each_entry(set, &ss->cftsets, node) {
4217 : : ret = cgroup_addrm_files(cgrp, set->cfts, true);
4218 : : if (ret < 0)
4219 : : goto err;
4220 : : }
4221 : : }
4222 : : return 0;
4223 : : err:
4224 : : cgroup_clear_dir(cgrp, subsys_mask);
4225 : : return ret;
4226 : : }
4227 : :
4228 : : /*
4229 : : * css destruction is a four-stage process.
4230 : : *
4231 : : * 1. Destruction starts. Killing of the percpu_ref is initiated.
4232 : : * Implemented in kill_css().
4233 : : *
4234 : : * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4235 : : * and thus css_tryget() is guaranteed to fail, the css can be offlined
4236 : : * by invoking offline_css(). After offlining, the base ref is put.
4237 : : * Implemented in css_killed_work_fn().
4238 : : *
4239 : : * 3. When the percpu_ref reaches zero, the only possible remaining
4240 : : * accessors are inside RCU read sections. css_release() schedules the
4241 : : * RCU callback.
4242 : : *
4243 : : * 4. After the grace period, the css can be freed. Implemented in
4244 : : * css_free_work_fn().
4245 : : *
4246 : : * It is actually hairier because both step 2 and 4 require process context
4247 : : * and thus involve punting to css->destroy_work adding two additional
4248 : : * steps to the already complex sequence.
4249 : : */
4250 : 0 : static void css_free_work_fn(struct work_struct *work)
4251 : : {
4252 : 0 : struct cgroup_subsys_state *css =
4253 : : container_of(work, struct cgroup_subsys_state, destroy_work);
4254 : 0 : struct cgroup *cgrp = css->cgroup;
4255 : :
4256 [ # # ]: 0 : if (css->parent)
4257 : : css_put(css->parent);
4258 : :
4259 : 0 : css->ss->css_free(css);
4260 : 0 : cgroup_dput(cgrp);
4261 : 0 : }
4262 : :
4263 : 0 : static void css_free_rcu_fn(struct rcu_head *rcu_head)
4264 : : {
4265 : : struct cgroup_subsys_state *css =
4266 : : container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4267 : :
4268 : : /*
4269 : : * css holds an extra ref to @cgrp->dentry which is put on the last
4270 : : * css_put(). dput() requires process context which we don't have.
4271 : : */
4272 : 0 : INIT_WORK(&css->destroy_work, css_free_work_fn);
4273 : 0 : queue_work(cgroup_destroy_wq, &css->destroy_work);
4274 : 0 : }
4275 : :
4276 : 0 : static void css_release(struct percpu_ref *ref)
4277 : : {
4278 : : struct cgroup_subsys_state *css =
4279 : : container_of(ref, struct cgroup_subsys_state, refcnt);
4280 : :
4281 : 0 : rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
4282 : 0 : call_rcu(&css->rcu_head, css_free_rcu_fn);
4283 : 0 : }
4284 : :
4285 : 0 : static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4286 : : struct cgroup *cgrp)
4287 : : {
4288 : 0 : css->cgroup = cgrp;
4289 : 0 : css->ss = ss;
4290 : 0 : css->flags = 0;
4291 : :
4292 [ # # ]: 0 : if (cgrp->parent)
4293 : 0 : css->parent = cgroup_css(cgrp->parent, ss);
4294 : : else
4295 : 0 : css->flags |= CSS_ROOT;
4296 : :
4297 [ # # ]: 0 : BUG_ON(cgroup_css(cgrp, ss));
4298 : 0 : }
4299 : :
4300 : : /* invoke ->css_online() on a new CSS and mark it online if successful */
4301 : 0 : static int online_css(struct cgroup_subsys_state *css)
4302 : : {
4303 : 0 : struct cgroup_subsys *ss = css->ss;
4304 : : int ret = 0;
4305 : :
4306 : : lockdep_assert_held(&cgroup_mutex);
4307 : :
4308 [ # # ]: 0 : if (ss->css_online)
4309 : 0 : ret = ss->css_online(css);
4310 [ # # ]: 0 : if (!ret) {
4311 : 0 : css->flags |= CSS_ONLINE;
4312 : 0 : css->cgroup->nr_css++;
4313 : 0 : rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4314 : : }
4315 : 0 : return ret;
4316 : : }
4317 : :
4318 : : /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4319 : 0 : static void offline_css(struct cgroup_subsys_state *css)
4320 : : {
4321 : 0 : struct cgroup_subsys *ss = css->ss;
4322 : :
4323 : : lockdep_assert_held(&cgroup_mutex);
4324 : :
4325 [ # # ]: 0 : if (!(css->flags & CSS_ONLINE))
4326 : 0 : return;
4327 : :
4328 [ # # ]: 0 : if (ss->css_offline)
4329 : 0 : ss->css_offline(css);
4330 : :
4331 : 0 : css->flags &= ~CSS_ONLINE;
4332 : 0 : css->cgroup->nr_css--;
4333 : 0 : RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4334 : : }
4335 : :
4336 : : /*
4337 : : * cgroup_create - create a cgroup
4338 : : * @parent: cgroup that will be parent of the new cgroup
4339 : : * @dentry: dentry of the new cgroup
4340 : : * @mode: mode to set on new inode
4341 : : *
4342 : : * Must be called with the mutex on the parent inode held
4343 : : */
4344 : 0 : static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4345 : : umode_t mode)
4346 : : {
4347 : : struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4348 : : struct cgroup *cgrp;
4349 : : struct cgroup_name *name;
4350 : 2 : struct cgroupfs_root *root = parent->root;
4351 : : int err = 0;
4352 : : struct cgroup_subsys *ss;
4353 : 2 : struct super_block *sb = root->sb;
4354 : :
4355 : : /* allocate the cgroup and its ID, 0 is reserved for the root */
4356 : : cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4357 [ + - ]: 2 : if (!cgrp)
4358 : : return -ENOMEM;
4359 : :
4360 : 2 : name = cgroup_alloc_name(dentry);
4361 [ + - ]: 2 : if (!name)
4362 : : goto err_free_cgrp;
4363 : 2 : rcu_assign_pointer(cgrp->name, name);
4364 : :
4365 : : /*
4366 : : * Temporarily set the pointer to NULL, so idr_find() won't return
4367 : : * a half-baked cgroup.
4368 : : */
4369 : 2 : cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4370 [ + - ]: 2 : if (cgrp->id < 0)
4371 : : goto err_free_name;
4372 : :
4373 : : /*
4374 : : * Only live parents can have children. Note that the liveliness
4375 : : * check isn't strictly necessary because cgroup_mkdir() and
4376 : : * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4377 : : * anyway so that locking is contained inside cgroup proper and we
4378 : : * don't get nasty surprises if we ever grow another caller.
4379 : : */
4380 [ + - ]: 2 : if (!cgroup_lock_live_group(parent)) {
4381 : : err = -ENODEV;
4382 : : goto err_free_id;
4383 : : }
4384 : :
4385 : : /* Grab a reference on the superblock so the hierarchy doesn't
4386 : : * get deleted on unmount if there are child cgroups. This
4387 : : * can be done outside cgroup_mutex, since the sb can't
4388 : : * disappear while someone has an open control file on the
4389 : : * fs */
4390 : 2 : atomic_inc(&sb->s_active);
4391 : :
4392 : 2 : init_cgroup_housekeeping(cgrp);
4393 : :
4394 : 2 : dentry->d_fsdata = cgrp;
4395 : 2 : cgrp->dentry = dentry;
4396 : :
4397 : 2 : cgrp->parent = parent;
4398 : 2 : cgrp->dummy_css.parent = &parent->dummy_css;
4399 : 2 : cgrp->root = parent->root;
4400 : :
4401 [ + + ]: 2 : if (notify_on_release(parent))
4402 : 1 : set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4403 : :
4404 [ - + ]: 2 : if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4405 : 0 : set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4406 : :
4407 [ - + ]: 2 : for_each_root_subsys(root, ss) {
4408 : : struct cgroup_subsys_state *css;
4409 : :
4410 : 0 : css = ss->css_alloc(cgroup_css(parent, ss));
4411 [ # # ]: 2 : if (IS_ERR(css)) {
4412 : : err = PTR_ERR(css);
4413 : 0 : goto err_free_all;
4414 : : }
4415 : : css_ar[ss->subsys_id] = css;
4416 : :
4417 : 0 : err = percpu_ref_init(&css->refcnt, css_release);
4418 [ # # ]: 0 : if (err)
4419 : : goto err_free_all;
4420 : :
4421 : 0 : init_css(css, ss, cgrp);
4422 : : }
4423 : :
4424 : : /*
4425 : : * Create directory. cgroup_create_file() returns with the new
4426 : : * directory locked on success so that it can be populated without
4427 : : * dropping cgroup_mutex.
4428 : : */
4429 : 2 : err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4430 [ + - ]: 2 : if (err < 0)
4431 : : goto err_free_all;
4432 : : lockdep_assert_held(&dentry->d_inode->i_mutex);
4433 : :
4434 : 2 : cgrp->serial_nr = cgroup_serial_nr_next++;
4435 : :
4436 : : /* allocation complete, commit to creation */
4437 : 2 : list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4438 : 2 : root->number_of_cgroups++;
4439 : :
4440 : : /* hold a ref to the parent's dentry */
4441 : 2 : dget(parent->dentry);
4442 : :
4443 : : /* creation succeeded, notify subsystems */
4444 [ - + ]: 2 : for_each_root_subsys(root, ss) {
4445 : 0 : struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4446 : :
4447 : 0 : err = online_css(css);
4448 [ # # ]: 0 : if (err)
4449 : : goto err_destroy;
4450 : :
4451 : : /* each css holds a ref to the cgroup's dentry and parent css */
4452 : : dget(dentry);
4453 : 0 : css_get(css->parent);
4454 : :
4455 : : /* mark it consumed for error path */
4456 : : css_ar[ss->subsys_id] = NULL;
4457 : :
4458 [ # # ][ # # ]: 0 : if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
[ # # ]
4459 : 0 : parent->parent) {
4460 : 0 : pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4461 : : current->comm, current->pid, ss->name);
4462 [ # # ]: 0 : if (!strcmp(ss->name, "memory"))
4463 : 0 : pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4464 : 0 : ss->warned_broken_hierarchy = true;
4465 : : }
4466 : : }
4467 : :
4468 : 2 : idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4469 : :
4470 : 2 : err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4471 [ + - ]: 2 : if (err)
4472 : : goto err_destroy;
4473 : :
4474 : : err = cgroup_populate_dir(cgrp, root->subsys_mask);
4475 : : if (err)
4476 : : goto err_destroy;
4477 : :
4478 : 2 : mutex_unlock(&cgroup_mutex);
4479 : 2 : mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4480 : :
4481 : 2 : return 0;
4482 : :
4483 : : err_free_all:
4484 [ # # ]: 0 : for_each_root_subsys(root, ss) {
4485 : 0 : struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486 : :
4487 [ # # ]: 0 : if (css) {
4488 : 0 : percpu_ref_cancel_init(&css->refcnt);
4489 : 0 : ss->css_free(css);
4490 : : }
4491 : : }
4492 : 0 : mutex_unlock(&cgroup_mutex);
4493 : : /* Release the reference count that we took on the superblock */
4494 : 0 : deactivate_super(sb);
4495 : : err_free_id:
4496 : 0 : idr_remove(&root->cgroup_idr, cgrp->id);
4497 : : err_free_name:
4498 : 0 : kfree(rcu_dereference_raw(cgrp->name));
4499 : : err_free_cgrp:
4500 : 0 : kfree(cgrp);
4501 : 0 : return err;
4502 : :
4503 : : err_destroy:
4504 [ # # ]: 0 : for_each_root_subsys(root, ss) {
4505 : 0 : struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4506 : :
4507 [ # # ]: 0 : if (css) {
4508 : 0 : percpu_ref_cancel_init(&css->refcnt);
4509 : 0 : ss->css_free(css);
4510 : : }
4511 : : }
4512 : 0 : cgroup_destroy_locked(cgrp);
4513 : 0 : mutex_unlock(&cgroup_mutex);
4514 : 0 : mutex_unlock(&dentry->d_inode->i_mutex);
4515 : 0 : return err;
4516 : : }
4517 : :
4518 : : 0 : static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4519 : : { /* Directory-creation entry point: creates a cgroup for @dentry beneath the cgroup of its parent directory and returns whatever cgroup_create() returns. @dir is not used in this body. */
4520 : 2 : struct cgroup *c_parent = dentry->d_parent->d_fsdata; /* cgroup_create() stores each cgroup in its directory dentry's d_fsdata */
4521 : :
4522 : : /* the vfs holds inode->i_mutex already */
4523 : 2 : return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4524 : : }
4525 : :
4526 : : /*
4527 : : * This is called when the refcnt of a css is confirmed to be killed.
4528 : : * css_tryget() is now guaranteed to fail. Runs as the css->destroy_work item queued by css_killed_ref_fn(); @work is that embedded work_struct.
4529 : : */
4530 : : 0 : static void css_killed_work_fn(struct work_struct *work)
4531 : : {
4532 : 0 : struct cgroup_subsys_state *css =
4533 : : container_of(work, struct cgroup_subsys_state, destroy_work);
4534 : 0 : struct cgroup *cgrp = css->cgroup;
4535 : :
4536 : 0 : mutex_lock(&cgroup_mutex);
4537 : :
4538 : : /*
4539 : : * css_tryget() is guaranteed to fail now. Tell subsystems to
4540 : : * initiate destruction.
4541 : : */
4542 : 0 : offline_css(css);
4543 : :
4544 : : /*
4545 : : * If @cgrp is marked dead, it's waiting for refs of all css's to
4546 : : * be disabled before proceeding to the second phase of cgroup
4547 : : * destruction. If we are the last one, kick it off.
4548 : : */
4549 [ # # ][ # # ]: 0 : if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4550 : 0 : cgroup_destroy_css_killed(cgrp);
4551 : :
4552 : 0 : mutex_unlock(&cgroup_mutex);
4553 : :
4554 : : /*
4555 : : * Put the css refs from kill_css(). Each css holds an extra
4556 : : * reference to the cgroup's dentry and cgroup removal proceeds
4557 : : * regardless of css refs. On the last put of each css, whenever
4558 : : * that may be, the extra dentry ref is put so that dentry
4559 : : * destruction happens only after all css's are released.
4560 : : */
4561 : : css_put(css);
4562 : 0 : }
4563 : :
4564 : : /* css kill confirmation processing requires process context, bounce: this is the confirmation callback kill_css() passes to percpu_ref_kill_and_confirm(); it only queues css_killed_work_fn() for the css that embeds @ref on cgroup_destroy_wq */
4565 : : 0 : static void css_killed_ref_fn(struct percpu_ref *ref)
4566 : : {
4567 : : struct cgroup_subsys_state *css =
4568 : : container_of(ref, struct cgroup_subsys_state, refcnt);
4569 : :
4570 : 0 : INIT_WORK(&css->destroy_work, css_killed_work_fn);
4571 : 0 : queue_work(cgroup_destroy_wq, &css->destroy_work);
4572 : 0 : }
4573 : :
4574 : : /**
4575 : : * kill_css - destroy a css
4576 : : * @css: css to destroy
4577 : : *
4578 : : * This function initiates destruction of @css by removing cgroup interface
4579 : : * files and putting its base reference. ->css_offline() will be invoked
4580 : : * asynchronously once css_tryget() is guaranteed to fail and when the
4581 : : * reference count reaches zero, @css will be released. The extra reference taken here is dropped at the end of css_killed_work_fn().
4582 : : */
4583 : : 0 : static void kill_css(struct cgroup_subsys_state *css)
4584 : : {
4585 : : cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); /* mask selects only this css's subsystem */
4586 : :
4587 : : /*
4588 : : * Killing would put the base ref, but we need to keep it alive
4589 : : * until after ->css_offline().
4590 : : */
4591 : : css_get(css);
4592 : :
4593 : : /*
4594 : : * cgroup core guarantees that, by the time ->css_offline() is
4595 : : * invoked, no new css reference will be given out via
4596 : : * css_tryget(). We can't simply call percpu_ref_kill() and
4597 : : * proceed to offlining css's because percpu_ref_kill() doesn't
4598 : : * guarantee that the ref is seen as killed on all CPUs on return.
4599 : : *
4600 : : * Use percpu_ref_kill_and_confirm() to get notifications as each
4601 : : * css is confirmed to be seen as killed on all CPUs.
4602 : : */
4603 : 0 : percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4604 : 0 : }
4605 : :
4606 : : /**
4607 : : * cgroup_destroy_locked - the first stage of cgroup destruction
4608 : : * @cgrp: cgroup to be destroyed
4609 : : *
4610 : : * css's make use of percpu refcnts whose killing latency shouldn't be
4611 : : * exposed to userland and are RCU protected. Also, cgroup core needs to
4612 : : * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4613 : : * invoked. To satisfy all the requirements, destruction is implemented in
4614 : : * the following two steps.
4615 : : *
4616 : : * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4617 : : * userland visible parts and start killing the percpu refcnts of
4618 : : * css's. Set up so that the next stage will be kicked off once all
4619 : : * the percpu refcnts are confirmed to be killed.
4620 : : *
4621 : : * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4622 : : * rest of destruction. Once all cgroup references are gone, the
4623 : : * cgroup is RCU-freed.
4624 : : *
4625 : : * This function implements s1. After this step, @cgrp is gone as far as
4626 : : * the userland is concerned and a new cgroup with the same name may be
4627 : : * created. As cgroup doesn't care about the names internally, this
4628 : : * doesn't cause any problem. The caller must hold both cgroup_mutex and the i_mutex of @cgrp's directory inode. Returns 0 on success, or -EBUSY (before anything is torn down) if @cgrp still has css_sets linked or has a child that is not dead.
4629 : : */
4630 : : 0 : static int cgroup_destroy_locked(struct cgroup *cgrp)
4631 : : __releases(&cgroup_mutex) __acquires(&cgroup_mutex) /* NOTE(review): no unlock/relock of cgroup_mutex is visible in this body - confirm these annotations are still wanted */
4632 : : {
4633 : 2 : struct dentry *d = cgrp->dentry;
4634 : : struct cgroup_event *event, *tmp;
4635 : : struct cgroup_subsys *ss;
4636 : : struct cgroup *child;
4637 : : bool empty;
4638 : :
4639 : : lockdep_assert_held(&d->d_inode->i_mutex);
4640 : : lockdep_assert_held(&cgroup_mutex);
4641 : :
4642 : : /*
4643 : : * css_set_lock synchronizes access to ->cset_links and prevents
4644 : : * @cgrp from being removed while __put_css_set() is in progress.
4645 : : */
4646 : 2 : read_lock(&css_set_lock);
4647 : 2 : empty = list_empty(&cgrp->cset_links);
4648 : : read_unlock(&css_set_lock);
4649 [ + - ]: 2 : if (!empty)
4650 : : return -EBUSY;
4651 : :
4652 : : /*
4653 : : * Make sure there's no live children. We can't test ->children
4654 : : * emptiness as dead children linger on it while being destroyed;
4655 : : * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4656 : : */
4657 : : empty = true;
4658 : : rcu_read_lock();
4659 [ - + ]: 2 : list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4660 : : empty = cgroup_is_dead(child);
4661 [ # # ]: 0 : if (!empty)
4662 : : break;
4663 : : }
4664 : : rcu_read_unlock();
4665 [ + - ]: 2 : if (!empty)
4666 : : return -EBUSY;
4667 : :
4668 : : /*
4669 : : * Initiate massacre of all css's. cgroup_destroy_css_killed()
4670 : : * will be invoked to perform the rest of destruction once the
4671 : : * percpu refs of all css's are confirmed to be killed.
4672 : : */
4673 [ - + ]: 2 : for_each_root_subsys(cgrp->root, ss) {
4674 : : struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4675 : :
4676 [ # # ]: 0 : if (css)
4677 : 0 : kill_css(css);
4678 : : }
4679 : :
4680 : : /*
4681 : : * Mark @cgrp dead. This prevents further task migration and child
4682 : : * creation by disabling cgroup_lock_live_group(). Note that
4683 : : * CGRP_DEAD assertion is depended upon by css_next_child() to
4684 : : * resume iteration after dropping RCU read lock. See
4685 : : * css_next_child() for details.
4686 : : */
4687 : 2 : set_bit(CGRP_DEAD, &cgrp->flags);
4688 : :
4689 : : /* CGRP_DEAD is set, remove from ->release_list for the last time */
4690 : 2 : raw_spin_lock(&release_list_lock);
4691 [ - + ]: 2 : if (!list_empty(&cgrp->release_list))
4692 : : list_del_init(&cgrp->release_list);
4693 : : raw_spin_unlock(&release_list_lock);
4694 : :
4695 : : /*
4696 : : * If @cgrp has css's attached, the second stage of cgroup
4697 : : * destruction is kicked off from css_killed_work_fn() after the
4698 : : * refs of all attached css's are killed. If @cgrp doesn't have
4699 : : * any css, we kick it off here.
4700 : : */
4701 [ + - ]: 2 : if (!cgrp->nr_css)
4702 : 2 : cgroup_destroy_css_killed(cgrp);
4703 : :
4704 : : /*
4705 : : * Clear the base files and remove @cgrp directory. The removal
4706 : : * puts the base ref but we aren't quite done with @cgrp yet, so
4707 : : * hold onto it.
4708 : : */
4709 : 2 : cgroup_addrm_files(cgrp, cgroup_base_files, false);
4710 : : dget(d);
4711 : 2 : cgroup_d_remove_dir(d);
4712 : :
4713 : : /*
4714 : : * Unregister events and notify userspace.
4715 : : * Notify userspace about cgroup removing only after rmdir of cgroup
4716 : : * directory to avoid race between userspace and kernelspace.
4717 : : */
4718 : : spin_lock(&cgrp->event_list_lock);
4719 [ - + ]: 2 : list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4720 : : list_del_init(&event->list);
4721 : 0 : schedule_work(&event->remove);
4722 : : }
4723 : : spin_unlock(&cgrp->event_list_lock);
4724 : :
4725 : 2 : return 0;
4726 : : }
4727 : :
4728 : : /**
4729 : : * cgroup_destroy_css_killed - the second step of cgroup destruction
4730 : : * @cgrp: the cgroup whose destruction is being completed
4731 : : *
4732 : : * This function is invoked with cgroup_mutex held, either from
4733 : : * css_killed_work_fn() once a dead cgroup has no css left, or directly
4734 : : * from cgroup_destroy_locked() when @cgrp has no css at all. This is the
4735 : : * second step of destruction described in the comment above cgroup_destroy_locked().
4736 : : */
4737 : : 0 : static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4738 : : {
4739 : 2 : struct cgroup *parent = cgrp->parent;
4740 : 2 : struct dentry *d = cgrp->dentry;
4741 : :
4742 : : lockdep_assert_held(&cgroup_mutex);
4743 : :
4744 : : /* delete this cgroup from parent->children */
4745 : : list_del_rcu(&cgrp->sibling);
4746 : :
4747 : 2 : dput(d); /* NOTE(review): presumably pairs with the dget(d) in cgroup_destroy_locked() - confirm */
4748 : :
4749 : 2 : set_bit(CGRP_RELEASABLE, &parent->flags); /* the parent just lost a child, so let it be evaluated for release */
4750 : 2 : check_for_release(parent);
4751 : 2 : }
4752 : :
4753 : : 0 : static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4754 : : { /* Directory-removal entry point: starts destruction of the cgroup stored in @dentry->d_fsdata under cgroup_mutex and returns cgroup_destroy_locked()'s result (0 or -EBUSY). */
4755 : : int ret;
4756 : :
4757 : 2 : mutex_lock(&cgroup_mutex);
4758 : 2 : ret = cgroup_destroy_locked(dentry->d_fsdata); /* it asserts that the directory's i_mutex is held as well - NOTE(review): presumably taken by the VFS before this is called */
4759 : 2 : mutex_unlock(&cgroup_mutex);
4760 : :
4761 : 2 : return ret;
4762 : : }
4763 : :
4764 : : static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4765 : : { /* Initialise @ss->cftsets and, when @ss provides base_cftypes, register them as the embedded base_cftset. */
4766 : 0 : INIT_LIST_HEAD(&ss->cftsets);
4767 : :
4768 : : /*
4769 : : * base_cftset is embedded in subsys itself, no need to worry about
4770 : : * deregistration.
4771 : : */
4772 [ # # ]: 0 : if (ss->base_cftypes) {
4773 : : struct cftype *cft;
4774 : :
4775 [ # # ]: 0 : for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) /* the array ends at the first entry with an empty name */
4776 : 0 : cft->ss = ss;
4777 : :
4778 : 0 : ss->base_cftset.cfts = ss->base_cftypes;
4779 : 0 : list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4780 : : }
4781 : : }
4782 : :
4783 : : static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4784 : : { /* Boot-time setup of a built-in subsystem @ss: attach it to the dummy root, allocate and online its top css, and hook that css into init_css_set. Any failure here is fatal (BUG_ON). */
4785 : : struct cgroup_subsys_state *css;
4786 : :
4787 : : printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4788 : :
4789 : : mutex_lock(&cgroup_mutex);
4790 : :
4791 : : /* init base cftset */
4792 : : cgroup_init_cftsets(ss);
4793 : :
4794 : : /* Create the top cgroup state for this subsystem */
4795 : : list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4796 : : ss->root = &cgroup_dummy_root;
4797 : : css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4798 : : /* We don't handle early failures gracefully */
4799 : : BUG_ON(IS_ERR(css));
4800 : : init_css(css, ss, cgroup_dummy_top);
4801 : :
4802 : : /* Update the init_css_set to contain a subsys
4803 : : * pointer to this state - since the subsystem is
4804 : : * newly registered, all tasks and hence the
4805 : : * init_css_set is in the subsystem's top cgroup. */
4806 : : init_css_set.subsys[ss->subsys_id] = css;
4807 : :
4808 : : need_forkexit_callback |= ss->fork || ss->exit; /* read by cgroup_post_fork() before it walks the ->fork() callbacks */
4809 : :
4810 : : /* At system boot, before all subsystems have been
4811 : : * registered, no tasks have been forked, so we don't
4812 : : * need to invoke fork callbacks here. */
4813 : : BUG_ON(!list_empty(&init_task.tasks));
4814 : :
4815 : : BUG_ON(online_css(css));
4816 : :
4817 : : mutex_unlock(&cgroup_mutex);
4818 : :
4819 : : /* this function shouldn't be used with modular subsystems, since they
4820 : : * need to register a subsys_id, among other things */
4821 : : BUG_ON(ss->module);
4822 : : }
4823 : :
4824 : : /**
4825 : : * cgroup_load_subsys: load and register a modular subsystem at runtime
4826 : : * @ss: the subsystem to load
4827 : : *
4828 : : * This function should be called in a modular subsystem's initcall. If the
4829 : : * subsystem is built as a module, it will be assigned a new subsys_id and set
4830 : : * up for use. If the subsystem is built-in anyway, work is delegated to the
4831 : : * simpler cgroup_init_subsys. Returns 0 on success (including the built-in no-op case); -EINVAL if the name is missing or too long, css_alloc/css_free is missing, or fork/exit callbacks are set; the error from ->css_alloc(); or the error from online_css(), after the subsystem has been unloaded again.
4832 : : */
4833 : : 0 : int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4834 : : {
4835 : : struct cgroup_subsys_state *css;
4836 : : int i, ret;
4837 : : struct hlist_node *tmp;
4838 : : struct css_set *cset;
4839 : : unsigned long key;
4840 : :
4841 : : /* check name and function validity */
4842 [ # # ][ # # ]: 0 : if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
[ # # ]
4843 [ # # ]: 0 : ss->css_alloc == NULL || ss->css_free == NULL)
4844 : : return -EINVAL;
4845 : :
4846 : : /*
4847 : : * we don't support callbacks in modular subsystems. this check is
4848 : : * before the ss->module check for consistency; a subsystem that could
4849 : : * be a module should still have no callbacks even if the user isn't
4850 : : * compiling it as one.
4851 : : */
4852 [ # # ][ # # ]: 0 : if (ss->fork || ss->exit)
4853 : : return -EINVAL;
4854 : :
4855 : : /*
4856 : : * an optionally modular subsystem is built-in: we want to do nothing,
4857 : : * since cgroup_init_subsys will have already taken care of it.
4858 : : */
4859 [ # # ]: 0 : if (ss->module == NULL) {
4860 : : /* a sanity check */
4861 [ # # ]: 0 : BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4862 : : return 0;
4863 : : }
4864 : :
4865 : : /* init base cftset */
4866 : : cgroup_init_cftsets(ss);
4867 : :
4868 : 0 : mutex_lock(&cgroup_mutex);
4869 : 0 : cgroup_subsys[ss->subsys_id] = ss;
4870 : :
4871 : : /*
4872 : : * no ss->css_alloc seems to need anything important in the ss
4873 : : * struct, so this can happen first (i.e. before the dummy root
4874 : : * attachment).
4875 : : */
4876 : 0 : css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4877 [ # # ]: 0 : if (IS_ERR(css)) {
4878 : : /* failure case - need to deassign the cgroup_subsys[] slot. */
4879 : 0 : cgroup_subsys[ss->subsys_id] = NULL;
4880 : 0 : mutex_unlock(&cgroup_mutex);
4881 : 0 : return PTR_ERR(css);
4882 : : }
4883 : :
4884 : 0 : list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4885 : 0 : ss->root = &cgroup_dummy_root;
4886 : :
4887 : : /* our new subsystem will be attached to the dummy hierarchy. */
4888 : 0 : init_css(css, ss, cgroup_dummy_top);
4889 : :
4890 : : /*
4891 : : * Now we need to entangle the css into the existing css_sets. unlike
4892 : : * in cgroup_init_subsys, there are now multiple css_sets, so each one
4893 : : * will need a new pointer to it; done by iterating the css_set_table.
4894 : : * furthermore, modifying the existing css_sets will corrupt the hash
4895 : : * table state, so each changed css_set will need its hash recomputed.
4896 : : * this is all done under the css_set_lock.
4897 : : */
4898 : 0 : write_lock(&css_set_lock);
4899 [ # # ][ # # ]: 0 : hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
[ # # ][ # # ]
4900 : : /* skip entries that we already rehashed */
4901 [ # # ]: 0 : if (cset->subsys[ss->subsys_id])
4902 : 0 : continue;
4903 : : /* remove existing entry */
4904 : : hash_del(&cset->hlist);
4905 : : /* set new value */
4906 : 0 : cset->subsys[ss->subsys_id] = css;
4907 : : /* recompute hash and restore entry */
4908 : : key = css_set_hash(cset->subsys);
4909 : 0 : hash_add(css_set_table, &cset->hlist, key);
4910 : : }
4911 : : write_unlock(&css_set_lock);
4912 : :
4913 : 0 : ret = online_css(css);
4914 [ # # ]: 0 : if (ret)
4915 : : goto err_unload;
4916 : :
4917 : : /* success! */
4918 : 0 : mutex_unlock(&cgroup_mutex);
4919 : 0 : return 0;
4920 : :
4921 : : err_unload:
4922 : 0 : mutex_unlock(&cgroup_mutex); /* cgroup_unload_subsys() takes cgroup_mutex itself, so drop it first */
4923 : : /* @ss can't be mounted here as try_module_get() would fail */
4924 : 0 : cgroup_unload_subsys(ss);
4925 : 0 : return ret;
4926 : : }
4927 : : EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4928 : :
4929 : : /**
4930 : : * cgroup_unload_subsys: unload a modular subsystem
4931 : : * @ss: the subsystem to unload
4932 : : *
4933 : : * This function should be called in a modular subsystem's exitcall. When this
4934 : : * function is invoked, the refcount on the subsystem's module will be 0, so
4935 : : * the subsystem will not be attached to any hierarchy. It is also used by cgroup_load_subsys() to back out when online_css() fails. BUGs if @ss is not modular or is not on the dummy root.
4936 : : */
4937 : : 0 : void cgroup_unload_subsys(struct cgroup_subsys *ss)
4938 : : {
4939 : : struct cgrp_cset_link *link;
4940 : :
4941 [ # # ]: 0 : BUG_ON(ss->module == NULL);
4942 : :
4943 : : /*
4944 : : * we shouldn't be called if the subsystem is in use, and the use of
4945 : : * try_module_get() in rebind_subsystems() should ensure that it
4946 : : * doesn't start being used while we're killing it off.
4947 : : */
4948 [ # # ]: 0 : BUG_ON(ss->root != &cgroup_dummy_root);
4949 : :
4950 : 0 : mutex_lock(&cgroup_mutex);
4951 : :
4952 : 0 : offline_css(cgroup_css(cgroup_dummy_top, ss));
4953 : :
4954 : : /* deassign the subsys_id */
4955 : 0 : cgroup_subsys[ss->subsys_id] = NULL;
4956 : :
4957 : : /* remove subsystem from the dummy root's list of subsystems */
4958 : 0 : list_del_init(&ss->sibling);
4959 : :
4960 : : /*
4961 : : * disentangle the css from all css_sets attached to the dummy
4962 : : * top. as in loading, we need to pay our respects to the hashtable
4963 : : * gods.
4964 : : */
4965 : 0 : write_lock(&css_set_lock);
4966 [ # # ]: 0 : list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4967 : 0 : struct css_set *cset = link->cset;
4968 : : unsigned long key;
4969 : :
4970 : : hash_del(&cset->hlist);
4971 : 0 : cset->subsys[ss->subsys_id] = NULL;
4972 : : key = css_set_hash(cset->subsys); /* the hash covers subsys[], so re-add under the recomputed key */
4973 : 0 : hash_add(css_set_table, &cset->hlist, key);
4974 : : }
4975 : : write_unlock(&css_set_lock);
4976 : :
4977 : : /*
4978 : : * remove subsystem's css from the cgroup_dummy_top and free it -
4979 : : * need to free before marking as null because ss->css_free needs
4980 : : * the cgrp->subsys pointer to find their state.
4981 : : */
4982 : 0 : ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4983 : 0 : RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4984 : :
4985 : 0 : mutex_unlock(&cgroup_mutex);
4986 : 0 : }
4987 : : EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4988 : :
4989 : : /**
4990 : : * cgroup_init_early - cgroup initialization at system boot
4991 : : *
4992 : : * Initialize cgroups at system boot, and initialize any
4993 : : * subsystems that request early init. Always returns 0; an inconsistent built-in subsystem definition triggers BUG().
4994 : : */
4995 : : 0 : int __init cgroup_init_early(void)
4996 : : {
4997 : : struct cgroup_subsys *ss;
4998 : : int i;
4999 : :
5000 : 0 : atomic_set(&init_css_set.refcount, 1);
5001 : : INIT_LIST_HEAD(&init_css_set.cgrp_links);
5002 : : INIT_LIST_HEAD(&init_css_set.tasks);
5003 : : INIT_HLIST_NODE(&init_css_set.hlist);
5004 : 0 : css_set_count = 1;
5005 : 0 : init_cgroup_root(&cgroup_dummy_root);
5006 : 0 : cgroup_root_count = 1;
5007 : 0 : RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5008 : :
5009 : 0 : init_cgrp_cset_link.cset = &init_css_set; /* link init_css_set and the dummy top cgroup to each other */
5010 : 0 : init_cgrp_cset_link.cgrp = cgroup_dummy_top;
5011 : : list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
5012 : : list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
5013 : :
5014 : : /* at bootup time, we don't worry about modular subsystems */
5015 : : for_each_builtin_subsys(ss, i) {
5016 : : BUG_ON(!ss->name);
5017 : : BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
5018 : : BUG_ON(!ss->css_alloc);
5019 : : BUG_ON(!ss->css_free);
5020 : : if (ss->subsys_id != i) { /* a built-in subsystem's id must equal its index in the iteration */
5021 : : printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
5022 : : ss->name, ss->subsys_id);
5023 : : BUG();
5024 : : }
5025 : :
5026 : : if (ss->early_init)
5027 : : cgroup_init_subsys(ss);
5028 : : }
5029 : 0 : return 0;
5030 : : }
5031 : :
5032 : : /**
5033 : : * cgroup_init - cgroup initialization
5034 : : *
5035 : : * Register cgroup filesystem and /proc file, and initialize
5036 : : * any subsystems that didn't request early init. Returns 0 on success, the bdi_init() error, -ENOMEM if the "cgroup" kobject cannot be created, or the register_filesystem() error; in the latter two cases the backing_dev_info is destroyed again.
5037 : : */
5038 : : 0 : int __init cgroup_init(void)
5039 : : {
5040 : : struct cgroup_subsys *ss;
5041 : : unsigned long key;
5042 : : int i, err;
5043 : :
5044 : 0 : err = bdi_init(&cgroup_backing_dev_info);
5045 [ # # ]: 0 : if (err)
5046 : : return err;
5047 : :
5048 : : for_each_builtin_subsys(ss, i) {
5049 : : if (!ss->early_init)
5050 : : cgroup_init_subsys(ss);
5051 : : }
5052 : :
5053 : : /* allocate id for the dummy hierarchy */
5054 : 0 : mutex_lock(&cgroup_mutex);
5055 : 0 : mutex_lock(&cgroup_root_mutex);
5056 : :
5057 : : /* Add init_css_set to the hash table */
5058 : : key = css_set_hash(init_css_set.subsys);
5059 : : hash_add(css_set_table, &init_css_set.hlist, key);
5060 : :
5061 [ # # ]: 0 : BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
5062 : :
5063 : 0 : err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5064 : : 0, 1, GFP_KERNEL);
5065 [ # # ]: 0 : BUG_ON(err < 0);
5066 : :
5067 : 0 : mutex_unlock(&cgroup_root_mutex);
5068 : 0 : mutex_unlock(&cgroup_mutex);
5069 : :
5070 : 0 : cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
5071 [ # # ]: 0 : if (!cgroup_kobj) {
5072 : : err = -ENOMEM;
5073 : : goto out;
5074 : : }
5075 : :
5076 : 0 : err = register_filesystem(&cgroup_fs_type);
5077 [ # # ]: 0 : if (err < 0) {
5078 : 0 : kobject_put(cgroup_kobj);
5079 : 0 : goto out;
5080 : : }
5081 : :
5082 : : proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); /* return value is not checked; a failure here is ignored */
5083 : :
5084 : : out:
5085 [ # # ]: 0 : if (err)
5086 : 0 : bdi_destroy(&cgroup_backing_dev_info);
5087 : :
5088 : 0 : return err;
5089 : : }
5090 : :
5091 : : 0 : static int __init cgroup_wq_init(void)
5092 : : { /* Allocates cgroup_destroy_wq, the workqueue css_killed_ref_fn() queues destruction work on. Registered with core_initcall(); returns 0 and BUGs if the allocation fails. */
5093 : : /*
5094 : : * There isn't much point in executing destruction path in
5095 : : * parallel. Good chunk is serialized with cgroup_mutex anyway.
5096 : : * Use 1 for @max_active.
5097 : : *
5098 : : * We would prefer to do this in cgroup_init() above, but that
5099 : : * is called before init_workqueues(): so leave this until after.
5100 : : */
5101 : 0 : cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5102 [ # # ]: 0 : BUG_ON(!cgroup_destroy_wq);
5103 : 0 : return 0;
5104 : : }
5105 : : core_initcall(cgroup_wq_init);
5106 : :
5107 : : /*
5108 : : * proc_cgroup_show()
5109 : : * - Print task's cgroup paths into seq_file, one line for each hierarchy
5110 : : * - Used for /proc/<pid>/cgroup.
5111 : : * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
5112 : : * doesn't really matter if tsk->cgroup changes after we read it,
5113 : : * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
5114 : : * anyway. No need to check that tsk->cgroup != NULL, thanks to
5115 : : * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
5116 : : * cgroup to top_cgroup.
5117 : : * - Each line is "<hierarchy_id>:<subsys>[,<subsys>...][,name=<name>]:<path>". Returns 0 on success, -ENOMEM if the path buffer cannot be allocated, -ESRCH if the pid in m->private no longer maps to a task, or the negative error from cgroup_path().
5118 : : */
5119 : : /* TODO: Use a proper seq_file iterator */
5120 : : 0 : int proc_cgroup_show(struct seq_file *m, void *v)
5121 : : {
5122 : : struct pid *pid;
5123 : : struct task_struct *tsk;
5124 : : char *buf;
5125 : : int retval;
5126 : : struct cgroupfs_root *root;
5127 : :
5128 : : retval = -ENOMEM;
5129 : : buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
5130 [ + - ]: 2 : if (!buf)
5131 : : goto out;
5132 : :
5133 : : retval = -ESRCH;
5134 : 2 : pid = m->private;
5135 : 2 : tsk = get_pid_task(pid, PIDTYPE_PID); /* reference is dropped with put_task_struct() at out_unlock */
5136 [ + - ]: 2 : if (!tsk)
5137 : : goto out_free;
5138 : :
5139 : : retval = 0;
5140 : :
5141 : 2 : mutex_lock(&cgroup_mutex);
5142 : :
5143 [ - + ]: 2 : for_each_active_root(root) {
5144 : : struct cgroup_subsys *ss;
5145 : : struct cgroup *cgrp;
5146 : : int count = 0; /* number of subsystem names printed so far; decides whether a "," separator is needed */
5147 : :
5148 : 0 : seq_printf(m, "%d:", root->hierarchy_id);
5149 [ # # ]: 0 : for_each_root_subsys(root, ss)
5150 [ # # ]: 0 : seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5151 [ # # ]: 0 : if (strlen(root->name))
5152 [ # # ]: 0 : seq_printf(m, "%sname=%s", count ? "," : "",
5153 : 0 : root->name);
5154 : 0 : seq_putc(m, ':');
5155 : 0 : cgrp = task_cgroup_from_root(tsk, root);
5156 : 0 : retval = cgroup_path(cgrp, buf, PAGE_SIZE);
5157 [ # # ]: 0 : if (retval < 0)
5158 : : goto out_unlock;
5159 : 0 : seq_puts(m, buf);
5160 : 0 : seq_putc(m, '\n');
5161 : : }
5162 : :
5163 : : out_unlock:
5164 : 2 : mutex_unlock(&cgroup_mutex);
5165 : : put_task_struct(tsk);
5166 : : out_free:
5167 : 2 : kfree(buf);
5168 : : out:
5169 : 2 : return retval;
5170 : : }
5171 : :
5172 : : /* Display information about each subsystem and each hierarchy: a header line, then one tab-separated line per subsystem giving its name, hierarchy id, number of cgroups in that hierarchy and whether it is enabled. Always returns 0; @v is not used. */
5173 : : 0 : static int proc_cgroupstats_show(struct seq_file *m, void *v)
5174 : : {
5175 : : struct cgroup_subsys *ss;
5176 : : int i;
5177 : :
5178 : 56 : seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
5179 : : /*
5180 : : * ideally we don't want subsystems moving around while we do this.
5181 : : * cgroup_mutex is also necessary to guarantee an atomic snapshot of
5182 : : * subsys/hierarchy state.
5183 : : */
5184 : 56 : mutex_lock(&cgroup_mutex);
5185 : :
5186 : : for_each_subsys(ss, i)
5187 : : seq_printf(m, "%s\t%d\t%d\t%d\n",
5188 : : ss->name, ss->root->hierarchy_id,
5189 : : ss->root->number_of_cgroups, !ss->disabled);
5190 : :
5191 : 56 : mutex_unlock(&cgroup_mutex);
5192 : 56 : return 0;
5193 : : }
5194 : :
5195 : : 0 : static int cgroupstats_open(struct inode *inode, struct file *file)
5196 : : { /* open handler of proc_cgroupstats_operations: binds proc_cgroupstats_show() to @file with no private data and returns single_open()'s result. @inode is not used. */
5197 : 56 : return single_open(file, proc_cgroupstats_show, NULL);
5198 : : }
5199 : :
5200 : : static const struct file_operations proc_cgroupstats_operations = { /* file operations for the "cgroups" proc entry created in cgroup_init() */
5201 : : .open = cgroupstats_open,
5202 : : .read = seq_read,
5203 : : .llseek = seq_lseek,
5204 : : .release = single_release, /* counterpart of the single_open() done in cgroupstats_open() */
5205 : : };
5206 : :
5207 : : /**
5208 : : * cgroup_fork - attach newly forked task to its parents cgroup.
5209 : : * @child: pointer to task_struct of the newly forked child task.
5210 : : *
5211 : : * Description: A task inherits its parent's cgroup at fork().
5212 : : *
5213 : : * A pointer to the shared css_set was automatically copied in
5214 : : * fork.c by dup_task_struct(). However, we ignore that copy, since
5215 : : * it was not made under the protection of RCU or cgroup_mutex, so
5216 : : * might no longer be a valid cgroup pointer. cgroup_attach_task() might
5217 : : * have already changed current->cgroups, allowing the previously
5218 : : * referenced cgroup group to be removed and freed.
5219 : : *
5220 : : * At the point that cgroup_fork() is called, 'current' is the parent
5221 : : * task, and the passed argument 'child' points to the child task.
5222 : : */
5223 : : 0 : void cgroup_fork(struct task_struct *child)
5224 : : {
5225 : 1122967 : task_lock(current);
5226 : 1122974 : get_css_set(task_css_set(current)); /* take a reference on the parent's css_set for the child */
5227 : 1122984 : child->cgroups = current->cgroups;
5228 : 1122984 : task_unlock(current);
5229 : 1122983 : INIT_LIST_HEAD(&child->cg_list); /* left empty here; cgroup_post_fork() links it onto the css_set's task list when needed */
5230 : 1122983 : }
5231 : :
5232 : : /**
5233 : : * cgroup_post_fork - called on a new task after adding it to the task list
5234 : : * @child: the task in question
5235 : : *
5236 : : * Adds the task to the list running through its css_set if necessary and
5237 : : * call the subsystem fork() callbacks. Has to be after the task is
5238 : : * visible on the task list in case we race with the first call to
5239 : : * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5240 : : * list.
5241 : : */
5242 : : 0 : void cgroup_post_fork(struct task_struct *child)
5243 : : {
5244 : : struct cgroup_subsys *ss;
5245 : : int i;
5246 : :
5247 : : /*
5248 : : * use_task_css_set_links is set to 1 before we walk the tasklist
5249 : : * under the tasklist_lock and we read it here after we added the child
5250 : : * to the tasklist under the tasklist_lock as well. If the child wasn't
5251 : : * yet in the tasklist when we walked through it from
5252 : : * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
5253 : : * should be visible now due to the paired locking and barriers implied
5254 : : * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
5255 : : * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
5256 : : * lock on fork.
5257 : : */
5258 [ + + ]: 1122972 : if (use_task_css_set_links) {
5259 : 14720 : write_lock(&css_set_lock);
5260 : : task_lock(child);
5261 [ + - ]: 14720 : if (list_empty(&child->cg_list)) /* only link the child if it is not on a css_set task list yet */
5262 : 14720 : list_add(&child->cg_list, &task_css_set(child)->tasks);
5263 : : task_unlock(child);
5264 : : write_unlock(&css_set_lock);
5265 : : }
5266 : :
5267 : : /*
5268 : : * Call ss->fork(). This must happen after @child is linked on
5269 : : * css_set; otherwise, @child might change state between ->fork()
5270 : : * and addition to css_set.
5271 : : */
5272 : : if (need_forkexit_callback) { /* set by cgroup_init_subsys() when a subsystem defines ->fork or ->exit */
5273 : : /*
5274 : : * fork/exit callbacks are supported only for builtin
5275 : : * subsystems, and the builtin section of the subsys
5276 : : * array is immutable, so we don't need to lock the
5277 : : * subsys array here. On the other hand, modular section
5278 : : * of the array can be freed at module unload, so we
5279 : : * can't touch that.
5280 : : */
5281 : : for_each_builtin_subsys(ss, i)
5282 : : if (ss->fork)
5283 : : ss->fork(child);
5284 : : }
5285 : 1122972 : }
5286 : :
5287 : : /**
5288 : : * cgroup_exit - detach cgroup from exiting task
5289 : : * @tsk: pointer to task_struct of exiting process
5290 : : * @run_callbacks: run exit callbacks?
5291 : : *
5292 : : * Description: Detach cgroup from @tsk and release it.
5293 : : *
5294 : : * Note that cgroups marked notify_on_release force every task in
5295 : : * them to take the global cgroup_mutex mutex when exiting.
5296 : : * This could impact scaling on very large systems. Be reluctant to
5297 : : * use notify_on_release cgroups where very high task exit scaling
5298 : : * is required on large systems.
5299 : : *
5300 : : * the_top_cgroup_hack:
5301 : : *
5302 : : * Set the exiting task's cgroup to the root cgroup (top_cgroup).
5303 : : *
5304 : : * We call cgroup_exit() while the task is still competent to
5305 : : * handle notify_on_release(), then leave the task attached to the
5306 : : * root cgroup in each hierarchy for the remainder of its exit.
5307 : : *
5308 : : * To do this properly, we would increment the reference count on
5309 : : * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5310 : : * code we would add a second cgroup function call, to drop that
5311 : : * reference. This would just create an unnecessary hot spot on
5312 : : * the top_cgroup reference count, to no avail.
5313 : : *
5314 : : * Normally, holding a reference to a cgroup without bumping its
5315 : : * count is unsafe. The cgroup could go away, or someone could
5316 : : * attach us to a different cgroup, decrementing the count on
5317 : : * the first cgroup that we never incremented. But in this case,
5318 : : * top_cgroup isn't going away, and either task has PF_EXITING set,
5319 : : * which wards off any cgroup_attach_task() attempts, or task is a failed
5320 : : * fork, never visible to cgroup_attach_task.
5321 : : */
5322 : 0 : void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5323 : : {
5324 : : struct cgroup_subsys *ss;
5325 : : struct css_set *cset;
5326 : : int i;
5327 : :
5328 : : /*
5329 : : * Unlink from the css_set task list if necessary.
5330 : : * Optimistically check cg_list before taking
5331 : : * css_set_lock
5332 : : */
5333 [ + + ]: 1122968 : if (!list_empty(&tsk->cg_list)) {
5334 : 14726 : write_lock(&css_set_lock);
5335 [ + - ]: 14726 : if (!list_empty(&tsk->cg_list))
5336 : : list_del_init(&tsk->cg_list);
5337 : : write_unlock(&css_set_lock);
5338 : : }
5339 : :
5340 : : /* Reassign the task to the init_css_set. */
5341 : : task_lock(tsk);
5342 : : cset = task_css_set(tsk);
5343 : 1122948 : RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5344 : :
5345 : : if (run_callbacks && need_forkexit_callback) {
5346 : : /*
5347 : : * fork/exit callbacks are supported only for builtin
5348 : : * subsystems, see cgroup_post_fork() for details.
5349 : : */
5350 : : for_each_builtin_subsys(ss, i) {
5351 : : if (ss->exit) {
5352 : : struct cgroup_subsys_state *old_css = cset->subsys[i];
5353 : : struct cgroup_subsys_state *css = task_css(tsk, i);
5354 : :
5355 : : ss->exit(css, old_css, tsk);
5356 : : }
5357 : : }
5358 : : }
5359 : : task_unlock(tsk);
5360 : :
5361 : : put_css_set_taskexit(cset);
5362 : 1122983 : }
5363 : :
5364 : 0 : static void check_for_release(struct cgroup *cgrp)
5365 : : {
5366 [ + - ][ - + ]: 2 : if (cgroup_is_releasable(cgrp) &&
5367 [ # # ]: 0 : list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
5368 : : /*
5369 : : * Control Group is currently removeable. If it's not
5370 : : * already queued for a userspace notification, queue
5371 : : * it now
5372 : : */
5373 : : int need_schedule_work = 0;
5374 : :
5375 : 0 : raw_spin_lock(&release_list_lock);
5376 [ # # ][ # # ]: 0 : if (!cgroup_is_dead(cgrp) &&
5377 : 0 : list_empty(&cgrp->release_list)) {
5378 : : list_add(&cgrp->release_list, &release_list);
5379 : : need_schedule_work = 1;
5380 : : }
5381 : : raw_spin_unlock(&release_list_lock);
5382 [ # # ]: 0 : if (need_schedule_work)
5383 : : schedule_work(&release_agent_work);
5384 : : }
5385 : 2 : }
5386 : :
5387 : : /*
5388 : : * Notify userspace when a cgroup is released, by running the
5389 : : * configured release agent with the name of the cgroup (path
5390 : : * relative to the root of cgroup file system) as the argument.
5391 : : *
5392 : : * Most likely, this user command will try to rmdir this cgroup.
5393 : : *
5394 : : * This races with the possibility that some other task will be
5395 : : * attached to this cgroup before it is removed, or that some other
5396 : : * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
5397 : : * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5398 : : * unused, and this cgroup will be reprieved from its death sentence,
5399 : : * to continue to serve a useful existence. Next time it's released,
5400 : : * we will get notified again, if it still has 'notify_on_release' set.
5401 : : *
5402 : : * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5403 : : * means only wait until the task is successfully execve()'d. The
5404 : : * separate release agent task is forked by call_usermodehelper(),
5405 : : * then control in this thread returns here, without waiting for the
5406 : : * release agent task. We don't bother to wait because the caller of
5407 : : * this routine has no use for the exit status of the release agent
5408 : : * task, so no sense holding our caller up for that.
5409 : : */
5410 : 0 : static void cgroup_release_agent(struct work_struct *work)
5411 : : {
5412 [ # # ]: 0 : BUG_ON(work != &release_agent_work);
5413 : 0 : mutex_lock(&cgroup_mutex);
5414 : 0 : raw_spin_lock(&release_list_lock);
5415 [ # # ]: 0 : while (!list_empty(&release_list)) {
5416 : : char *argv[3], *envp[3];
5417 : : int i;
5418 : : char *pathbuf = NULL, *agentbuf = NULL;
5419 : 0 : struct cgroup *cgrp = list_entry(release_list.next,
5420 : : struct cgroup,
5421 : : release_list);
5422 : 0 : list_del_init(&cgrp->release_list);
5423 : : raw_spin_unlock(&release_list_lock);
5424 : : pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
5425 [ # # ]: 0 : if (!pathbuf)
5426 : : goto continue_free;
5427 [ # # ]: 0 : if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
5428 : : goto continue_free;
5429 : 0 : agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5430 [ # # ]: 0 : if (!agentbuf)
5431 : : goto continue_free;
5432 : :
5433 : : i = 0;
5434 : 0 : argv[i++] = agentbuf;
5435 : 0 : argv[i++] = pathbuf;
5436 : 0 : argv[i] = NULL;
5437 : :
5438 : : i = 0;
5439 : : /* minimal command environment */
5440 : 0 : envp[i++] = "HOME=/";
5441 : 0 : envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5442 : 0 : envp[i] = NULL;
5443 : :
5444 : : /* Drop the lock while we invoke the usermode helper,
5445 : : * since the exec could involve hitting disk and hence
5446 : : * be a slow process */
5447 : 0 : mutex_unlock(&cgroup_mutex);
5448 : 0 : call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5449 : 0 : mutex_lock(&cgroup_mutex);
5450 : : continue_free:
5451 : 0 : kfree(pathbuf);
5452 : 0 : kfree(agentbuf);
5453 : 0 : raw_spin_lock(&release_list_lock);
5454 : : }
5455 : : raw_spin_unlock(&release_list_lock);
5456 : 0 : mutex_unlock(&cgroup_mutex);
5457 : 0 : }
5458 : :
5459 : 0 : static int __init cgroup_disable(char *str)
5460 : : {
5461 : : struct cgroup_subsys *ss;
5462 : : char *token;
5463 : : int i;
5464 : :
5465 [ # # ]: 0 : while ((token = strsep(&str, ",")) != NULL) {
5466 : : if (!*token)
5467 : : continue;
5468 : :
5469 : : /*
5470 : : * cgroup_disable, being at boot time, can't know about
5471 : : * module subsystems, so we don't worry about them.
5472 : : */
5473 : : for_each_builtin_subsys(ss, i) {
5474 : : if (!strcmp(token, ss->name)) {
5475 : : ss->disabled = 1;
5476 : : printk(KERN_INFO "Disabling %s control group"
5477 : : " subsystem\n", ss->name);
5478 : : break;
5479 : : }
5480 : : }
5481 : : }
5482 : 0 : return 1;
5483 : : }
5484 : : __setup("cgroup_disable=", cgroup_disable);
5485 : :
5486 : : /**
5487 : : * css_from_dir - get corresponding css from the dentry of a cgroup dir
5488 : : * @dentry: directory dentry of interest
5489 : : * @ss: subsystem of interest
5490 : : *
5491 : : * Must be called under RCU read lock. The caller is responsible for
5492 : : * pinning the returned css if it needs to be accessed outside the RCU
5493 : : * critical section.
5494 : : */
5495 : 0 : struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5496 : : struct cgroup_subsys *ss)
5497 : : {
5498 : : struct cgroup *cgrp;
5499 : :
5500 : : WARN_ON_ONCE(!rcu_read_lock_held());
5501 : :
5502 : : /* is @dentry a cgroup dir? */
5503 [ # # ][ # # ]: 0 : if (!dentry->d_inode ||
[ # # ][ # # ]
5504 : 0 : dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5505 : : return ERR_PTR(-EBADF);
5506 : :
5507 : : cgrp = __d_cgrp(dentry);
5508 [ # # ][ # # ]: 0 : return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5509 : : }
5510 : :
5511 : : /**
5512 : : * css_from_id - lookup css by id
5513 : : * @id: the cgroup id
5514 : : * @ss: cgroup subsys to be looked into
5515 : : *
5516 : : * Returns the css if there's valid one with @id, otherwise returns NULL.
5517 : : * Should be called under rcu_read_lock().
5518 : : */
5519 : 0 : struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5520 : : {
5521 : : struct cgroup *cgrp;
5522 : :
5523 : : rcu_lockdep_assert(rcu_read_lock_held() ||
5524 : : lockdep_is_held(&cgroup_mutex),
5525 : : "css_from_id() needs proper protection");
5526 : :
5527 : 0 : cgrp = idr_find(&ss->root->cgroup_idr, id);
5528 [ # # ]: 0 : if (cgrp)
5529 : 0 : return cgroup_css(cgrp, ss);
5530 : : return NULL;
5531 : : }
5532 : :
5533 : : #ifdef CONFIG_CGROUP_DEBUG
5534 : : static struct cgroup_subsys_state *
5535 : : debug_css_alloc(struct cgroup_subsys_state *parent_css)
5536 : : {
5537 : : struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5538 : :
5539 : : if (!css)
5540 : : return ERR_PTR(-ENOMEM);
5541 : :
5542 : : return css;
5543 : : }
5544 : :
/* Free a css previously allocated by debug_css_alloc(). */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
5549 : :
5550 : : static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5551 : : struct cftype *cft)
5552 : : {
5553 : : return cgroup_task_count(css->cgroup);
5554 : : }
5555 : :
5556 : : static u64 current_css_set_read(struct cgroup_subsys_state *css,
5557 : : struct cftype *cft)
5558 : : {
5559 : : return (u64)(unsigned long)current->cgroups;
5560 : : }
5561 : :
5562 : : static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5563 : : struct cftype *cft)
5564 : : {
5565 : : u64 count;
5566 : :
5567 : : rcu_read_lock();
5568 : : count = atomic_read(&task_css_set(current)->refcount);
5569 : : rcu_read_unlock();
5570 : : return count;
5571 : : }
5572 : :
5573 : : static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5574 : : struct cftype *cft,
5575 : : struct seq_file *seq)
5576 : : {
5577 : : struct cgrp_cset_link *link;
5578 : : struct css_set *cset;
5579 : :
5580 : : read_lock(&css_set_lock);
5581 : : rcu_read_lock();
5582 : : cset = rcu_dereference(current->cgroups);
5583 : : list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5584 : : struct cgroup *c = link->cgrp;
5585 : : const char *name;
5586 : :
5587 : : if (c->dentry)
5588 : : name = c->dentry->d_name.name;
5589 : : else
5590 : : name = "?";
5591 : : seq_printf(seq, "Root %d group %s\n",
5592 : : c->root->hierarchy_id, name);
5593 : : }
5594 : : rcu_read_unlock();
5595 : : read_unlock(&css_set_lock);
5596 : : return 0;
5597 : : }
5598 : :
5599 : : #define MAX_TASKS_SHOWN_PER_CSS 25
5600 : : static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5601 : : struct cftype *cft, struct seq_file *seq)
5602 : : {
5603 : : struct cgrp_cset_link *link;
5604 : :
5605 : : read_lock(&css_set_lock);
5606 : : list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5607 : : struct css_set *cset = link->cset;
5608 : : struct task_struct *task;
5609 : : int count = 0;
5610 : : seq_printf(seq, "css_set %p\n", cset);
5611 : : list_for_each_entry(task, &cset->tasks, cg_list) {
5612 : : if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5613 : : seq_puts(seq, " ...\n");
5614 : : break;
5615 : : } else {
5616 : : seq_printf(seq, " task %d\n",
5617 : : task_pid_vnr(task));
5618 : : }
5619 : : }
5620 : : }
5621 : : read_unlock(&css_set_lock);
5622 : : return 0;
5623 : : }
5624 : :
5625 : : static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5626 : : {
5627 : : return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5628 : : }
5629 : :
5630 : : static struct cftype debug_files[] = {
5631 : : {
5632 : : .name = "taskcount",
5633 : : .read_u64 = debug_taskcount_read,
5634 : : },
5635 : :
5636 : : {
5637 : : .name = "current_css_set",
5638 : : .read_u64 = current_css_set_read,
5639 : : },
5640 : :
5641 : : {
5642 : : .name = "current_css_set_refcount",
5643 : : .read_u64 = current_css_set_refcount_read,
5644 : : },
5645 : :
5646 : : {
5647 : : .name = "current_css_set_cg_links",
5648 : : .read_seq_string = current_css_set_cg_links_read,
5649 : : },
5650 : :
5651 : : {
5652 : : .name = "cgroup_css_links",
5653 : : .read_seq_string = cgroup_css_links_read,
5654 : : },
5655 : :
5656 : : {
5657 : : .name = "releasable",
5658 : : .read_u64 = releasable_read,
5659 : : },
5660 : :
5661 : : { } /* terminate */
5662 : : };
5663 : :
5664 : : struct cgroup_subsys debug_subsys = {
5665 : : .name = "debug",
5666 : : .css_alloc = debug_css_alloc,
5667 : : .css_free = debug_css_free,
5668 : : .subsys_id = debug_subsys_id,
5669 : : .base_cftypes = debug_files,
5670 : : };
5671 : : #endif /* CONFIG_CGROUP_DEBUG */
|