Branch data Line data Source code
1 : : /*
2 : : * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 : : * Internal non-public definitions that provide either classic
4 : : * or preemptible semantics.
5 : : *
6 : : * This program is free software; you can redistribute it and/or modify
7 : : * it under the terms of the GNU General Public License as published by
8 : : * the Free Software Foundation; either version 2 of the License, or
9 : : * (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful,
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 : : *
20 : : * Copyright Red Hat, 2009
21 : : * Copyright IBM Corporation, 2009
22 : : *
23 : : * Author: Ingo Molnar <mingo@elte.hu>
24 : : * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 : : */
26 : :
27 : : #include <linux/delay.h>
28 : : #include <linux/gfp.h>
29 : : #include <linux/oom.h>
30 : : #include <linux/smpboot.h>
31 : : #include "../time/tick-internal.h"
32 : :
33 : : #define RCU_KTHREAD_PRIO 1
34 : :
35 : : #ifdef CONFIG_RCU_BOOST
36 : : #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
37 : : #else
38 : : #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
39 : : #endif
40 : :
41 : : #ifdef CONFIG_RCU_NOCB_CPU
42 : : static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
43 : : static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
44 : : static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
45 : : static char __initdata nocb_buf[NR_CPUS * 5];
46 : : #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47 : :
48 : : /*
49 : : * Check the RCU kernel configuration parameters and print informative
50 : : * messages about anything out of the ordinary. If you like #ifdef, you
51 : : * will love this function.
52 : : */
53 : 0 : static void __init rcu_bootup_announce_oddness(void)
54 : : {
55 : : #ifdef CONFIG_RCU_TRACE
56 : : pr_info("\tRCU debugfs-based tracing is enabled.\n");
57 : : #endif
58 : : #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
59 : : pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
60 : : CONFIG_RCU_FANOUT);
61 : : #endif
62 : : #ifdef CONFIG_RCU_FANOUT_EXACT
63 : : pr_info("\tHierarchical RCU autobalancing is disabled.\n");
64 : : #endif
65 : : #ifdef CONFIG_RCU_FAST_NO_HZ
66 : : pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
67 : : #endif
68 : : #ifdef CONFIG_PROVE_RCU
69 : : pr_info("\tRCU lockdep checking is enabled.\n");
70 : : #endif
71 : : #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
72 : : pr_info("\tRCU torture testing starts during boot.\n");
73 : : #endif
74 : : #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
75 : : pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
76 : : #endif
77 : : #if defined(CONFIG_RCU_CPU_STALL_INFO)
78 : : pr_info("\tAdditional per-CPU info printed with stalls.\n");
79 : : #endif
80 : : #if NUM_RCU_LVL_4 != 0
81 : : pr_info("\tFour-level hierarchy is enabled.\n");
82 : : #endif
83 [ # # ]: 0 : if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
84 : 0 : pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
85 [ # # ]: 0 : if (nr_cpu_ids != NR_CPUS)
86 : 0 : pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87 : : #ifdef CONFIG_RCU_NOCB_CPU
88 : : #ifndef CONFIG_RCU_NOCB_CPU_NONE
89 : : if (!have_rcu_nocb_mask) {
90 : : zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
91 : : have_rcu_nocb_mask = true;
92 : : }
93 : : #ifdef CONFIG_RCU_NOCB_CPU_ZERO
94 : : pr_info("\tOffload RCU callbacks from CPU 0\n");
95 : : cpumask_set_cpu(0, rcu_nocb_mask);
96 : : #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97 : : #ifdef CONFIG_RCU_NOCB_CPU_ALL
98 : : pr_info("\tOffload RCU callbacks from all CPUs\n");
99 : : cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100 : : #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101 : : #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 : : if (have_rcu_nocb_mask) {
103 : : if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 : : pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 : : cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 : : rcu_nocb_mask);
107 : : }
108 : : cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
109 : : pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
110 : : if (rcu_nocb_poll)
111 : : pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
112 : : }
113 : : #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
114 : 0 : }
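/*
 * Editor's note, illustrative only (not part of tree_plugin.h): with
 * CONFIG_RCU_NOCB_CPU=y, the set of callback-offloaded CPUs can also be
 * selected on the kernel command line, for example:
 *
 *	rcu_nocbs=1-7		offload callbacks from CPUs 1 through 7
 *	rcu_nocb_poll		make the offload kthreads poll for callbacks
 *
 * The CPU list above is only an example; the right choice depends on the
 * workload and on which CPUs should be shielded from callback processing.
 */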
115 : :
116 : : #ifdef CONFIG_TREE_PREEMPT_RCU
117 : :
118 : : RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
119 : : static struct rcu_state *rcu_state = &rcu_preempt_state;
120 : :
121 : : static int rcu_preempted_readers_exp(struct rcu_node *rnp);
122 : :
123 : : /*
124 : : * Tell them what RCU they are running.
125 : : */
126 : : static void __init rcu_bootup_announce(void)
127 : : {
128 : : pr_info("Preemptible hierarchical RCU implementation.\n");
129 : : rcu_bootup_announce_oddness();
130 : : }
131 : :
132 : : /*
133 : : * Return the number of RCU-preempt batches processed thus far
134 : : * for debug and statistics.
135 : : */
136 : : long rcu_batches_completed_preempt(void)
137 : : {
138 : : return rcu_preempt_state.completed;
139 : : }
140 : : EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
141 : :
142 : : /*
143 : : * Return the number of RCU batches processed thus far for debug & stats.
144 : : */
145 : : long rcu_batches_completed(void)
146 : : {
147 : : return rcu_batches_completed_preempt();
148 : : }
149 : : EXPORT_SYMBOL_GPL(rcu_batches_completed);
150 : :
151 : : /*
152 : : * Force a quiescent state for preemptible RCU.
153 : : */
154 : : void rcu_force_quiescent_state(void)
155 : : {
156 : : force_quiescent_state(&rcu_preempt_state);
157 : : }
158 : : EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
159 : :
160 : : /*
161 : : * Record a preemptible-RCU quiescent state for the specified CPU. Note
 162 : : * that this just means that the task currently running on the CPU is
 163 : : * not in an RCU read-side critical section. There might be any number
 164 : : * of tasks blocked while in RCU read-side critical sections.
165 : : *
166 : : * Unlike the other rcu_*_qs() functions, callers to this function
167 : : * must disable irqs in order to protect the assignment to
168 : : * ->rcu_read_unlock_special.
169 : : */
170 : : static void rcu_preempt_qs(int cpu)
171 : : {
172 : : struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
173 : :
174 : : if (rdp->passed_quiesce == 0)
175 : : trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
176 : : rdp->passed_quiesce = 1;
177 : : current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
178 : : }
179 : :
180 : : /*
181 : : * We have entered the scheduler, and the current task might soon be
182 : : * context-switched away from. If this task is in an RCU read-side
183 : : * critical section, we will no longer be able to rely on the CPU to
184 : : * record that fact, so we enqueue the task on the blkd_tasks list.
185 : : * The task will dequeue itself when it exits the outermost enclosing
186 : : * RCU read-side critical section. Therefore, the current grace period
187 : : * cannot be permitted to complete until the blkd_tasks list entries
188 : : * predating the current grace period drain, in other words, until
189 : : * rnp->gp_tasks becomes NULL.
190 : : *
191 : : * Caller must disable preemption.
192 : : */
193 : : static void rcu_preempt_note_context_switch(int cpu)
194 : : {
195 : : struct task_struct *t = current;
196 : : unsigned long flags;
197 : : struct rcu_data *rdp;
198 : : struct rcu_node *rnp;
199 : :
200 : : if (t->rcu_read_lock_nesting > 0 &&
201 : : (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
202 : :
203 : : /* Possibly blocking in an RCU read-side critical section. */
204 : : rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
205 : : rnp = rdp->mynode;
206 : : raw_spin_lock_irqsave(&rnp->lock, flags);
207 : : smp_mb__after_unlock_lock();
208 : : t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
209 : : t->rcu_blocked_node = rnp;
210 : :
211 : : /*
212 : : * If this CPU has already checked in, then this task
213 : : * will hold up the next grace period rather than the
214 : : * current grace period. Queue the task accordingly.
215 : : * If the task is queued for the current grace period
216 : : * (i.e., this CPU has not yet passed through a quiescent
217 : : * state for the current grace period), then as long
218 : : * as that task remains queued, the current grace period
219 : : * cannot end. Note that there is some uncertainty as
220 : : * to exactly when the current grace period started.
221 : : * We take a conservative approach, which can result
222 : : * in unnecessarily waiting on tasks that started very
223 : : * slightly after the current grace period began. C'est
224 : : * la vie!!!
225 : : *
226 : : * But first, note that the current CPU must still be
227 : : * on line!
228 : : */
229 : : WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
230 : : WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
231 : : if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
232 : : list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
233 : : rnp->gp_tasks = &t->rcu_node_entry;
234 : : #ifdef CONFIG_RCU_BOOST
235 : : if (rnp->boost_tasks != NULL)
236 : : rnp->boost_tasks = rnp->gp_tasks;
237 : : #endif /* #ifdef CONFIG_RCU_BOOST */
238 : : } else {
239 : : list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
240 : : if (rnp->qsmask & rdp->grpmask)
241 : : rnp->gp_tasks = &t->rcu_node_entry;
242 : : }
243 : : trace_rcu_preempt_task(rdp->rsp->name,
244 : : t->pid,
245 : : (rnp->qsmask & rdp->grpmask)
246 : : ? rnp->gpnum
247 : : : rnp->gpnum + 1);
248 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
249 : : } else if (t->rcu_read_lock_nesting < 0 &&
250 : : t->rcu_read_unlock_special) {
251 : :
252 : : /*
253 : : * Complete exit from RCU read-side critical section on
254 : : * behalf of preempted instance of __rcu_read_unlock().
255 : : */
256 : : rcu_read_unlock_special(t);
257 : : }
258 : :
259 : : /*
260 : : * Either we were not in an RCU read-side critical section to
261 : : * begin with, or we have now recorded that critical section
262 : : * globally. Either way, we can now note a quiescent state
263 : : * for this CPU. Again, if we were in an RCU read-side critical
264 : : * section, and if that critical section was blocking the current
265 : : * grace period, then the fact that the task has been enqueued
266 : : * means that we continue to block the current grace period.
267 : : */
268 : : local_irq_save(flags);
269 : : rcu_preempt_qs(cpu);
270 : : local_irq_restore(flags);
271 : : }
272 : :
273 : : /*
274 : : * Check for preempted RCU readers blocking the current grace period
275 : : * for the specified rcu_node structure. If the caller needs a reliable
276 : : * answer, it must hold the rcu_node's ->lock.
277 : : */
278 : : static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
279 : : {
280 : : return rnp->gp_tasks != NULL;
281 : : }
282 : :
283 : : /*
284 : : * Record a quiescent state for all tasks that were previously queued
285 : : * on the specified rcu_node structure and that were blocking the current
286 : : * RCU grace period. The caller must hold the specified rnp->lock with
287 : : * irqs disabled, and this lock is released upon return, but irqs remain
288 : : * disabled.
289 : : */
290 : : static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
291 : : __releases(rnp->lock)
292 : : {
293 : : unsigned long mask;
294 : : struct rcu_node *rnp_p;
295 : :
296 : : if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
297 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
298 : : return; /* Still need more quiescent states! */
299 : : }
300 : :
301 : : rnp_p = rnp->parent;
302 : : if (rnp_p == NULL) {
303 : : /*
304 : : * Either there is only one rcu_node in the tree,
305 : : * or tasks were kicked up to root rcu_node due to
306 : : * CPUs going offline.
307 : : */
308 : : rcu_report_qs_rsp(&rcu_preempt_state, flags);
309 : : return;
310 : : }
311 : :
312 : : /* Report up the rest of the hierarchy. */
313 : : mask = rnp->grpmask;
314 : : raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
315 : : raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
316 : : smp_mb__after_unlock_lock();
317 : : rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
318 : : }
319 : :
320 : : /*
 321 : : * Advance a ->blkd_tasks-list pointer to the next entry, returning
 322 : : * NULL instead if at the end of the list.
323 : : */
324 : : static struct list_head *rcu_next_node_entry(struct task_struct *t,
325 : : struct rcu_node *rnp)
326 : : {
327 : : struct list_head *np;
328 : :
329 : : np = t->rcu_node_entry.next;
330 : : if (np == &rnp->blkd_tasks)
331 : : np = NULL;
332 : : return np;
333 : : }
334 : :
335 : : /*
336 : : * Handle special cases during rcu_read_unlock(), such as needing to
337 : : * notify RCU core processing or task having blocked during the RCU
338 : : * read-side critical section.
339 : : */
340 : : void rcu_read_unlock_special(struct task_struct *t)
341 : : {
342 : : int empty;
343 : : int empty_exp;
344 : : int empty_exp_now;
345 : : unsigned long flags;
346 : : struct list_head *np;
347 : : #ifdef CONFIG_RCU_BOOST
348 : : struct rt_mutex *rbmp = NULL;
349 : : #endif /* #ifdef CONFIG_RCU_BOOST */
350 : : struct rcu_node *rnp;
351 : : int special;
352 : :
353 : : /* NMI handlers cannot block and cannot safely manipulate state. */
354 : : if (in_nmi())
355 : : return;
356 : :
357 : : local_irq_save(flags);
358 : :
359 : : /*
360 : : * If RCU core is waiting for this CPU to exit critical section,
361 : : * let it know that we have done so.
362 : : */
363 : : special = t->rcu_read_unlock_special;
364 : : if (special & RCU_READ_UNLOCK_NEED_QS) {
365 : : rcu_preempt_qs(smp_processor_id());
366 : : if (!t->rcu_read_unlock_special) {
367 : : local_irq_restore(flags);
368 : : return;
369 : : }
370 : : }
371 : :
372 : : /* Hardware IRQ handlers cannot block, complain if they get here. */
373 : : if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
374 : : local_irq_restore(flags);
375 : : return;
376 : : }
377 : :
378 : : /* Clean up if blocked during RCU read-side critical section. */
379 : : if (special & RCU_READ_UNLOCK_BLOCKED) {
380 : : t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
381 : :
382 : : /*
383 : : * Remove this task from the list it blocked on. The
384 : : * task can migrate while we acquire the lock, but at
385 : : * most one time. So at most two passes through loop.
386 : : */
387 : : for (;;) {
388 : : rnp = t->rcu_blocked_node;
389 : : raw_spin_lock(&rnp->lock); /* irqs already disabled. */
390 : : smp_mb__after_unlock_lock();
391 : : if (rnp == t->rcu_blocked_node)
392 : : break;
393 : : raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
394 : : }
395 : : empty = !rcu_preempt_blocked_readers_cgp(rnp);
396 : : empty_exp = !rcu_preempted_readers_exp(rnp);
397 : : smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
398 : : np = rcu_next_node_entry(t, rnp);
399 : : list_del_init(&t->rcu_node_entry);
400 : : t->rcu_blocked_node = NULL;
401 : : trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
402 : : rnp->gpnum, t->pid);
403 : : if (&t->rcu_node_entry == rnp->gp_tasks)
404 : : rnp->gp_tasks = np;
405 : : if (&t->rcu_node_entry == rnp->exp_tasks)
406 : : rnp->exp_tasks = np;
407 : : #ifdef CONFIG_RCU_BOOST
408 : : if (&t->rcu_node_entry == rnp->boost_tasks)
409 : : rnp->boost_tasks = np;
410 : : /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
411 : : if (t->rcu_boost_mutex) {
412 : : rbmp = t->rcu_boost_mutex;
413 : : t->rcu_boost_mutex = NULL;
414 : : }
415 : : #endif /* #ifdef CONFIG_RCU_BOOST */
416 : :
417 : : /*
418 : : * If this was the last task on the current list, and if
419 : : * we aren't waiting on any CPUs, report the quiescent state.
420 : : * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
421 : : * so we must take a snapshot of the expedited state.
422 : : */
423 : : empty_exp_now = !rcu_preempted_readers_exp(rnp);
424 : : if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
425 : : trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
426 : : rnp->gpnum,
427 : : 0, rnp->qsmask,
428 : : rnp->level,
429 : : rnp->grplo,
430 : : rnp->grphi,
431 : : !!rnp->gp_tasks);
432 : : rcu_report_unblock_qs_rnp(rnp, flags);
433 : : } else {
434 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
435 : : }
436 : :
437 : : #ifdef CONFIG_RCU_BOOST
438 : : /* Unboost if we were boosted. */
439 : : if (rbmp)
440 : : rt_mutex_unlock(rbmp);
441 : : #endif /* #ifdef CONFIG_RCU_BOOST */
442 : :
443 : : /*
444 : : * If this was the last task on the expedited lists,
445 : : * then we need to report up the rcu_node hierarchy.
446 : : */
447 : : if (!empty_exp && empty_exp_now)
448 : : rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
449 : : } else {
450 : : local_irq_restore(flags);
451 : : }
452 : : }
453 : :
454 : : #ifdef CONFIG_RCU_CPU_STALL_VERBOSE
455 : :
456 : : /*
457 : : * Dump detailed information for all tasks blocking the current RCU
458 : : * grace period on the specified rcu_node structure.
459 : : */
460 : : static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
461 : : {
462 : : unsigned long flags;
463 : : struct task_struct *t;
464 : :
465 : : raw_spin_lock_irqsave(&rnp->lock, flags);
466 : : if (!rcu_preempt_blocked_readers_cgp(rnp)) {
467 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
468 : : return;
469 : : }
470 : : t = list_entry(rnp->gp_tasks,
471 : : struct task_struct, rcu_node_entry);
472 : : list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
473 : : sched_show_task(t);
474 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
475 : : }
476 : :
477 : : /*
478 : : * Dump detailed information for all tasks blocking the current RCU
479 : : * grace period.
480 : : */
481 : : static void rcu_print_detail_task_stall(struct rcu_state *rsp)
482 : : {
483 : : struct rcu_node *rnp = rcu_get_root(rsp);
484 : :
485 : : rcu_print_detail_task_stall_rnp(rnp);
486 : : rcu_for_each_leaf_node(rsp, rnp)
487 : : rcu_print_detail_task_stall_rnp(rnp);
488 : : }
489 : :
490 : : #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
491 : :
492 : : static void rcu_print_detail_task_stall(struct rcu_state *rsp)
493 : : {
494 : : }
495 : :
496 : : #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
497 : :
498 : : #ifdef CONFIG_RCU_CPU_STALL_INFO
499 : :
500 : : static void rcu_print_task_stall_begin(struct rcu_node *rnp)
501 : : {
502 : : pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
503 : : rnp->level, rnp->grplo, rnp->grphi);
504 : : }
505 : :
506 : : static void rcu_print_task_stall_end(void)
507 : : {
508 : : pr_cont("\n");
509 : : }
510 : :
511 : : #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
512 : :
513 : : static void rcu_print_task_stall_begin(struct rcu_node *rnp)
514 : : {
515 : : }
516 : :
517 : : static void rcu_print_task_stall_end(void)
518 : : {
519 : : }
520 : :
521 : : #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
522 : :
523 : : /*
524 : : * Scan the current list of tasks blocked within RCU read-side critical
525 : : * sections, printing out the tid of each.
526 : : */
527 : : static int rcu_print_task_stall(struct rcu_node *rnp)
528 : : {
529 : : struct task_struct *t;
530 : : int ndetected = 0;
531 : :
532 : : if (!rcu_preempt_blocked_readers_cgp(rnp))
533 : : return 0;
534 : : rcu_print_task_stall_begin(rnp);
535 : : t = list_entry(rnp->gp_tasks,
536 : : struct task_struct, rcu_node_entry);
537 : : list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
538 : : pr_cont(" P%d", t->pid);
539 : : ndetected++;
540 : : }
541 : : rcu_print_task_stall_end();
542 : : return ndetected;
543 : : }
544 : :
545 : : /*
546 : : * Check that the list of blocked tasks for the newly completed grace
547 : : * period is in fact empty. It is a serious bug to complete a grace
548 : : * period that still has RCU readers blocked! This function must be
549 : : * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
550 : : * must be held by the caller.
551 : : *
552 : : * Also, if there are blocked tasks on the list, they automatically
553 : : * block the newly created grace period, so set up ->gp_tasks accordingly.
554 : : */
555 : : static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
556 : : {
557 : : WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
558 : : if (!list_empty(&rnp->blkd_tasks))
559 : : rnp->gp_tasks = rnp->blkd_tasks.next;
560 : : WARN_ON_ONCE(rnp->qsmask);
561 : : }
562 : :
563 : : #ifdef CONFIG_HOTPLUG_CPU
564 : :
565 : : /*
566 : : * Handle tasklist migration for case in which all CPUs covered by the
567 : : * specified rcu_node have gone offline. Move them up to the root
568 : : * rcu_node. The reason for not just moving them to the immediate
569 : : * parent is to remove the need for rcu_read_unlock_special() to
570 : : * make more than two attempts to acquire the target rcu_node's lock.
 571 : : * Returns a bitmask describing the tasks that were blocking grace
 572 : : * periods on the specified rcu_node structure: RCU_OFL_TASKS_NORM_GP
 573 : : * if tasks were blocking the current normal grace period, and
 574 : : * RCU_OFL_TASKS_EXP_GP if tasks were blocking the current expedited
 575 : : * grace period.
576 : : *
577 : : * The caller must hold rnp->lock with irqs disabled.
578 : : */
579 : : static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
580 : : struct rcu_node *rnp,
581 : : struct rcu_data *rdp)
582 : : {
583 : : struct list_head *lp;
584 : : struct list_head *lp_root;
585 : : int retval = 0;
586 : : struct rcu_node *rnp_root = rcu_get_root(rsp);
587 : : struct task_struct *t;
588 : :
589 : : if (rnp == rnp_root) {
590 : : WARN_ONCE(1, "Last CPU thought to be offlined?");
591 : : return 0; /* Shouldn't happen: at least one CPU online. */
592 : : }
593 : :
594 : : /* If we are on an internal node, complain bitterly. */
595 : : WARN_ON_ONCE(rnp != rdp->mynode);
596 : :
597 : : /*
598 : : * Move tasks up to root rcu_node. Don't try to get fancy for
599 : : * this corner-case operation -- just put this node's tasks
600 : : * at the head of the root node's list, and update the root node's
601 : : * ->gp_tasks and ->exp_tasks pointers to those of this node's,
602 : : * if non-NULL. This might result in waiting for more tasks than
603 : : * absolutely necessary, but this is a good performance/complexity
604 : : * tradeoff.
605 : : */
606 : : if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
607 : : retval |= RCU_OFL_TASKS_NORM_GP;
608 : : if (rcu_preempted_readers_exp(rnp))
609 : : retval |= RCU_OFL_TASKS_EXP_GP;
610 : : lp = &rnp->blkd_tasks;
611 : : lp_root = &rnp_root->blkd_tasks;
612 : : while (!list_empty(lp)) {
613 : : t = list_entry(lp->next, typeof(*t), rcu_node_entry);
614 : : raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
615 : : smp_mb__after_unlock_lock();
616 : : list_del(&t->rcu_node_entry);
617 : : t->rcu_blocked_node = rnp_root;
618 : : list_add(&t->rcu_node_entry, lp_root);
619 : : if (&t->rcu_node_entry == rnp->gp_tasks)
620 : : rnp_root->gp_tasks = rnp->gp_tasks;
621 : : if (&t->rcu_node_entry == rnp->exp_tasks)
622 : : rnp_root->exp_tasks = rnp->exp_tasks;
623 : : #ifdef CONFIG_RCU_BOOST
624 : : if (&t->rcu_node_entry == rnp->boost_tasks)
625 : : rnp_root->boost_tasks = rnp->boost_tasks;
626 : : #endif /* #ifdef CONFIG_RCU_BOOST */
627 : : raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
628 : : }
629 : :
630 : : rnp->gp_tasks = NULL;
631 : : rnp->exp_tasks = NULL;
632 : : #ifdef CONFIG_RCU_BOOST
633 : : rnp->boost_tasks = NULL;
634 : : /*
635 : : * In case root is being boosted and leaf was not. Make sure
636 : : * that we boost the tasks blocking the current grace period
637 : : * in this case.
638 : : */
639 : : raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
640 : : smp_mb__after_unlock_lock();
641 : : if (rnp_root->boost_tasks != NULL &&
642 : : rnp_root->boost_tasks != rnp_root->gp_tasks &&
643 : : rnp_root->boost_tasks != rnp_root->exp_tasks)
644 : : rnp_root->boost_tasks = rnp_root->gp_tasks;
645 : : raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
646 : : #endif /* #ifdef CONFIG_RCU_BOOST */
647 : :
648 : : return retval;
649 : : }
650 : :
651 : : #endif /* #ifdef CONFIG_HOTPLUG_CPU */
652 : :
653 : : /*
654 : : * Check for a quiescent state from the current CPU. When a task blocks,
655 : : * the task is recorded in the corresponding CPU's rcu_node structure,
656 : : * which is checked elsewhere.
657 : : *
658 : : * Caller must disable hard irqs.
659 : : */
660 : : static void rcu_preempt_check_callbacks(int cpu)
661 : : {
662 : : struct task_struct *t = current;
663 : :
664 : : if (t->rcu_read_lock_nesting == 0) {
665 : : rcu_preempt_qs(cpu);
666 : : return;
667 : : }
668 : : if (t->rcu_read_lock_nesting > 0 &&
669 : : per_cpu(rcu_preempt_data, cpu).qs_pending)
670 : : t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
671 : : }
672 : :
673 : : #ifdef CONFIG_RCU_BOOST
674 : :
675 : : static void rcu_preempt_do_callbacks(void)
676 : : {
677 : : rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
678 : : }
679 : :
680 : : #endif /* #ifdef CONFIG_RCU_BOOST */
681 : :
682 : : /*
683 : : * Queue a preemptible-RCU callback for invocation after a grace period.
684 : : */
685 : : void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
686 : : {
687 : : __call_rcu(head, func, &rcu_preempt_state, -1, 0);
688 : : }
689 : : EXPORT_SYMBOL_GPL(call_rcu);
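/*
 * Editor's sketch, illustrative only (not part of tree_plugin.h and not
 * covered by the data above): a typical call_rcu() user embeds an
 * rcu_head in its own structure and frees that structure from the
 * callback once a grace period has elapsed.  "struct foo", foo_reclaim()
 * and foo_release() are hypothetical names used only for this example.
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct list_head list;
	int data;
	struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *rcu)
{
	struct foo *fp = container_of(rcu, struct foo, rcu);

	kfree(fp);			/* All pre-existing readers are done. */
}

/* Caller holds the update-side lock protecting the list. */
static void foo_release(struct foo *fp)
{
	list_del_rcu(&fp->list);	 /* New readers can no longer find fp. */
	call_rcu(&fp->rcu, foo_reclaim); /* Defer the free past a grace period. */
}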
690 : :
691 : : /*
692 : : * Queue an RCU callback for lazy invocation after a grace period.
693 : : * This will likely be later named something like "call_rcu_lazy()",
694 : : * but this change will require some way of tagging the lazy RCU
695 : : * callbacks in the list of pending callbacks. Until then, this
696 : : * function may only be called from __kfree_rcu().
697 : : */
698 : : void kfree_call_rcu(struct rcu_head *head,
699 : : void (*func)(struct rcu_head *rcu))
700 : : {
701 : : __call_rcu(head, func, &rcu_preempt_state, -1, 1);
702 : : }
703 : : EXPORT_SYMBOL_GPL(kfree_call_rcu);
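/*
 * Editor's sketch, illustrative only (not part of tree_plugin.h):
 * kfree_call_rcu() is normally reached through the kfree_rcu() wrapper,
 * which only needs the name of the rcu_head field inside the enclosing
 * structure.  "struct bar" and bar_free() are hypothetical names.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bar {
	int data;
	struct rcu_head rcu;
};

static void bar_free(struct bar *bp)
{
	kfree_rcu(bp, rcu);	/* Queues a lazy callback that just kfree()s bp. */
}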
704 : :
705 : : /**
706 : : * synchronize_rcu - wait until a grace period has elapsed.
707 : : *
708 : : * Control will return to the caller some time after a full grace
709 : : * period has elapsed, in other words after all currently executing RCU
710 : : * read-side critical sections have completed. Note, however, that
711 : : * upon return from synchronize_rcu(), the caller might well be executing
712 : : * concurrently with new RCU read-side critical sections that began while
713 : : * synchronize_rcu() was waiting. RCU read-side critical sections are
714 : : * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
715 : : *
716 : : * See the description of synchronize_sched() for more detailed information
717 : : * on memory ordering guarantees.
718 : : */
719 : : void synchronize_rcu(void)
720 : : {
721 : : rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
722 : : !lock_is_held(&rcu_lock_map) &&
723 : : !lock_is_held(&rcu_sched_lock_map),
724 : : "Illegal synchronize_rcu() in RCU read-side critical section");
725 : : if (!rcu_scheduler_active)
726 : : return;
727 : : if (rcu_expedited)
728 : : synchronize_rcu_expedited();
729 : : else
730 : : wait_rcu_gp(call_rcu);
731 : : }
732 : : EXPORT_SYMBOL_GPL(synchronize_rcu);
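/*
 * Editor's sketch, illustrative only (not part of tree_plugin.h): the
 * classic updater pattern pairs rcu_assign_pointer() with
 * synchronize_rcu(), while readers use rcu_read_lock() and
 * rcu_dereference().  "struct cfg", cur_cfg, cfg_read_threshold() and
 * cfg_update() are hypothetical names; updaters are assumed to be
 * serialized by a lock that is not shown.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int threshold;
};

static struct cfg __rcu *cur_cfg;	/* Assumed initialized elsewhere. */

static int cfg_read_threshold(void)
{
	int val;

	rcu_read_lock();
	val = rcu_dereference(cur_cfg)->threshold; /* Read-side critical section. */
	rcu_read_unlock();
	return val;
}

static void cfg_update(struct cfg *new_cfg)
{
	struct cfg *old_cfg = rcu_dereference_protected(cur_cfg, 1);

	rcu_assign_pointer(cur_cfg, new_cfg);	/* Publish the new version. */
	synchronize_rcu();			/* Wait for pre-existing readers. */
	kfree(old_cfg);				/* Now safe to free the old one. */
}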
733 : :
734 : : static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
735 : : static unsigned long sync_rcu_preempt_exp_count;
736 : : static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
737 : :
738 : : /*
739 : : * Return non-zero if there are any tasks in RCU read-side critical
740 : : * sections blocking the current preemptible-RCU expedited grace period.
741 : : * If there is no preemptible-RCU expedited grace period currently in
742 : : * progress, returns zero unconditionally.
743 : : */
744 : : static int rcu_preempted_readers_exp(struct rcu_node *rnp)
745 : : {
746 : : return rnp->exp_tasks != NULL;
747 : : }
748 : :
749 : : /*
 750 : : * Return non-zero if there is no RCU expedited grace period in progress
 751 : : * for the specified rcu_node structure, in other words, if all CPUs and
 752 : : * tasks covered by the specified rcu_node structure have done their bit
 753 : : * for the current expedited grace period. Works only for preemptible
 754 : : * RCU -- other RCU implementations use other means.
755 : : *
756 : : * Caller must hold sync_rcu_preempt_exp_mutex.
757 : : */
758 : : static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
759 : : {
760 : : return !rcu_preempted_readers_exp(rnp) &&
761 : : ACCESS_ONCE(rnp->expmask) == 0;
762 : : }
763 : :
764 : : /*
765 : : * Report the exit from RCU read-side critical section for the last task
766 : : * that queued itself during or before the current expedited preemptible-RCU
767 : : * grace period. This event is reported either to the rcu_node structure on
768 : : * which the task was queued or to one of that rcu_node structure's ancestors,
769 : : * recursively up the tree. (Calm down, calm down, we do the recursion
770 : : * iteratively!)
771 : : *
772 : : * Most callers will set the "wake" flag, but the task initiating the
773 : : * expedited grace period need not wake itself.
774 : : *
775 : : * Caller must hold sync_rcu_preempt_exp_mutex.
776 : : */
777 : : static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
778 : : bool wake)
779 : : {
780 : : unsigned long flags;
781 : : unsigned long mask;
782 : :
783 : : raw_spin_lock_irqsave(&rnp->lock, flags);
784 : : smp_mb__after_unlock_lock();
785 : : for (;;) {
786 : : if (!sync_rcu_preempt_exp_done(rnp)) {
787 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
788 : : break;
789 : : }
790 : : if (rnp->parent == NULL) {
791 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
792 : : if (wake) {
793 : : smp_mb(); /* EGP done before wake_up(). */
794 : : wake_up(&sync_rcu_preempt_exp_wq);
795 : : }
796 : : break;
797 : : }
798 : : mask = rnp->grpmask;
799 : : raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
800 : : rnp = rnp->parent;
801 : : raw_spin_lock(&rnp->lock); /* irqs already disabled */
802 : : smp_mb__after_unlock_lock();
803 : : rnp->expmask &= ~mask;
804 : : }
805 : : }
806 : :
807 : : /*
808 : : * Snapshot the tasks blocking the newly started preemptible-RCU expedited
809 : : * grace period for the specified rcu_node structure. If there are no such
810 : : * tasks, report it up the rcu_node hierarchy.
811 : : *
812 : : * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
813 : : * CPU hotplug operations.
814 : : */
815 : : static void
816 : : sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
817 : : {
818 : : unsigned long flags;
819 : : int must_wait = 0;
820 : :
821 : : raw_spin_lock_irqsave(&rnp->lock, flags);
822 : : smp_mb__after_unlock_lock();
823 : : if (list_empty(&rnp->blkd_tasks)) {
824 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
825 : : } else {
826 : : rnp->exp_tasks = rnp->blkd_tasks.next;
827 : : rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
828 : : must_wait = 1;
829 : : }
830 : : if (!must_wait)
831 : : rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
832 : : }
833 : :
834 : : /**
835 : : * synchronize_rcu_expedited - Brute-force RCU grace period
836 : : *
837 : : * Wait for an RCU-preempt grace period, but expedite it. The basic
838 : : * idea is to invoke synchronize_sched_expedited() to push all the tasks to
839 : : * the ->blkd_tasks lists and wait for this list to drain. This consumes
840 : : * significant time on all CPUs and is unfriendly to real-time workloads,
841 : : * so is thus not recommended for any sort of common-case code.
842 : : * In fact, if you are using synchronize_rcu_expedited() in a loop,
 843 : : * please restructure your code to batch your updates, and then use a
844 : : * single synchronize_rcu() instead.
845 : : *
846 : : * Note that it is illegal to call this function while holding any lock
847 : : * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
848 : : * to call this function from a CPU-hotplug notifier. Failing to observe
 849 : : * these restrictions will result in deadlock.
850 : : */
851 : : void synchronize_rcu_expedited(void)
852 : : {
853 : : unsigned long flags;
854 : : struct rcu_node *rnp;
855 : : struct rcu_state *rsp = &rcu_preempt_state;
856 : : unsigned long snap;
857 : : int trycount = 0;
858 : :
859 : : smp_mb(); /* Caller's modifications seen first by other CPUs. */
860 : : snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
861 : : smp_mb(); /* Above access cannot bleed into critical section. */
862 : :
863 : : /*
864 : : * Block CPU-hotplug operations. This means that any CPU-hotplug
865 : : * operation that finds an rcu_node structure with tasks in the
866 : : * process of being boosted will know that all tasks blocking
867 : : * this expedited grace period will already be in the process of
868 : : * being boosted. This simplifies the process of moving tasks
869 : : * from leaf to root rcu_node structures.
870 : : */
871 : : get_online_cpus();
872 : :
873 : : /*
874 : : * Acquire lock, falling back to synchronize_rcu() if too many
875 : : * lock-acquisition failures. Of course, if someone does the
876 : : * expedited grace period for us, just leave.
877 : : */
878 : : while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
879 : : if (ULONG_CMP_LT(snap,
880 : : ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
881 : : put_online_cpus();
882 : : goto mb_ret; /* Others did our work for us. */
883 : : }
884 : : if (trycount++ < 10) {
885 : : udelay(trycount * num_online_cpus());
886 : : } else {
887 : : put_online_cpus();
888 : : wait_rcu_gp(call_rcu);
889 : : return;
890 : : }
891 : : }
892 : : if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
893 : : put_online_cpus();
894 : : goto unlock_mb_ret; /* Others did our work for us. */
895 : : }
896 : :
897 : : /* force all RCU readers onto ->blkd_tasks lists. */
898 : : synchronize_sched_expedited();
899 : :
900 : : /* Initialize ->expmask for all non-leaf rcu_node structures. */
901 : : rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
902 : : raw_spin_lock_irqsave(&rnp->lock, flags);
903 : : smp_mb__after_unlock_lock();
904 : : rnp->expmask = rnp->qsmaskinit;
905 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
906 : : }
907 : :
908 : : /* Snapshot current state of ->blkd_tasks lists. */
909 : : rcu_for_each_leaf_node(rsp, rnp)
910 : : sync_rcu_preempt_exp_init(rsp, rnp);
911 : : if (NUM_RCU_NODES > 1)
912 : : sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
913 : :
914 : : put_online_cpus();
915 : :
916 : : /* Wait for snapshotted ->blkd_tasks lists to drain. */
917 : : rnp = rcu_get_root(rsp);
918 : : wait_event(sync_rcu_preempt_exp_wq,
919 : : sync_rcu_preempt_exp_done(rnp));
920 : :
921 : : /* Clean up and exit. */
922 : : smp_mb(); /* ensure expedited GP seen before counter increment. */
923 : : ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
924 : : unlock_mb_ret:
925 : : mutex_unlock(&sync_rcu_preempt_exp_mutex);
926 : : mb_ret:
927 : : smp_mb(); /* ensure subsequent action seen after grace period. */
928 : : }
929 : : EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
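/*
 * Editor's sketch, illustrative only (not part of tree_plugin.h): as the
 * comment above suggests, batching updates behind one ordinary grace
 * period is usually preferable to expediting each update.  "struct item"
 * and flush_items() are hypothetical; the caller is assumed to hold the
 * update-side lock protecting @head.
 */
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	struct list_head list;	/* Linkage on the RCU-protected public list. */
	struct list_head reap;	/* Private, updater-only linkage. */
	int key;
};

static void flush_items(struct list_head *head)
{
	struct item *ip, *tmp;
	LIST_HEAD(batch);

	list_for_each_entry_safe(ip, tmp, head, list) {
		list_del_rcu(&ip->list);	/* Readers may still see ip. */
		list_add(&ip->reap, &batch);	/* Collect on a private list. */
	}

	synchronize_rcu();	/* One grace period covers the whole batch. */

	list_for_each_entry_safe(ip, tmp, &batch, reap)
		kfree(ip);	/* No reader can still hold a reference. */
}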
930 : :
931 : : /**
932 : : * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
933 : : *
934 : : * Note that this primitive does not necessarily wait for an RCU grace period
935 : : * to complete. For example, if there are no RCU callbacks queued anywhere
936 : : * in the system, then rcu_barrier() is within its rights to return
937 : : * immediately, without waiting for anything, much less an RCU grace period.
938 : : */
939 : : void rcu_barrier(void)
940 : : {
941 : : _rcu_barrier(&rcu_preempt_state);
942 : : }
943 : : EXPORT_SYMBOL_GPL(rcu_barrier);
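/*
 * Editor's sketch, illustrative only (not part of tree_plugin.h): a
 * module that posts call_rcu() callbacks must wait for all of them to be
 * invoked before its callback code can be unloaded.  my_cache_teardown()
 * and my_module_exit() are hypothetical names.
 */
#include <linux/module.h>
#include <linux/rcupdate.h>

void my_cache_teardown(void);	/* Hypothetical: unlinks entries, posts call_rcu(). */

static void __exit my_module_exit(void)
{
	my_cache_teardown();	/* Queues the final call_rcu() callbacks. */
	rcu_barrier();		/* Wait until every queued callback has run. */
	/* Only now is it safe for the callback functions to go away. */
}
module_exit(my_module_exit);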
944 : :
945 : : /*
946 : : * Initialize preemptible RCU's state structures.
947 : : */
948 : : static void __init __rcu_init_preempt(void)
949 : : {
950 : : rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
951 : : }
952 : :
953 : : /*
954 : : * Check for a task exiting while in a preemptible-RCU read-side
955 : : * critical section, clean up if so. No need to issue warnings,
956 : : * as debug_check_no_locks_held() already does this if lockdep
957 : : * is enabled.
958 : : */
959 : : void exit_rcu(void)
960 : : {
961 : : struct task_struct *t = current;
962 : :
 963 : : if (likely(list_empty(&current->rcu_node_entry)))
964 : : return;
965 : : t->rcu_read_lock_nesting = 1;
966 : : barrier();
967 : : t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
968 : : __rcu_read_unlock();
969 : : }
970 : :
971 : : #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
972 : :
973 : : static struct rcu_state *rcu_state = &rcu_sched_state;
974 : :
975 : : /*
976 : : * Tell them what RCU they are running.
977 : : */
978 : 0 : static void __init rcu_bootup_announce(void)
979 : : {
980 : 0 : pr_info("Hierarchical RCU implementation.\n");
981 : 0 : rcu_bootup_announce_oddness();
982 : 0 : }
983 : :
984 : : /*
985 : : * Return the number of RCU batches processed thus far for debug & stats.
986 : : */
987 : 0 : long rcu_batches_completed(void)
988 : : {
989 : 0 : return rcu_batches_completed_sched();
990 : : }
991 : : EXPORT_SYMBOL_GPL(rcu_batches_completed);
992 : :
993 : : /*
994 : : * Force a quiescent state for RCU, which, because there is no preemptible
995 : : * RCU, becomes the same as rcu-sched.
996 : : */
997 : 0 : void rcu_force_quiescent_state(void)
998 : : {
999 : : rcu_sched_force_quiescent_state();
1000 : 0 : }
1001 : : EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1002 : :
1003 : : /*
1004 : : * Because preemptible RCU does not exist, we never have to check for
1005 : : * CPUs being in quiescent states.
1006 : : */
1007 : : static void rcu_preempt_note_context_switch(int cpu)
1008 : : {
1009 : : }
1010 : :
1011 : : /*
1012 : : * Because preemptible RCU does not exist, there are never any preempted
1013 : : * RCU readers.
1014 : : */
1015 : : static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
1016 : : {
1017 : : return 0;
1018 : : }
1019 : :
1020 : : #ifdef CONFIG_HOTPLUG_CPU
1021 : :
1022 : : /* Because preemptible RCU does not exist, no quieting of tasks. */
1023 : : static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
1024 : : {
1025 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1026 : : }
1027 : :
1028 : : #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1029 : :
1030 : : /*
1031 : : * Because preemptible RCU does not exist, we never have to check for
1032 : : * tasks blocked within RCU read-side critical sections.
1033 : : */
1034 : : static void rcu_print_detail_task_stall(struct rcu_state *rsp)
1035 : : {
1036 : : }
1037 : :
1038 : : /*
1039 : : * Because preemptible RCU does not exist, we never have to check for
1040 : : * tasks blocked within RCU read-side critical sections.
1041 : : */
1042 : : static int rcu_print_task_stall(struct rcu_node *rnp)
1043 : : {
1044 : : return 0;
1045 : : }
1046 : :
1047 : : /*
1048 : : * Because there is no preemptible RCU, there can be no readers blocked,
1049 : : * so there is no need to check for blocked tasks; check only for
1050 : : * bogus qsmask values.
1051 : : */
1052 : 0 : static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
1053 : : {
1054 [ - + ][ # # ][ # # ]: 223588 : WARN_ON_ONCE(rnp->qsmask);
1055 : 223588 : }
1056 : :
1057 : : #ifdef CONFIG_HOTPLUG_CPU
1058 : :
1059 : : /*
1060 : : * Because preemptible RCU does not exist, it never needs to migrate
1061 : : * tasks that were blocked within RCU read-side critical sections, and
1062 : : * such non-existent tasks cannot possibly have been blocking the current
1063 : : * grace period.
1064 : : */
1065 : : static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1066 : : struct rcu_node *rnp,
1067 : : struct rcu_data *rdp)
1068 : : {
1069 : : return 0;
1070 : : }
1071 : :
1072 : : #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1073 : :
1074 : : /*
1075 : : * Because preemptible RCU does not exist, it never has any callbacks
1076 : : * to check.
1077 : : */
1078 : : static void rcu_preempt_check_callbacks(int cpu)
1079 : : {
1080 : : }
1081 : :
1082 : : /*
1083 : : * Queue an RCU callback for lazy invocation after a grace period.
1084 : : * This will likely be later named something like "call_rcu_lazy()",
1085 : : * but this change will require some way of tagging the lazy RCU
1086 : : * callbacks in the list of pending callbacks. Until then, this
1087 : : * function may only be called from __kfree_rcu().
1088 : : *
1089 : : * Because there is no preemptible RCU, we use RCU-sched instead.
1090 : : */
1091 : 0 : void kfree_call_rcu(struct rcu_head *head,
1092 : : void (*func)(struct rcu_head *rcu))
1093 : : {
1094 : 548569 : __call_rcu(head, func, &rcu_sched_state, -1, 1);
1095 : 566138 : }
1096 : : EXPORT_SYMBOL_GPL(kfree_call_rcu);
1097 : :
1098 : : /*
1099 : : * Wait for an rcu-preempt grace period, but make it happen quickly.
1100 : : * But because preemptible RCU does not exist, map to rcu-sched.
1101 : : */
1102 : 0 : void synchronize_rcu_expedited(void)
1103 : : {
1104 : 0 : synchronize_sched_expedited();
1105 : 0 : }
1106 : : EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1107 : :
1108 : : #ifdef CONFIG_HOTPLUG_CPU
1109 : :
1110 : : /*
1111 : : * Because preemptible RCU does not exist, there is never any need to
1112 : : * report on tasks preempted in RCU read-side critical sections during
1113 : : * expedited RCU grace periods.
1114 : : */
1115 : : static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1116 : : bool wake)
1117 : : {
1118 : : }
1119 : :
1120 : : #endif /* #ifdef CONFIG_HOTPLUG_CPU */
1121 : :
1122 : : /*
1123 : : * Because preemptible RCU does not exist, rcu_barrier() is just
1124 : : * another name for rcu_barrier_sched().
1125 : : */
1126 : 0 : void rcu_barrier(void)
1127 : : {
1128 : : rcu_barrier_sched();
1129 : 0 : }
1130 : : EXPORT_SYMBOL_GPL(rcu_barrier);
1131 : :
1132 : : /*
1133 : : * Because preemptible RCU does not exist, it need not be initialized.
1134 : : */
1135 : : static void __init __rcu_init_preempt(void)
1136 : : {
1137 : : }
1138 : :
1139 : : /*
1140 : : * Because preemptible RCU does not exist, tasks cannot possibly exit
1141 : : * while in preemptible RCU read-side critical sections.
1142 : : */
1143 : 0 : void exit_rcu(void)
1144 : : {
1145 : 1104228 : }
1146 : :
1147 : : #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1148 : :
1149 : : #ifdef CONFIG_RCU_BOOST
1150 : :
1151 : : #include "../locking/rtmutex_common.h"
1152 : :
1153 : : #ifdef CONFIG_RCU_TRACE
1154 : :
1155 : : static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1156 : : {
1157 : : if (list_empty(&rnp->blkd_tasks))
1158 : : rnp->n_balk_blkd_tasks++;
1159 : : else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1160 : : rnp->n_balk_exp_gp_tasks++;
1161 : : else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1162 : : rnp->n_balk_boost_tasks++;
1163 : : else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1164 : : rnp->n_balk_notblocked++;
1165 : : else if (rnp->gp_tasks != NULL &&
1166 : : ULONG_CMP_LT(jiffies, rnp->boost_time))
1167 : : rnp->n_balk_notyet++;
1168 : : else
1169 : : rnp->n_balk_nos++;
1170 : : }
1171 : :
1172 : : #else /* #ifdef CONFIG_RCU_TRACE */
1173 : :
1174 : : static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1175 : : {
1176 : : }
1177 : :
1178 : : #endif /* #else #ifdef CONFIG_RCU_TRACE */
1179 : :
1180 : : static void rcu_wake_cond(struct task_struct *t, int status)
1181 : : {
1182 : : /*
1183 : : * If the thread is yielding, only wake it when this
1184 : : * is invoked from idle
1185 : : */
1186 : : if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1187 : : wake_up_process(t);
1188 : : }
1189 : :
1190 : : /*
1191 : : * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1192 : : * or ->boost_tasks, advancing the pointer to the next task in the
1193 : : * ->blkd_tasks list.
1194 : : *
1195 : : * Note that irqs must be enabled: boosting the task can block.
1196 : : * Returns 1 if there are more tasks needing to be boosted.
1197 : : */
1198 : : static int rcu_boost(struct rcu_node *rnp)
1199 : : {
1200 : : unsigned long flags;
1201 : : struct rt_mutex mtx;
1202 : : struct task_struct *t;
1203 : : struct list_head *tb;
1204 : :
1205 : : if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1206 : : return 0; /* Nothing left to boost. */
1207 : :
1208 : : raw_spin_lock_irqsave(&rnp->lock, flags);
1209 : : smp_mb__after_unlock_lock();
1210 : :
1211 : : /*
1212 : : * Recheck under the lock: all tasks in need of boosting
1213 : : * might exit their RCU read-side critical sections on their own.
1214 : : */
1215 : : if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1216 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1217 : : return 0;
1218 : : }
1219 : :
1220 : : /*
1221 : : * Preferentially boost tasks blocking expedited grace periods.
1222 : : * This cannot starve the normal grace periods because a second
1223 : : * expedited grace period must boost all blocked tasks, including
1224 : : * those blocking the pre-existing normal grace period.
1225 : : */
1226 : : if (rnp->exp_tasks != NULL) {
1227 : : tb = rnp->exp_tasks;
1228 : : rnp->n_exp_boosts++;
1229 : : } else {
1230 : : tb = rnp->boost_tasks;
1231 : : rnp->n_normal_boosts++;
1232 : : }
1233 : : rnp->n_tasks_boosted++;
1234 : :
1235 : : /*
1236 : : * We boost task t by manufacturing an rt_mutex that appears to
1237 : : * be held by task t. We leave a pointer to that rt_mutex where
1238 : : * task t can find it, and task t will release the mutex when it
1239 : : * exits its outermost RCU read-side critical section. Then
1240 : : * simply acquiring this artificial rt_mutex will boost task
1241 : : * t's priority. (Thanks to tglx for suggesting this approach!)
1242 : : *
1243 : : * Note that task t must acquire rnp->lock to remove itself from
1244 : : * the ->blkd_tasks list, which it will do from exit() if from
1245 : : * nowhere else. We therefore are guaranteed that task t will
1246 : : * stay around at least until we drop rnp->lock. Note that
1247 : : * rnp->lock also resolves races between our priority boosting
1248 : : * and task t's exiting its outermost RCU read-side critical
1249 : : * section.
1250 : : */
1251 : : t = container_of(tb, struct task_struct, rcu_node_entry);
1252 : : rt_mutex_init_proxy_locked(&mtx, t);
1253 : : t->rcu_boost_mutex = &mtx;
1254 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1255 : : rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1256 : : rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1257 : :
1258 : : return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1259 : : ACCESS_ONCE(rnp->boost_tasks) != NULL;
1260 : : }
1261 : :
1262 : : /*
1263 : : * Priority-boosting kthread. One per leaf rcu_node and one for the
1264 : : * root rcu_node.
1265 : : */
1266 : : static int rcu_boost_kthread(void *arg)
1267 : : {
1268 : : struct rcu_node *rnp = (struct rcu_node *)arg;
1269 : : int spincnt = 0;
1270 : : int more2boost;
1271 : :
1272 : : trace_rcu_utilization(TPS("Start boost kthread@init"));
1273 : : for (;;) {
1274 : : rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1275 : : trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1276 : : rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1277 : : trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1278 : : rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1279 : : more2boost = rcu_boost(rnp);
1280 : : if (more2boost)
1281 : : spincnt++;
1282 : : else
1283 : : spincnt = 0;
1284 : : if (spincnt > 10) {
1285 : : rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1286 : : trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1287 : : schedule_timeout_interruptible(2);
1288 : : trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1289 : : spincnt = 0;
1290 : : }
1291 : : }
1292 : : /* NOTREACHED */
1293 : : trace_rcu_utilization(TPS("End boost kthread@notreached"));
1294 : : return 0;
1295 : : }
1296 : :
1297 : : /*
1298 : : * Check to see if it is time to start boosting RCU readers that are
1299 : : * blocking the current grace period, and, if so, tell the per-rcu_node
1300 : : * kthread to start boosting them. If there is an expedited grace
1301 : : * period in progress, it is always time to boost.
1302 : : *
1303 : : * The caller must hold rnp->lock, which this function releases.
1304 : : * The ->boost_kthread_task is immortal, so we don't need to worry
1305 : : * about it going away.
1306 : : */
1307 : : static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1308 : : {
1309 : : struct task_struct *t;
1310 : :
1311 : : if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1312 : : rnp->n_balk_exp_gp_tasks++;
1313 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1314 : : return;
1315 : : }
1316 : : if (rnp->exp_tasks != NULL ||
1317 : : (rnp->gp_tasks != NULL &&
1318 : : rnp->boost_tasks == NULL &&
1319 : : rnp->qsmask == 0 &&
1320 : : ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1321 : : if (rnp->exp_tasks == NULL)
1322 : : rnp->boost_tasks = rnp->gp_tasks;
1323 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1324 : : t = rnp->boost_kthread_task;
1325 : : if (t)
1326 : : rcu_wake_cond(t, rnp->boost_kthread_status);
1327 : : } else {
1328 : : rcu_initiate_boost_trace(rnp);
1329 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1330 : : }
1331 : : }
1332 : :
1333 : : /*
1334 : : * Wake up the per-CPU kthread to invoke RCU callbacks.
1335 : : */
1336 : : static void invoke_rcu_callbacks_kthread(void)
1337 : : {
1338 : : unsigned long flags;
1339 : :
1340 : : local_irq_save(flags);
1341 : : __this_cpu_write(rcu_cpu_has_work, 1);
1342 : : if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1343 : : current != __this_cpu_read(rcu_cpu_kthread_task)) {
1344 : : rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
1345 : : __this_cpu_read(rcu_cpu_kthread_status));
1346 : : }
1347 : : local_irq_restore(flags);
1348 : : }
1349 : :
1350 : : /*
1351 : : * Is the current CPU running the RCU-callbacks kthread?
1352 : : * Caller must have preemption disabled.
1353 : : */
1354 : : static bool rcu_is_callbacks_kthread(void)
1355 : : {
1356 : : return __this_cpu_read(rcu_cpu_kthread_task) == current;
1357 : : }
1358 : :
1359 : : #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
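/*
 * Editor's note, worked example with illustrative values: with
 * CONFIG_RCU_BOOST_DELAY=500 (milliseconds) and HZ=250, the macro above
 * evaluates to DIV_ROUND_UP(500 * 250, 1000) = 125 jiffies, so boosting
 * begins only after the grace period has been stalled for about half a
 * second.
 */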
1360 : :
1361 : : /*
1362 : : * Do priority-boost accounting for the start of a new grace period.
1363 : : */
1364 : : static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1365 : : {
1366 : : rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1367 : : }
1368 : :
1369 : : /*
1370 : : * Create an RCU-boost kthread for the specified node if one does not
1371 : : * already exist. We only create this kthread for preemptible RCU.
1372 : : * Returns zero if all is well, a negated errno otherwise.
1373 : : */
1374 : : static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1375 : : struct rcu_node *rnp)
1376 : : {
1377 : : int rnp_index = rnp - &rsp->node[0];
1378 : : unsigned long flags;
1379 : : struct sched_param sp;
1380 : : struct task_struct *t;
1381 : :
1382 : : if (&rcu_preempt_state != rsp)
1383 : : return 0;
1384 : :
1385 : : if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1386 : : return 0;
1387 : :
1388 : : rsp->boost = 1;
1389 : : if (rnp->boost_kthread_task != NULL)
1390 : : return 0;
1391 : : t = kthread_create(rcu_boost_kthread, (void *)rnp,
1392 : : "rcub/%d", rnp_index);
1393 : : if (IS_ERR(t))
1394 : : return PTR_ERR(t);
1395 : : raw_spin_lock_irqsave(&rnp->lock, flags);
1396 : : smp_mb__after_unlock_lock();
1397 : : rnp->boost_kthread_task = t;
1398 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1399 : : sp.sched_priority = RCU_BOOST_PRIO;
1400 : : sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1401 : : wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1402 : : return 0;
1403 : : }
1404 : :
1405 : : static void rcu_kthread_do_work(void)
1406 : : {
1407 : : rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1408 : : rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1409 : : rcu_preempt_do_callbacks();
1410 : : }
1411 : :
1412 : : static void rcu_cpu_kthread_setup(unsigned int cpu)
1413 : : {
1414 : : struct sched_param sp;
1415 : :
1416 : : sp.sched_priority = RCU_KTHREAD_PRIO;
1417 : : sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1418 : : }
1419 : :
1420 : : static void rcu_cpu_kthread_park(unsigned int cpu)
1421 : : {
1422 : : per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1423 : : }
1424 : :
1425 : : static int rcu_cpu_kthread_should_run(unsigned int cpu)
1426 : : {
1427 : : return __this_cpu_read(rcu_cpu_has_work);
1428 : : }
1429 : :
1430 : : /*
1431 : : * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1432 : : * RCU softirq used in flavors and configurations of RCU that do not
1433 : : * support RCU priority boosting.
1434 : : */
1435 : : static void rcu_cpu_kthread(unsigned int cpu)
1436 : : {
1437 : : unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1438 : : char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1439 : : int spincnt;
1440 : :
1441 : : for (spincnt = 0; spincnt < 10; spincnt++) {
1442 : : trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1443 : : local_bh_disable();
1444 : : *statusp = RCU_KTHREAD_RUNNING;
1445 : : this_cpu_inc(rcu_cpu_kthread_loops);
1446 : : local_irq_disable();
1447 : : work = *workp;
1448 : : *workp = 0;
1449 : : local_irq_enable();
1450 : : if (work)
1451 : : rcu_kthread_do_work();
1452 : : local_bh_enable();
1453 : : if (*workp == 0) {
1454 : : trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1455 : : *statusp = RCU_KTHREAD_WAITING;
1456 : : return;
1457 : : }
1458 : : }
1459 : : *statusp = RCU_KTHREAD_YIELDING;
1460 : : trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1461 : : schedule_timeout_interruptible(2);
1462 : : trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1463 : : *statusp = RCU_KTHREAD_WAITING;
1464 : : }
1465 : :
1466 : : /*
1467 : : * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1468 : : * served by the rcu_node in question. The CPU hotplug lock is still
1469 : : * held, so the value of rnp->qsmaskinit will be stable.
1470 : : *
1471 : : * We don't include outgoingcpu in the affinity set; use -1 if there is
1472 : : * no outgoing CPU. If there are no CPUs left in the affinity set,
1473 : : * this function allows the kthread to execute on any CPU.
1474 : : */
1475 : : static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1476 : : {
1477 : : struct task_struct *t = rnp->boost_kthread_task;
1478 : : unsigned long mask = rnp->qsmaskinit;
1479 : : cpumask_var_t cm;
1480 : : int cpu;
1481 : :
1482 : : if (!t)
1483 : : return;
1484 : : if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1485 : : return;
1486 : : for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1487 : : if ((mask & 0x1) && cpu != outgoingcpu)
1488 : : cpumask_set_cpu(cpu, cm);
1489 : : if (cpumask_weight(cm) == 0) {
1490 : : cpumask_setall(cm);
1491 : : for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1492 : : cpumask_clear_cpu(cpu, cm);
1493 : : WARN_ON_ONCE(cpumask_weight(cm) == 0);
1494 : : }
1495 : : set_cpus_allowed_ptr(t, cm);
1496 : : free_cpumask_var(cm);
1497 : : }
1498 : :
1499 : : static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1500 : : .store = &rcu_cpu_kthread_task,
1501 : : .thread_should_run = rcu_cpu_kthread_should_run,
1502 : : .thread_fn = rcu_cpu_kthread,
1503 : : .thread_comm = "rcuc/%u",
1504 : : .setup = rcu_cpu_kthread_setup,
1505 : : .park = rcu_cpu_kthread_park,
1506 : : };
1507 : :
1508 : : /*
1509 : : * Spawn all kthreads -- called as soon as the scheduler is running.
1510 : : */
1511 : : static int __init rcu_spawn_kthreads(void)
1512 : : {
1513 : : struct rcu_node *rnp;
1514 : : int cpu;
1515 : :
1516 : : rcu_scheduler_fully_active = 1;
1517 : : for_each_possible_cpu(cpu)
1518 : : per_cpu(rcu_cpu_has_work, cpu) = 0;
1519 : : BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1520 : : rnp = rcu_get_root(rcu_state);
1521 : : (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1522 : : if (NUM_RCU_NODES > 1) {
1523 : : rcu_for_each_leaf_node(rcu_state, rnp)
1524 : : (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1525 : : }
1526 : : return 0;
1527 : : }
1528 : : early_initcall(rcu_spawn_kthreads);
1529 : :
1530 : : static void rcu_prepare_kthreads(int cpu)
1531 : : {
1532 : : struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1533 : : struct rcu_node *rnp = rdp->mynode;
1534 : :
1535 : : /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1536 : : if (rcu_scheduler_fully_active)
1537 : : (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1538 : : }
1539 : :
1540 : : #else /* #ifdef CONFIG_RCU_BOOST */
1541 : :
1542 : : static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1543 : : {
1544 : 156644 : raw_spin_unlock_irqrestore(&rnp->lock, flags);
1545 : : }
1546 : :
1547 : 0 : static void invoke_rcu_callbacks_kthread(void)
1548 : : {
1549 [ # # ][ # # ]: 0 : WARN_ON_ONCE(1);
1550 : 0 : }
1551 : :
1552 : : static bool rcu_is_callbacks_kthread(void)
1553 : : {
1554 : : return false;
1555 : : }
1556 : :
1557 : : static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1558 : : {
1559 : : }
1560 : :
1561 : : static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1562 : : {
1563 : : }
1564 : :
1565 : 0 : static int __init rcu_scheduler_really_started(void)
1566 : : {
1567 : 0 : rcu_scheduler_fully_active = 1;
1568 : 0 : return 0;
1569 : : }
1570 : : early_initcall(rcu_scheduler_really_started);
1571 : :
1572 : : static void rcu_prepare_kthreads(int cpu)
1573 : : {
1574 : : }
1575 : :
1576 : : #endif /* #else #ifdef CONFIG_RCU_BOOST */
1577 : :
1578 : : #if !defined(CONFIG_RCU_FAST_NO_HZ)
1579 : :
1580 : : /*
1581 : : * Check to see if any future RCU-related work will need to be done
1582 : : * by the current CPU, even if none need be done immediately, returning
1583 : : * 1 if so. This function is part of the RCU implementation; it is -not-
1584 : : * an exported member of the RCU API.
1585 : : *
1586 : : * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1587 : : * any flavor of RCU.
1588 : : */
1589 : 0 : int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1590 : : {
1591 : 7497087 : *delta_jiffies = ULONG_MAX;
1592 : 7497087 : return rcu_cpu_has_callbacks(cpu, NULL);
1593 : : }
1594 : :
1595 : : /*
1596 : : * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1597 : : * after it.
1598 : : */
1599 : : static void rcu_cleanup_after_idle(int cpu)
1600 : : {
1601 : : }
1602 : :
1603 : : /*
1604 : : * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1605 : : * is nothing.
1606 : : */
1607 : : static void rcu_prepare_for_idle(int cpu)
1608 : : {
1609 : : }
1610 : :
1611 : : /*
1612 : : * Don't bother keeping a running count of the number of RCU callbacks
1613 : : * posted because CONFIG_RCU_FAST_NO_HZ=n.
1614 : : */
1615 : : static void rcu_idle_count_callbacks_posted(void)
1616 : : {
1617 : : }
1618 : :
1619 : : #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1620 : :
1621 : : /*
1622 : : * This code is invoked when a CPU goes idle, at which point we want
1623 : : * to have the CPU do everything required for RCU so that it can enter
1624 : : * the energy-efficient dyntick-idle mode. This is handled by a
1625 : : * state machine implemented by rcu_prepare_for_idle() below.
1626 : : *
1627 : : * The following two preprocessor symbols control this state machine:
1628 : : *
1629 : : * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1630 : : * to sleep in dyntick-idle mode with RCU callbacks pending. This
1631 : : * is sized to be roughly one RCU grace period. Those energy-efficiency
1632 : : * benchmarkers who might otherwise be tempted to set this to a large
1633 : : * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1634 : : * system. And if you are -that- concerned about energy efficiency,
1635 : : * just power the system down and be done with it!
1636 : : * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1637 : : * permitted to sleep in dyntick-idle mode with only lazy RCU
1638 : : * callbacks pending. Setting this too high can OOM your system.
1639 : : *
1640 : : * The values below work well in practice. If future workloads require
1641 : : * adjustment, they can be converted into kernel config parameters, though
1642 : : * making the state machine smarter might be a better option.
1643 : : */
1644 : : #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1645 : : #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1646 : :
1647 : : static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1648 : : module_param(rcu_idle_gp_delay, int, 0644);
1649 : : static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1650 : : module_param(rcu_idle_lazy_gp_delay, int, 0644);
1651 : :
1652 : : extern int tick_nohz_active;
1653 : :
1654 : : /*
1655 : : * Try to advance callbacks for all flavors of RCU on the current CPU, but
1656 : : * only if it has been a while since the last time we did so. Afterwards,
1657 : : * if there are any callbacks ready for immediate invocation, return true.
1658 : : */
1659 : : static bool rcu_try_advance_all_cbs(void)
1660 : : {
1661 : : bool cbs_ready = false;
1662 : : struct rcu_data *rdp;
1663 : : struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1664 : : struct rcu_node *rnp;
1665 : : struct rcu_state *rsp;
1666 : :
1667 : : /* Exit early if we advanced recently. */
1668 : : if (jiffies == rdtp->last_advance_all)
1669 : : return 0;
1670 : : rdtp->last_advance_all = jiffies;
1671 : :
1672 : : for_each_rcu_flavor(rsp) {
1673 : : rdp = this_cpu_ptr(rsp->rda);
1674 : : rnp = rdp->mynode;
1675 : :
1676 : : /*
1677 : : * Don't bother checking unless a grace period has
1678 : : * completed since we last checked and there are
1679 : : * callbacks not yet ready to invoke.
1680 : : */
1681 : : if (rdp->completed != rnp->completed &&
1682 : : rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1683 : : note_gp_changes(rsp, rdp);
1684 : :
1685 : : if (cpu_has_callbacks_ready_to_invoke(rdp))
1686 : : cbs_ready = true;
1687 : : }
1688 : : return cbs_ready;
1689 : : }
1690 : :
1691 : : /*
1692 : : * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1693 : : * to invoke. If the CPU has callbacks, try to advance them. Tell the
1694 : : * caller to set the timeout based on whether or not there are non-lazy
1695 : : * callbacks.
1696 : : *
1697 : : * The caller must have disabled interrupts.
1698 : : */
1699 : : int rcu_needs_cpu(int cpu, unsigned long *dj)
1700 : : {
1701 : : struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1702 : :
1703 : : /* Snapshot to detect later posting of non-lazy callback. */
1704 : : rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1705 : :
1706 : : /* If no callbacks, RCU doesn't need the CPU. */
1707 : : if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1708 : : *dj = ULONG_MAX;
1709 : : return 0;
1710 : : }
1711 : :
1712 : : /* Attempt to advance callbacks. */
1713 : : if (rcu_try_advance_all_cbs()) {
1714 : : /* Some ready to invoke, so initiate later invocation. */
1715 : : invoke_rcu_core();
1716 : : return 1;
1717 : : }
1718 : : rdtp->last_accelerate = jiffies;
1719 : :
1720 : : /* Request timer delay depending on laziness, and round. */
1721 : : if (!rdtp->all_lazy) {
1722 : : *dj = round_up(rcu_idle_gp_delay + jiffies,
1723 : : rcu_idle_gp_delay) - jiffies;
1724 : : } else {
1725 : : *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1726 : : }
1727 : : return 0;
1728 : : }
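/*
 * Illustrative sketch (not part of this file): the delay chosen above is
 * rounded so that CPUs entering dyntick-idle within the same interval
 * arm their timers for the same jiffy, batching wakeups.  The userspace
 * fragment below assumes a round_up() macro equivalent to the kernel's
 * power-of-two version and a non-lazy delay of RCU_IDLE_GP_DELAY == 4.
 */
#include <stdio.h>

#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)	/* y must be a power of two */

int main(void)
{
	unsigned long rcu_idle_gp_delay = 4;	/* RCU_IDLE_GP_DELAY */
	unsigned long jiffies;

	/* CPUs idling at jiffies 101-104 all arm their timer for jiffy 108. */
	for (jiffies = 101; jiffies <= 104; jiffies++)
		printf("idle at %lu -> timer fires at jiffy %lu\n", jiffies,
		       round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay));
	return 0;
}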
1729 : :
1730 : : /*
1731 : : * Prepare a CPU for idle from an RCU perspective. The first major task
1732 : : * is to sense whether nohz mode has been enabled or disabled via sysfs.
1733 : : * The second major task is to check to see if a non-lazy callback has
1734 : : * arrived at a CPU that previously had only lazy callbacks. The third
1735 : : * major task is to accelerate (that is, assign grace-period numbers to)
1736 : : * any recently arrived callbacks.
1737 : : *
1738 : : * The caller must have disabled interrupts.
1739 : : */
1740 : : static void rcu_prepare_for_idle(int cpu)
1741 : : {
1742 : : struct rcu_data *rdp;
1743 : : struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1744 : : struct rcu_node *rnp;
1745 : : struct rcu_state *rsp;
1746 : : int tne;
1747 : :
1748 : : /* Handle nohz enablement switches conservatively. */
1749 : : tne = ACCESS_ONCE(tick_nohz_active);
1750 : : if (tne != rdtp->tick_nohz_enabled_snap) {
1751 : : if (rcu_cpu_has_callbacks(cpu, NULL))
1752 : : invoke_rcu_core(); /* force nohz to see update. */
1753 : : rdtp->tick_nohz_enabled_snap = tne;
1754 : : return;
1755 : : }
1756 : : if (!tne)
1757 : : return;
1758 : :
1759 : : /* If this is a no-CBs CPU, no callbacks, just return. */
1760 : : if (rcu_is_nocb_cpu(cpu))
1761 : : return;
1762 : :
1763 : : /*
1764 : : * If a non-lazy callback arrived at a CPU having only lazy
1765 : : * callbacks, invoke RCU core for the side-effect of recalculating
1766 : : * idle duration on re-entry to idle.
1767 : : */
1768 : : if (rdtp->all_lazy &&
1769 : : rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1770 : : rdtp->all_lazy = false;
1771 : : rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1772 : : invoke_rcu_core();
1773 : : return;
1774 : : }
1775 : :
1776 : : /*
1777 : : * If we have not yet accelerated this jiffy, accelerate all
1778 : : * callbacks on this CPU.
1779 : : */
1780 : : if (rdtp->last_accelerate == jiffies)
1781 : : return;
1782 : : rdtp->last_accelerate = jiffies;
1783 : : for_each_rcu_flavor(rsp) {
1784 : : rdp = per_cpu_ptr(rsp->rda, cpu);
1785 : : if (!*rdp->nxttail[RCU_DONE_TAIL])
1786 : : continue;
1787 : : rnp = rdp->mynode;
1788 : : raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1789 : : smp_mb__after_unlock_lock();
1790 : : rcu_accelerate_cbs(rsp, rnp, rdp);
1791 : : raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1792 : : }
1793 : : }
1794 : :
1795 : : /*
1796 : : * Clean up for exit from idle. Attempt to advance callbacks based on
1797 : : * any grace periods that elapsed while the CPU was idle, and if any
1798 : : * callbacks are now ready to invoke, initiate invocation.
1799 : : */
1800 : : static void rcu_cleanup_after_idle(int cpu)
1801 : : {
1802 : :
1803 : : if (rcu_is_nocb_cpu(cpu))
1804 : : return;
1805 : : if (rcu_try_advance_all_cbs())
1806 : : invoke_rcu_core();
1807 : : }
1808 : :
1809 : : /*
1810 : : * Keep a running count of the number of non-lazy callbacks posted
1811 : : * on this CPU. This running counter (which is never decremented) allows
1812 : : * rcu_prepare_for_idle() to detect when something out of the idle loop
1813 : : * posts a callback, even if an equal number of callbacks are invoked.
1814 : : * Of course, callbacks should only be posted from within a trace event
1815 : : * designed to be called from idle or from within RCU_NONIDLE().
1816 : : */
1817 : : static void rcu_idle_count_callbacks_posted(void)
1818 : : {
1819 : : __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1820 : : }
1821 : :
1822 : : /*
1823 : : * Data for flushing lazy RCU callbacks at OOM time.
1824 : : */
1825 : : static atomic_t oom_callback_count;
1826 : : static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1827 : :
1828 : : /*
1829 : : * RCU OOM callback -- decrement the outstanding count and deliver the
1830 : : * wake-up if we are the last one.
1831 : : */
1832 : : static void rcu_oom_callback(struct rcu_head *rhp)
1833 : : {
1834 : : if (atomic_dec_and_test(&oom_callback_count))
1835 : : wake_up(&oom_callback_wq);
1836 : : }
1837 : :
1838 : : /*
1839 : : * Post an rcu_oom_notify callback on the current CPU if it has at
1840 : : * least one lazy callback. This will unnecessarily post callbacks
1841 : : * to CPUs that already have a non-lazy callback at the end of their
1842 : : * callback list, but this is an infrequent operation, so accept some
1843 : : * extra overhead to keep things simple.
1844 : : */
1845 : : static void rcu_oom_notify_cpu(void *unused)
1846 : : {
1847 : : struct rcu_state *rsp;
1848 : : struct rcu_data *rdp;
1849 : :
1850 : : for_each_rcu_flavor(rsp) {
1851 : : rdp = __this_cpu_ptr(rsp->rda);
1852 : : if (rdp->qlen_lazy != 0) {
1853 : : atomic_inc(&oom_callback_count);
1854 : : rsp->call(&rdp->oom_head, rcu_oom_callback);
1855 : : }
1856 : : }
1857 : : }
1858 : :
1859 : : /*
1860 : : * If low on memory, ensure that each CPU has a non-lazy callback.
1861 : : * This will wake up CPUs that have only lazy callbacks, in turn
1862 : : * ensuring that they free up the corresponding memory in a timely manner.
1863 : : * Because an uncertain amount of memory will be freed in some uncertain
1864 : : * timeframe, we do not claim to have freed anything.
1865 : : */
1866 : : static int rcu_oom_notify(struct notifier_block *self,
1867 : : unsigned long notused, void *nfreed)
1868 : : {
1869 : : int cpu;
1870 : :
1871 : : /* Wait for callbacks from earlier instance to complete. */
1872 : : wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1873 : : smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1874 : :
1875 : : /*
1876 : : * Prevent premature wakeup: ensure that all increments happen
1877 : : * before there is a chance of the counter reaching zero.
1878 : : */
1879 : : atomic_set(&oom_callback_count, 1);
1880 : :
1881 : : get_online_cpus();
1882 : : for_each_online_cpu(cpu) {
1883 : : smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1884 : : cond_resched();
1885 : : }
1886 : : put_online_cpus();
1887 : :
1888 : : /* Unconditionally decrement: no need to wake ourselves up. */
1889 : : atomic_dec(&oom_callback_count);
1890 : :
1891 : : return NOTIFY_OK;
1892 : : }
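/*
 * Illustrative sketch (not part of this file): the "start at 1, increment
 * once per posted callback, drop the initial reference last" pattern above
 * keeps oom_callback_count from reaching zero while callbacks are still
 * being posted.  A minimal userspace model with C11 atomics and invented
 * names (pending, callback_done):
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending;

static void callback_done(void)			/* models rcu_oom_callback() */
{
	if (atomic_fetch_sub(&pending, 1) == 1)
		printf("last callback done, wake the waiter\n");
}

int main(void)
{
	int cpu;

	atomic_store(&pending, 1);		/* bias prevents a premature zero */
	for (cpu = 0; cpu < 4; cpu++)
		atomic_fetch_add(&pending, 1);	/* one callback posted per CPU */
	atomic_fetch_sub(&pending, 1);		/* drop the bias, don't wake self */
	for (cpu = 0; cpu < 4; cpu++)
		callback_done();		/* the wake fires on the last one */
	return 0;
}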
1893 : :
1894 : : static struct notifier_block rcu_oom_nb = {
1895 : : .notifier_call = rcu_oom_notify
1896 : : };
1897 : :
1898 : : static int __init rcu_register_oom_notifier(void)
1899 : : {
1900 : : register_oom_notifier(&rcu_oom_nb);
1901 : : return 0;
1902 : : }
1903 : : early_initcall(rcu_register_oom_notifier);
1904 : :
1905 : : #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1906 : :
1907 : : #ifdef CONFIG_RCU_CPU_STALL_INFO
1908 : :
1909 : : #ifdef CONFIG_RCU_FAST_NO_HZ
1910 : :
1911 : : static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1912 : : {
1913 : : struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1914 : : unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
1915 : :
1916 : : sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
1917 : : rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
1918 : : ulong2long(nlpd),
1919 : : rdtp->all_lazy ? 'L' : '.',
1920 : : rdtp->tick_nohz_enabled_snap ? '.' : 'D');
1921 : : }
1922 : :
1923 : : #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1924 : :
1925 : : static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1926 : : {
1927 : : *cp = '\0';
1928 : : }
1929 : :
1930 : : #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
1931 : :
1932 : : /* Initiate the stall-info list. */
1933 : : static void print_cpu_stall_info_begin(void)
1934 : : {
1935 : : pr_cont("\n");
1936 : : }
1937 : :
1938 : : /*
1939 : : * Print out diagnostic information for the specified stalled CPU.
1940 : : *
1941 : : * If the specified CPU is aware of the current RCU grace period
1942 : : * (flavor specified by rsp), then print the number of scheduling
1943 : : * clock interrupts the CPU has taken during the time that it has
1944 : : * been aware. Otherwise, print the number of RCU grace periods
1945 : : * that this CPU is ignorant of, for example, "1" if the CPU was
1946 : : * aware of the previous grace period.
1947 : : *
1948 : : * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
1949 : : */
1950 : : static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1951 : : {
1952 : : char fast_no_hz[72];
1953 : : struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1954 : : struct rcu_dynticks *rdtp = rdp->dynticks;
1955 : : char *ticks_title;
1956 : : unsigned long ticks_value;
1957 : :
1958 : : if (rsp->gpnum == rdp->gpnum) {
1959 : : ticks_title = "ticks this GP";
1960 : : ticks_value = rdp->ticks_this_gp;
1961 : : } else {
1962 : : ticks_title = "GPs behind";
1963 : : ticks_value = rsp->gpnum - rdp->gpnum;
1964 : : }
1965 : : print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1966 : : pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
1967 : : cpu, ticks_value, ticks_title,
1968 : : atomic_read(&rdtp->dynticks) & 0xfff,
1969 : : rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1970 : : rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1971 : : fast_no_hz);
1972 : : }
1973 : :
1974 : : /* Terminate the stall-info list. */
1975 : : static void print_cpu_stall_info_end(void)
1976 : : {
1977 : : pr_err("\t");
1978 : : }
1979 : :
1980 : : /* Zero ->ticks_this_gp for all flavors of RCU. */
1981 : : static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1982 : : {
1983 : : rdp->ticks_this_gp = 0;
1984 : : rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
1985 : : }
1986 : :
1987 : : /* Increment ->ticks_this_gp for all flavors of RCU. */
1988 : : static void increment_cpu_stall_ticks(void)
1989 : : {
1990 : : struct rcu_state *rsp;
1991 : :
1992 : : for_each_rcu_flavor(rsp)
1993 : : __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
1994 : : }
1995 : :
1996 : : #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
1997 : :
1998 : : static void print_cpu_stall_info_begin(void)
1999 : : {
2000 : 0 : pr_cont(" {");
2001 : : }
2002 : :
2003 : : static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2004 : : {
2005 : 0 : pr_cont(" %d", cpu);
2006 : : }
2007 : :
2008 : : static void print_cpu_stall_info_end(void)
2009 : : {
2010 : 0 : pr_cont("} ");
2011 : : }
2012 : :
2013 : : static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2014 : : {
2015 : : }
2016 : :
2017 : : static void increment_cpu_stall_ticks(void)
2018 : : {
2019 : : }
2020 : :
2021 : : #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2022 : :
2023 : : #ifdef CONFIG_RCU_NOCB_CPU
2024 : :
2025 : : /*
2026 : : * Offload callback processing from the boot-time-specified set of CPUs
2027 : : * specified by rcu_nocb_mask. For each CPU in the set, there is a
2028 : : * kthread created that pulls the callbacks from the corresponding CPU,
2029 : : * waits for a grace period to elapse, and invokes the callbacks.
2030 : : * The no-CBs CPUs do a wake_up() on their kthread when they insert
2031 : : * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2032 : : * has been specified, in which case each kthread actively polls its
2033 : : * CPU. (Which isn't so great for energy efficiency, but which does
2034 : : * reduce RCU's overhead on that CPU.)
2035 : : *
2036 : : * This is intended to be used in conjunction with Frederic Weisbecker's
2037 : : * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2038 : : * running CPU-bound user-mode computations.
2039 : : *
2040 : : * Offloading of callback processing could also in theory be used as
2041 : : * an energy-efficiency measure because CPUs with no RCU callbacks
2042 : : * queued are more aggressive about entering dyntick-idle mode.
2043 : : */
2044 : :
2045 : :
2046 : : /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2047 : : static int __init rcu_nocb_setup(char *str)
2048 : : {
2049 : : alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2050 : : have_rcu_nocb_mask = true;
2051 : : cpulist_parse(str, rcu_nocb_mask);
2052 : : return 1;
2053 : : }
2054 : : __setup("rcu_nocbs=", rcu_nocb_setup);
2055 : :
2056 : : static int __init parse_rcu_nocb_poll(char *arg)
2057 : : {
2058 : : rcu_nocb_poll = 1;
2059 : : return 0;
2060 : : }
2061 : : early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
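/*
 * Example usage (illustrative): booting with "rcu_nocbs=1-7" offloads
 * callback invocation for CPUs 1-7 to "rcuo" kthreads, and adding
 * "rcu_nocb_poll" makes those kthreads poll their CPUs' queues rather
 * than wait to be awakened.
 */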
2062 : :
2063 : : /*
2064 : : * Do any no-CBs CPUs need another grace period?
2065 : : *
2066 : : * Interrupts must be disabled. If the caller does not hold the root
2067 : : * rcu_node structure's ->lock, the results are advisory only.
2068 : : */
2069 : : static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2070 : : {
2071 : : struct rcu_node *rnp = rcu_get_root(rsp);
2072 : :
2073 : : return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2074 : : }
2075 : :
2076 : : /*
2077 : : * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2078 : : * grace period.
2079 : : */
2080 : : static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2081 : : {
2082 : : wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2083 : : }
2084 : :
2085 : : /*
2086 : : * Set the root rcu_node structure's ->need_future_gp field
2087 : : * based on the sum of those of all rcu_node structures. This does
2088 : : * double-count the root rcu_node structure's requests, but this
2089 : : * is necessary to handle the possibility of a rcu_nocb_kthread()
2090 : : * having awakened during the time that the rcu_node structures
2091 : : * were being updated for the end of the previous grace period.
2092 : : */
2093 : : static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2094 : : {
2095 : : rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2096 : : }
2097 : :
2098 : : static void rcu_init_one_nocb(struct rcu_node *rnp)
2099 : : {
2100 : : init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2101 : : init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2102 : : }
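/*
 * Illustrative sketch (not part of this file): need_future_gp[] and
 * nocb_gp_wq[] are both two-element arrays indexed by grace-period
 * parity.  Requests for the grace period after "completed" go into slot
 * (completed + 1) & 0x1; once that grace period ends and "completed"
 * advances, the same slot is drained and then reused for the grace
 * period after next.  A minimal userspace model of the indexing:
 */
#include <stdio.h>

int main(void)
{
	unsigned long completed = 4;		/* grace periods completed so far */
	int need_future_gp[2] = { 0, 0 };

	need_future_gp[(completed + 1) & 0x1] += 2;	/* two waiters for GP 5 */

	completed++;					/* GP 5 completes */
	printf("waiters to wake: %d\n", need_future_gp[completed & 0x1]);
	need_future_gp[completed & 0x1] = 0;		/* slot now free for GP 7 */
	return 0;
}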
2103 : :
2104 : : /* Is the specified CPU a no-CBs CPU? */
2105 : : bool rcu_is_nocb_cpu(int cpu)
2106 : : {
2107 : : if (have_rcu_nocb_mask)
2108 : : return cpumask_test_cpu(cpu, rcu_nocb_mask);
2109 : : return false;
2110 : : }
2111 : :
2112 : : /*
2113 : : * Enqueue the specified string of rcu_head structures onto the specified
2114 : : * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2115 : : * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2116 : : * counts are supplied by rhcount and rhcount_lazy.
2117 : : *
2118 : : * If warranted, also wake up the kthread servicing this CPU's queues.
2119 : : */
2120 : : static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2121 : : struct rcu_head *rhp,
2122 : : struct rcu_head **rhtp,
2123 : : int rhcount, int rhcount_lazy,
2124 : : unsigned long flags)
2125 : : {
2126 : : int len;
2127 : : struct rcu_head **old_rhpp;
2128 : : struct task_struct *t;
2129 : :
2130 : : /* Enqueue the callback on the nocb list and update counts. */
2131 : : old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2132 : : ACCESS_ONCE(*old_rhpp) = rhp;
2133 : : atomic_long_add(rhcount, &rdp->nocb_q_count);
2134 : : atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2135 : :
2136 : : /* If we are not being polled and there is a kthread, awaken it ... */
2137 : : t = ACCESS_ONCE(rdp->nocb_kthread);
2138 : : if (rcu_nocb_poll || !t) {
2139 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2140 : : TPS("WakeNotPoll"));
2141 : : return;
2142 : : }
2143 : : len = atomic_long_read(&rdp->nocb_q_count);
2144 : : if (old_rhpp == &rdp->nocb_head) {
2145 : : if (!irqs_disabled_flags(flags)) {
2146 : : wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
2147 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2148 : : TPS("WakeEmpty"));
2149 : : } else {
2150 : : rdp->nocb_defer_wakeup = true;
2151 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2152 : : TPS("WakeEmptyIsDeferred"));
2153 : : }
2154 : : rdp->qlen_last_fqs_check = 0;
2155 : : } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2156 : : wake_up_process(t); /* ... or if many callbacks queued. */
2157 : : rdp->qlen_last_fqs_check = LONG_MAX / 2;
2158 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2159 : : } else {
2160 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2161 : : }
2162 : : return;
2163 : : }
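/*
 * Illustrative sketch (not part of this file): the enqueue above uses the
 * classic xchg-on-tail pattern -- atomically swing the tail pointer to the
 * new element's ->next field, then store the element through the old tail
 * pointer.  The detach in rcu_nocb_kthread() below is the counterpart:
 * clear the head, swing the tail back to it, and walk the private
 * snapshot.  A minimal single-threaded userspace model with C11 atomics
 * (names such as enqueue() and detach_all() are invented for this sketch):
 */
#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static struct node *head;			/* consumed by one kthread  */
static _Atomic(struct node **) tail = &head;	/* updated by any enqueuer  */

static void enqueue(struct node *n)
{
	struct node **old_tail;

	n->next = NULL;
	old_tail = atomic_exchange(&tail, &n->next);	/* claim the tail slot  */
	*old_tail = n;					/* publish the new node */
}

static struct node *detach_all(void)
{
	struct node *list = head;

	head = NULL;
	(void)atomic_exchange(&tail, &head);	/* later enqueues start afresh */
	return list;
}

int main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 };
	struct node *p;

	enqueue(&a);
	enqueue(&b);
	for (p = detach_all(); p; p = p->next)
		printf("callback %d\n", p->val);
	return 0;
}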
2164 : :
2165 : : /*
2166 : : * This is a helper for __call_rcu(), which invokes this when the normal
2167 : : * callback queue is inoperable. If this is not a no-CBs CPU, this
2168 : : * function returns failure back to __call_rcu(), which can complain
2169 : : * appropriately.
2170 : : *
2171 : : * Otherwise, this function queues the callback where the corresponding
2172 : : * "rcuo" kthread can find it.
2173 : : */
2174 : : static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2175 : : bool lazy, unsigned long flags)
2176 : : {
2177 : :
2178 : : if (!rcu_is_nocb_cpu(rdp->cpu))
2179 : : return 0;
2180 : : __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2181 : : if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2182 : : trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2183 : : (unsigned long)rhp->func,
2184 : : -atomic_long_read(&rdp->nocb_q_count_lazy),
2185 : : -atomic_long_read(&rdp->nocb_q_count));
2186 : : else
2187 : : trace_rcu_callback(rdp->rsp->name, rhp,
2188 : : -atomic_long_read(&rdp->nocb_q_count_lazy),
2189 : : -atomic_long_read(&rdp->nocb_q_count));
2190 : : return 1;
2191 : : }
2192 : :
2193 : : /*
2194 : : * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2195 : : * not a no-CBs CPU.
2196 : : */
2197 : : static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2198 : : struct rcu_data *rdp,
2199 : : unsigned long flags)
2200 : : {
2201 : : long ql = rsp->qlen;
2202 : : long qll = rsp->qlen_lazy;
2203 : :
2204 : : /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2205 : : if (!rcu_is_nocb_cpu(smp_processor_id()))
2206 : : return 0;
2207 : : rsp->qlen = 0;
2208 : : rsp->qlen_lazy = 0;
2209 : :
2210 : : /* First, enqueue the donelist, if any. This preserves CB ordering. */
2211 : : if (rsp->orphan_donelist != NULL) {
2212 : : __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2213 : : rsp->orphan_donetail, ql, qll, flags);
2214 : : ql = qll = 0;
2215 : : rsp->orphan_donelist = NULL;
2216 : : rsp->orphan_donetail = &rsp->orphan_donelist;
2217 : : }
2218 : : if (rsp->orphan_nxtlist != NULL) {
2219 : : __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2220 : : rsp->orphan_nxttail, ql, qll, flags);
2221 : : ql = qll = 0;
2222 : : rsp->orphan_nxtlist = NULL;
2223 : : rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2224 : : }
2225 : : return 1;
2226 : : }
2227 : :
2228 : : /*
2229 : : * If necessary, kick off a new grace period, and either way wait
2230 : : * for a subsequent grace period to complete.
2231 : : */
2232 : : static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2233 : : {
2234 : : unsigned long c;
2235 : : bool d;
2236 : : unsigned long flags;
2237 : : struct rcu_node *rnp = rdp->mynode;
2238 : :
2239 : : raw_spin_lock_irqsave(&rnp->lock, flags);
2240 : : smp_mb__after_unlock_lock();
2241 : : c = rcu_start_future_gp(rnp, rdp);
2242 : : raw_spin_unlock_irqrestore(&rnp->lock, flags);
2243 : :
2244 : : /*
2245 : : * Wait for the grace period. Do so interruptibly to avoid messing
2246 : : * up the load average.
2247 : : */
2248 : : trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2249 : : for (;;) {
2250 : : wait_event_interruptible(
2251 : : rnp->nocb_gp_wq[c & 0x1],
2252 : : (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2253 : : if (likely(d))
2254 : : break;
2255 : : flush_signals(current);
2256 : : trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2257 : : }
2258 : : trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2259 : : smp_mb(); /* Ensure that CB invocation happens after GP end. */
2260 : : }
2261 : :
2262 : : /*
2263 : : * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2264 : : * callbacks queued by the corresponding no-CBs CPU.
2265 : : */
2266 : : static int rcu_nocb_kthread(void *arg)
2267 : : {
2268 : : int c, cl;
2269 : : bool firsttime = 1;
2270 : : struct rcu_head *list;
2271 : : struct rcu_head *next;
2272 : : struct rcu_head **tail;
2273 : : struct rcu_data *rdp = arg;
2274 : :
2275 : : /* Each pass through this loop invokes one batch of callbacks */
2276 : : for (;;) {
2277 : : /* If not polling, wait for next batch of callbacks. */
2278 : : if (!rcu_nocb_poll) {
2279 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2280 : : TPS("Sleep"));
2281 : : wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2282 : : /* Memory barrier provided by xchg() below. */
2283 : : } else if (firsttime) {
2284 : : firsttime = 0;
2285 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2286 : : TPS("Poll"));
2287 : : }
2288 : : list = ACCESS_ONCE(rdp->nocb_head);
2289 : : if (!list) {
2290 : : if (!rcu_nocb_poll)
2291 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2292 : : TPS("WokeEmpty"));
2293 : : schedule_timeout_interruptible(1);
2294 : : flush_signals(current);
2295 : : continue;
2296 : : }
2297 : : firsttime = 1;
2298 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2299 : : TPS("WokeNonEmpty"));
2300 : :
2301 : : /*
2302 : : * Extract queued callbacks, update counts, and wait
2303 : : * for a grace period to elapse.
2304 : : */
2305 : : ACCESS_ONCE(rdp->nocb_head) = NULL;
2306 : : tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2307 : : c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2308 : : cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2309 : : ACCESS_ONCE(rdp->nocb_p_count) += c;
2310 : : ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2311 : : rcu_nocb_wait_gp(rdp);
2312 : :
2313 : : /* Each pass through the following loop invokes a callback. */
2314 : : trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2315 : : c = cl = 0;
2316 : : while (list) {
2317 : : next = list->next;
2318 : : /* Wait for enqueuing to complete, if needed. */
2319 : : while (next == NULL && &list->next != tail) {
2320 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2321 : : TPS("WaitQueue"));
2322 : : schedule_timeout_interruptible(1);
2323 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2324 : : TPS("WokeQueue"));
2325 : : next = list->next;
2326 : : }
2327 : : debug_rcu_head_unqueue(list);
2328 : : local_bh_disable();
2329 : : if (__rcu_reclaim(rdp->rsp->name, list))
2330 : : cl++;
2331 : : c++;
2332 : : local_bh_enable();
2333 : : list = next;
2334 : : }
2335 : : trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2336 : : ACCESS_ONCE(rdp->nocb_p_count) -= c;
2337 : : ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2338 : : rdp->n_nocbs_invoked += c;
2339 : : }
2340 : : return 0;
2341 : : }
2342 : :
2343 : : /* Is a deferred wakeup of rcu_nocb_kthread() required? */
2344 : : static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2345 : : {
2346 : : return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2347 : : }
2348 : :
2349 : : /* Do a deferred wakeup of rcu_nocb_kthread(). */
2350 : : static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2351 : : {
2352 : : if (!rcu_nocb_need_deferred_wakeup(rdp))
2353 : : return;
2354 : : ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2355 : : wake_up(&rdp->nocb_wq);
2356 : : trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2357 : : }
2358 : :
2359 : : /* Initialize per-rcu_data variables for no-CBs CPUs. */
2360 : : static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2361 : : {
2362 : : rdp->nocb_tail = &rdp->nocb_head;
2363 : : init_waitqueue_head(&rdp->nocb_wq);
2364 : : }
2365 : :
2366 : : /* Create a kthread for each RCU flavor for each no-CBs CPU. */
2367 : : static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2368 : : {
2369 : : int cpu;
2370 : : struct rcu_data *rdp;
2371 : : struct task_struct *t;
2372 : :
2373 : : if (rcu_nocb_mask == NULL)
2374 : : return;
2375 : : for_each_cpu(cpu, rcu_nocb_mask) {
2376 : : rdp = per_cpu_ptr(rsp->rda, cpu);
2377 : : t = kthread_run(rcu_nocb_kthread, rdp,
2378 : : "rcuo%c/%d", rsp->abbr, cpu);
2379 : : BUG_ON(IS_ERR(t));
2380 : : ACCESS_ONCE(rdp->nocb_kthread) = t;
2381 : : }
2382 : : }
2383 : :
2384 : : /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2385 : : static bool init_nocb_callback_list(struct rcu_data *rdp)
2386 : : {
2387 : : if (rcu_nocb_mask == NULL ||
2388 : : !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2389 : : return false;
2390 : : rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2391 : : return true;
2392 : : }
2393 : :
2394 : : #else /* #ifdef CONFIG_RCU_NOCB_CPU */
2395 : :
2396 : : static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2397 : : {
2398 : : return 0;
2399 : : }
2400 : :
2401 : : static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2402 : : {
2403 : : }
2404 : :
2405 : : static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2406 : : {
2407 : : }
2408 : :
2409 : : static void rcu_init_one_nocb(struct rcu_node *rnp)
2410 : : {
2411 : : }
2412 : :
2413 : : static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2414 : : bool lazy, unsigned long flags)
2415 : : {
2416 : : return 0;
2417 : : }
2418 : :
2419 : : static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2420 : : struct rcu_data *rdp,
2421 : : unsigned long flags)
2422 : : {
2423 : : return 0;
2424 : : }
2425 : :
2426 : : static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2427 : : {
2428 : : }
2429 : :
2430 : : static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2431 : : {
2432 : : return false;
2433 : : }
2434 : :
2435 : : static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2436 : : {
2437 : : }
2438 : :
2439 : : static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2440 : : {
2441 : : }
2442 : :
2443 : : static bool init_nocb_callback_list(struct rcu_data *rdp)
2444 : : {
2445 : : return false;
2446 : : }
2447 : :
2448 : : #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2449 : :
2450 : : /*
2451 : : * An adaptive-ticks CPU can potentially execute in kernel mode for an
2452 : : * arbitrarily long period of time with the scheduling-clock tick turned
2453 : : * off. RCU will be paying attention to this CPU because it is in the
2454 : : * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2455 : : * machine because the scheduling-clock tick has been disabled. Therefore,
2456 : : * if an adaptive-ticks CPU is failing to respond to the current grace
2457 : : * period and has not been idle from an RCU perspective, kick it.
2458 : : */
2459 : : static void rcu_kick_nohz_cpu(int cpu)
2460 : : {
2461 : : #ifdef CONFIG_NO_HZ_FULL
2462 : : if (tick_nohz_full_cpu(cpu))
2463 : : smp_send_reschedule(cpu);
2464 : : #endif /* #ifdef CONFIG_NO_HZ_FULL */
2465 : : }
2466 : :
2467 : :
2468 : : #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2469 : :
2470 : : /*
2471 : : * Define RCU flavor that holds sysidle state. This needs to be the
2472 : : * most active flavor of RCU.
2473 : : */
2474 : : #ifdef CONFIG_PREEMPT_RCU
2475 : : static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2476 : : #else /* #ifdef CONFIG_PREEMPT_RCU */
2477 : : static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2478 : : #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2479 : :
2480 : : static int full_sysidle_state; /* Current system-idle state. */
2481 : : #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2482 : : #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2483 : : #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2484 : : #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2485 : : #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2486 : :
2487 : : /*
2488 : : * Invoked to note exit from irq or task transition to idle. Note that
2489 : : * usermode execution does -not- count as idle here! After all, we want
2490 : : * to detect full-system idle states, not RCU quiescent states and grace
2491 : : * periods. The caller must have disabled interrupts.
2492 : : */
2493 : : static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2494 : : {
2495 : : unsigned long j;
2496 : :
2497 : : /* Adjust nesting, check for fully idle. */
2498 : : if (irq) {
2499 : : rdtp->dynticks_idle_nesting--;
2500 : : WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2501 : : if (rdtp->dynticks_idle_nesting != 0)
2502 : : return; /* Still not fully idle. */
2503 : : } else {
2504 : : if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2505 : : DYNTICK_TASK_NEST_VALUE) {
2506 : : rdtp->dynticks_idle_nesting = 0;
2507 : : } else {
2508 : : rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2509 : : WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2510 : : return; /* Still not fully idle. */
2511 : : }
2512 : : }
2513 : :
2514 : : /* Record start of fully idle period. */
2515 : : j = jiffies;
2516 : : ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2517 : : smp_mb__before_atomic_inc();
2518 : : atomic_inc(&rdtp->dynticks_idle);
2519 : : smp_mb__after_atomic_inc();
2520 : : WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2521 : : }
2522 : :
2523 : : /*
2524 : : * Unconditionally force exit from full system-idle state. This is
2525 : : * invoked when a normal CPU exits idle, but must be called separately
2526 : : * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2527 : : * is that the timekeeping CPU is permitted to take scheduling-clock
2528 : : * interrupts while the system is in system-idle state, and of course
2529 : : * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2530 : : * interrupt from any other type of interrupt.
2531 : : */
2532 : : void rcu_sysidle_force_exit(void)
2533 : : {
2534 : : int oldstate = ACCESS_ONCE(full_sysidle_state);
2535 : : int newoldstate;
2536 : :
2537 : : /*
2538 : : * Each pass through the following loop attempts to exit full
2539 : : * system-idle state. If contention proves to be a problem,
2540 : : * a trylock-based contention tree could be used here.
2541 : : */
2542 : : while (oldstate > RCU_SYSIDLE_SHORT) {
2543 : : newoldstate = cmpxchg(&full_sysidle_state,
2544 : : oldstate, RCU_SYSIDLE_NOT);
2545 : : if (oldstate == newoldstate &&
2546 : : oldstate == RCU_SYSIDLE_FULL_NOTED) {
2547 : : rcu_kick_nohz_cpu(tick_do_timer_cpu);
2548 : : return; /* We cleared it, done! */
2549 : : }
2550 : : oldstate = newoldstate;
2551 : : }
2552 : : smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2553 : : }
2554 : :
2555 : : /*
2556 : : * Invoked to note entry to irq or task transition from idle. Note that
2557 : : * usermode execution does -not- count as idle here! The caller must
2558 : : * have disabled interrupts.
2559 : : */
2560 : : static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2561 : : {
2562 : : /* Adjust nesting, check for already non-idle. */
2563 : : if (irq) {
2564 : : rdtp->dynticks_idle_nesting++;
2565 : : WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2566 : : if (rdtp->dynticks_idle_nesting != 1)
2567 : : return; /* Already non-idle. */
2568 : : } else {
2569 : : /*
2570 : : * Allow for irq misnesting. Yes, it really is possible
2571 : : * to enter an irq handler then never leave it, and maybe
2572 : : * also vice versa. Handle both possibilities.
2573 : : */
2574 : : if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2575 : : rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2576 : : WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2577 : : return; /* Already non-idle. */
2578 : : } else {
2579 : : rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2580 : : }
2581 : : }
2582 : :
2583 : : /* Record end of idle period. */
2584 : : smp_mb__before_atomic_inc();
2585 : : atomic_inc(&rdtp->dynticks_idle);
2586 : : smp_mb__after_atomic_inc();
2587 : : WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2588 : :
2589 : : /*
2590 : : * If we are the timekeeping CPU, we are permitted to be non-idle
2591 : : * during a system-idle state. This must be the case, because
2592 : : * the timekeeping CPU has to take scheduling-clock interrupts
2593 : : * during the time that the system is transitioning to full
2594 : : * system-idle state. This means that the timekeeping CPU must
2595 : : * invoke rcu_sysidle_force_exit() directly if it does anything
2596 : : * more than take a scheduling-clock interrupt.
2597 : : */
2598 : : if (smp_processor_id() == tick_do_timer_cpu)
2599 : : return;
2600 : :
2601 : : /* Update system-idle state: We are clearly no longer fully idle! */
2602 : : rcu_sysidle_force_exit();
2603 : : }
2604 : :
2605 : : /*
2606 : : * Check to see if the current CPU is idle. Note that usermode execution
2607 : : * does not count as idle. The caller must have disabled interrupts.
2608 : : */
2609 : : static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2610 : : unsigned long *maxj)
2611 : : {
2612 : : int cur;
2613 : : unsigned long j;
2614 : : struct rcu_dynticks *rdtp = rdp->dynticks;
2615 : :
2616 : : /*
2617 : : * If some other CPU has already reported non-idle, if this is
2618 : : * not the flavor of RCU that tracks sysidle state, or if this
2619 : : * is an offline CPU or the timekeeping CPU, nothing to do.
2620 : : */
2621 : : if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2622 : : cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2623 : : return;
2624 : : if (rcu_gp_in_progress(rdp->rsp))
2625 : : WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2626 : :
2627 : : /* Pick up current idle and NMI-nesting counter and check. */
2628 : : cur = atomic_read(&rdtp->dynticks_idle);
2629 : : if (cur & 0x1) {
2630 : : *isidle = false; /* We are not idle! */
2631 : : return;
2632 : : }
2633 : : smp_mb(); /* Read counters before timestamps. */
2634 : :
2635 : : /* Pick up timestamps. */
2636 : : j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2637 : : /* If this CPU entered idle more recently, update maxj timestamp. */
2638 : : if (ULONG_CMP_LT(*maxj, j))
2639 : : *maxj = j;
2640 : : }
2641 : :
2642 : : /*
2643 : : * Is this the flavor of RCU that is handling full-system idle?
2644 : : */
2645 : : static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2646 : : {
2647 : : return rsp == rcu_sysidle_state;
2648 : : }
2649 : :
2650 : : /*
2651 : : * Bind the grace-period kthread for the sysidle flavor of RCU to the
2652 : : * timekeeping CPU.
2653 : : */
2654 : : static void rcu_bind_gp_kthread(void)
2655 : : {
2656 : : int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2657 : :
2658 : : if (cpu < 0 || cpu >= nr_cpu_ids)
2659 : : return;
2660 : : if (raw_smp_processor_id() != cpu)
2661 : : set_cpus_allowed_ptr(current, cpumask_of(cpu));
2662 : : }
2663 : :
2664 : : /*
2665 : : * Return a delay in jiffies based on the number of CPUs, rcu_node
2666 : : * leaf fanout, and jiffies tick rate. The idea is to allow larger
2667 : : * systems more time to transition to full-idle state in order to
2668 : : * avoid the cache thrashing that would otherwise occur on the state variable.
2669 : : * Really small systems (less than a couple of tens of CPUs) should
2670 : : * instead use a single global atomically incremented counter, and later
2671 : : * versions of this will automatically reconfigure themselves accordingly.
2672 : : */
2673 : : static unsigned long rcu_sysidle_delay(void)
2674 : : {
2675 : : if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2676 : : return 0;
2677 : : return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2678 : : }
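/*
 * Worked example (illustrative, not part of this file): with
 * nr_cpu_ids = 256, rcu_fanout_leaf = 16, and HZ = 1000, the delay above
 * is DIV_ROUND_UP(256 * 1000, 16 * 1000) = 16 jiffies between advances
 * of the full-system-idle state machine.
 */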
2679 : :
2680 : : /*
2681 : : * Advance the full-system-idle state. This is invoked when all of
2682 : : * the non-timekeeping CPUs are idle.
2683 : : */
2684 : : static void rcu_sysidle(unsigned long j)
2685 : : {
2686 : : /* Check the current state. */
2687 : : switch (ACCESS_ONCE(full_sysidle_state)) {
2688 : : case RCU_SYSIDLE_NOT:
2689 : :
2690 : : /* First time all are idle, so note a short idle period. */
2691 : : ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2692 : : break;
2693 : :
2694 : : case RCU_SYSIDLE_SHORT:
2695 : :
2696 : : /*
2697 : : * Idle for a bit, time to advance to next state?
2698 : : * cmpxchg failure means race with non-idle, let them win.
2699 : : */
2700 : : if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2701 : : (void)cmpxchg(&full_sysidle_state,
2702 : : RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2703 : : break;
2704 : :
2705 : : case RCU_SYSIDLE_LONG:
2706 : :
2707 : : /*
2708 : : * Do an additional check pass before advancing to full.
2709 : : * cmpxchg failure means race with non-idle, let them win.
2710 : : */
2711 : : if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2712 : : (void)cmpxchg(&full_sysidle_state,
2713 : : RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2714 : : break;
2715 : :
2716 : : default:
2717 : : break;
2718 : : }
2719 : : }
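/*
 * Illustrative sketch (not part of this file): the SHORT->LONG and
 * LONG->FULL transitions above use cmpxchg() so that a racing
 * rcu_sysidle_cancel() always wins.  A minimal userspace model of the
 * same "advance only if still in the expected state" idiom, using C11
 * atomics and invented state names:
 */
#include <stdatomic.h>
#include <stdio.h>

enum { IDLE_NOT, IDLE_SHORT, IDLE_LONG, IDLE_FULL };

static _Atomic int state = IDLE_NOT;

static void try_advance(int old, int newstate)
{
	/* Fails silently if somebody else already changed the state. */
	(void)atomic_compare_exchange_strong(&state, &old, newstate);
}

int main(void)
{
	try_advance(IDLE_NOT, IDLE_SHORT);	/* succeeds: NOT -> SHORT  */
	atomic_store(&state, IDLE_NOT);		/* a non-idle CPU cancels  */
	try_advance(IDLE_SHORT, IDLE_LONG);	/* fails: the cancel won   */
	printf("state = %d\n", atomic_load(&state));	/* prints 0 (IDLE_NOT) */
	return 0;
}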
2720 : :
2721 : : /*
2722 : : * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2723 : : * back to the beginning.
2724 : : */
2725 : : static void rcu_sysidle_cancel(void)
2726 : : {
2727 : : smp_mb();
2728 : : ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2729 : : }
2730 : :
2731 : : /*
2732 : : * Update the sysidle state based on the results of a force-quiescent-state
2733 : : * scan of the CPUs' dyntick-idle state.
2734 : : */
2735 : : static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2736 : : unsigned long maxj, bool gpkt)
2737 : : {
2738 : : if (rsp != rcu_sysidle_state)
2739 : : return; /* Wrong flavor, ignore. */
2740 : : if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2741 : : return; /* Running state machine from timekeeping CPU. */
2742 : : if (isidle)
2743 : : rcu_sysidle(maxj); /* More idle! */
2744 : : else
2745 : : rcu_sysidle_cancel(); /* Idle is over. */
2746 : : }
2747 : :
2748 : : /*
2749 : : * Wrapper for rcu_sysidle_report() when called from the grace-period
2750 : : * kthread's context.
2751 : : */
2752 : : static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2753 : : unsigned long maxj)
2754 : : {
2755 : : rcu_sysidle_report(rsp, isidle, maxj, true);
2756 : : }
2757 : :
2758 : : /* Callback and function for forcing an RCU grace period. */
2759 : : struct rcu_sysidle_head {
2760 : : struct rcu_head rh;
2761 : : int inuse;
2762 : : };
2763 : :
2764 : : static void rcu_sysidle_cb(struct rcu_head *rhp)
2765 : : {
2766 : : struct rcu_sysidle_head *rshp;
2767 : :
2768 : : /*
2769 : : * The following memory barrier is needed to replace the
2770 : : * memory barriers that would normally be in the memory
2771 : : * allocator.
2772 : : */
2773 : : smp_mb(); /* grace period precedes setting inuse. */
2774 : :
2775 : : rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2776 : : ACCESS_ONCE(rshp->inuse) = 0;
2777 : : }
2778 : :
2779 : : /*
2780 : : * Check to see if the system is fully idle, other than the timekeeping CPU.
2781 : : * The caller must have disabled interrupts.
2782 : : */
2783 : : bool rcu_sys_is_idle(void)
2784 : : {
2785 : : static struct rcu_sysidle_head rsh;
2786 : : int rss = ACCESS_ONCE(full_sysidle_state);
2787 : :
2788 : : if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2789 : : return false;
2790 : :
2791 : : /* Handle small-system case by doing a full scan of CPUs. */
2792 : : if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2793 : : int oldrss = rss - 1;
2794 : :
2795 : : /*
2796 : : * One pass to advance to each state up to _FULL.
2797 : : * Give up if any pass fails to advance the state.
2798 : : */
2799 : : while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2800 : : int cpu;
2801 : : bool isidle = true;
2802 : : unsigned long maxj = jiffies - ULONG_MAX / 4;
2803 : : struct rcu_data *rdp;
2804 : :
2805 : : /* Scan all the CPUs looking for nonidle CPUs. */
2806 : : for_each_possible_cpu(cpu) {
2807 : : rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2808 : : rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2809 : : if (!isidle)
2810 : : break;
2811 : : }
2812 : : rcu_sysidle_report(rcu_sysidle_state,
2813 : : isidle, maxj, false);
2814 : : oldrss = rss;
2815 : : rss = ACCESS_ONCE(full_sysidle_state);
2816 : : }
2817 : : }
2818 : :
2819 : : /* If this is the first observation of an idle period, record it. */
2820 : : if (rss == RCU_SYSIDLE_FULL) {
2821 : : rss = cmpxchg(&full_sysidle_state,
2822 : : RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2823 : : return rss == RCU_SYSIDLE_FULL;
2824 : : }
2825 : :
2826 : : smp_mb(); /* ensure rss load happens before later caller actions. */
2827 : :
2828 : : /* If already fully idle, tell the caller (in case of races). */
2829 : : if (rss == RCU_SYSIDLE_FULL_NOTED)
2830 : : return true;
2831 : :
2832 : : /*
2833 : : * If we aren't there yet, and a grace period is not in flight,
2834 : : * initiate a grace period. Either way, tell the caller that
2835 : : * we are not there yet. We use an xchg() rather than an assignment
2836 : : * to make up for the memory barriers that would otherwise be
2837 : : * provided by the memory allocator.
2838 : : */
2839 : : if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2840 : : !rcu_gp_in_progress(rcu_sysidle_state) &&
2841 : : !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2842 : : call_rcu(&rsh.rh, rcu_sysidle_cb);
2843 : : return false;
2844 : : }
2845 : :
2846 : : /*
2847 : : * Initialize dynticks sysidle state for CPUs coming online.
2848 : : */
2849 : : static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2850 : : {
2851 : : rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2852 : : }
2853 : :
2854 : : #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2855 : :
2856 : : static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2857 : : {
2858 : : }
2859 : :
2860 : : static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2861 : : {
2862 : : }
2863 : :
2864 : : static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2865 : : unsigned long *maxj)
2866 : : {
2867 : : }
2868 : :
2869 : : static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2870 : : {
2871 : : return false;
2872 : : }
2873 : :
2874 : : static void rcu_bind_gp_kthread(void)
2875 : : {
2876 : : }
2877 : :
2878 : : static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2879 : : unsigned long maxj)
2880 : : {
2881 : : }
2882 : :
2883 : : static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2884 : : {
2885 : : }
2886 : :
2887 : : #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2888 : :
2889 : : /*
2890 : : * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
2891 : : * grace-period kthread will do force_quiescent_state() processing?
2892 : : * The idea is to avoid waking up RCU core processing on such a
2893 : : * CPU unless the grace period has extended for too long.
2894 : : *
2895 : : * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 : : * CONFIG_RCU_NOCB_CPUs.
2897 : : */
2898 : : static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899 : : {
2900 : : #ifdef CONFIG_NO_HZ_FULL
2901 : : if (tick_nohz_full_cpu(smp_processor_id()) &&
2902 : : (!rcu_gp_in_progress(rsp) ||
2903 : : ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
2904 : : return 1;
2905 : : #endif /* #ifdef CONFIG_NO_HZ_FULL */
2906 : : return 0;
2907 : : }
|