Branch data Line data Source code
1 : : /*
2 : : * menu.c - the menu idle governor
3 : : *
4 : : * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
5 : : * Copyright (C) 2009 Intel Corporation
6 : : * Author:
7 : : * Arjan van de Ven <arjan@linux.intel.com>
8 : : *
       9                 :            :  * This code is licensed under the GPL version 2 as described
      10                 :            :  * in the COPYING file that accompanies the Linux Kernel.
11 : : */
12 : :
13 : : #include <linux/kernel.h>
14 : : #include <linux/cpuidle.h>
15 : : #include <linux/pm_qos.h>
16 : : #include <linux/time.h>
17 : : #include <linux/ktime.h>
18 : : #include <linux/hrtimer.h>
19 : : #include <linux/tick.h>
20 : : #include <linux/sched.h>
21 : : #include <linux/math64.h>
22 : : #include <linux/module.h>
23 : :
24 : : /*
25 : : * Please note when changing the tuning values:
26 : : * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of
      27                 :            :  * the scaling multiplication may overflow on 32 bit platforms.
      28                 :            :  * In that case, #define RESOLUTION as ULL to get a 64 bit result:
29 : : * #define RESOLUTION 1024ULL
30 : : *
31 : : * The default values do not overflow.
32 : : */
33 : : #define BUCKETS 12
34 : : #define INTERVALS 8
35 : : #define RESOLUTION 1024
36 : : #define DECAY 8
37 : : #define MAX_INTERESTING 50000
38 : : #define STDDEV_THRESH 400
39 : :
40 : :
41 : : /*
42 : : * Concepts and ideas behind the menu governor
43 : : *
44 : : * For the menu governor, there are 3 decision factors for picking a C
45 : : * state:
46 : : * 1) Energy break even point
47 : : * 2) Performance impact
      48                 :            :  * 3) Latency tolerance (from the PM QoS infrastructure)
      49                 :            :  * These three factors are treated independently.
50 : : *
51 : : * Energy break even point
52 : : * -----------------------
53 : : * C state entry and exit have an energy cost, and a certain amount of time in
54 : : * the C state is required to actually break even on this cost. CPUIDLE
55 : : * provides us this duration in the "target_residency" field. So all that we
56 : : * need is a good prediction of how long we'll be idle. Like the traditional
57 : : * menu governor, we start with the actual known "next timer event" time.
58 : : *
      59                 :            :  * Since there are other sources of wakeups (interrupts for example) than
60 : : * the next timer event, this estimation is rather optimistic. To get a
61 : : * more realistic estimate, a correction factor is applied to the estimate,
62 : : * that is based on historic behavior. For example, if in the past the actual
63 : : * duration always was 50% of the next timer tick, the correction factor will
64 : : * be 0.5.
65 : : *
      66                 :            :  * menu uses a running average for this correction factor; however, it uses a
67 : : * set of factors, not just a single factor. This stems from the realization
68 : : * that the ratio is dependent on the order of magnitude of the expected
69 : : * duration; if we expect 500 milliseconds of idle time the likelihood of
      70                 :            :  * getting an interrupt very early is much higher than if we expect 50
      71                 :            :  * microseconds of idle time. A second independent factor that has a big
      72                 :            :  * impact on the actual factor is whether there is (disk) IO outstanding.
73 : : * (as a special twist, we consider every sleep longer than 50 milliseconds
74 : : * as perfect; there are no power gains for sleeping longer than this)
75 : : *
76 : : * For these two reasons we keep an array of 12 independent factors, that gets
77 : : * indexed based on the magnitude of the expected duration as well as the
78 : : * "is IO outstanding" property.
79 : : *
80 : : * Repeatable-interval-detector
81 : : * ----------------------------
82 : : * There are some cases where "next timer" is a completely unusable predictor:
      83                 :            :  * those cases where the interval is fixed, for example due to hardware
      84                 :            :  * interrupt mitigation or to fixed transfer rate devices such as
      85                 :            :  * mice.
86 : : * For this, we use a different predictor: We track the duration of the last 8
      87                 :            :  * intervals and if the standard deviation of these 8 intervals is below a
88 : : * threshold value, we use the average of these intervals as prediction.
89 : : *
90 : : * Limiting Performance Impact
91 : : * ---------------------------
      92                 :            :  * C states, especially those with large exit latencies, can have a very
      93                 :            :  * noticeable impact on workloads, which is not acceptable for most
      94                 :            :  * sysadmins, and in addition, lost performance has a power price of its own.
95 : : *
96 : : * As a general rule of thumb, menu assumes that the following heuristic
97 : : * holds:
98 : : * The busier the system, the less impact of C states is acceptable
99 : : *
100 : : * This rule-of-thumb is implemented using a performance-multiplier:
101 : : * If the exit latency times the performance multiplier is longer than
102 : : * the predicted duration, the C state is not considered a candidate
     103                 :            :  * for selection because its performance impact is too high. So the higher
104 : : * this multiplier is, the longer we need to be idle to pick a deep C
105 : : * state, and thus the less likely a busy CPU will hit such a deep
106 : : * C state.
107 : : *
     108                 :            :  * Two factors are used in determining this multiplier:
     109                 :            :  * a value of 10 is added for each point of "per cpu load average" we have;
     110                 :            :  * a value of 10 points is added for each process that is waiting for
     111                 :            :  * IO on this CPU.
     112                 :            :  * (these values are experimentally determined)
113 : : *
114 : : * The load average factor gives a longer term (few seconds) input to the
     115                 :            :  * decision, while the iowait value gives a CPU-local, instantaneous input.
116 : : * The iowait factor may look low, but realize that this is also already
117 : : * represented in the system load average.
118 : : *
119 : : */
120 : :
121 : : struct menu_device {
122 : : int last_state_idx;
123 : : int needs_update;
124 : :
125 : : unsigned int expected_us;
126 : : unsigned int predicted_us;
127 : : unsigned int exit_us;
128 : : unsigned int bucket;
129 : : unsigned int correction_factor[BUCKETS];
130 : : unsigned int intervals[INTERVALS];
131 : : int interval_ptr;
132 : : };
133 : :
134 : :
135 : : #define LOAD_INT(x) ((x) >> FSHIFT)
136 : : #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
137 : :
138 : : static int get_loadavg(void)
139 : : {
140 : : unsigned long this = this_cpu_load();
141 : :
142 : :
143 : : return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
144 : : }
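
LOAD_INT() extracts the integer part of an FSHIFT fixed-point load value and
LOAD_FRAC() its fractional part as a percentage, so get_loadavg() returns
roughly ten times the per-cpu load average. A minimal user-space sketch of
the same arithmetic (FSHIFT/FIXED_1 follow the kernel's convention; the
sample load value is made up):

    #include <stdio.h>

    #define FSHIFT  11                  /* bits of fixed-point fraction */
    #define FIXED_1 (1 << FSHIFT)

    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    int main(void)
    {
        /* a load average of 2.35 encoded in fixed point */
        unsigned long load = 2 * FIXED_1 + 35 * FIXED_1 / 100;

        /* same expression as get_loadavg(): 2 * 10 + 34 / 10 = 23 */
        printf("%d\n", LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10);
        return 0;
    }
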
145 : :
146 : : static inline int which_bucket(unsigned int duration)
147 : : {
148 : : int bucket = 0;
149 : :
150 : : /*
     151                 :            :          * We keep two groups of stats; one with no
     152                 :            :          * IO pending, one with.
153 : : * This allows us to calculate
154 : : * E(duration)|iowait
155 : : */
156 [ + + ]: 5637432 : if (nr_iowait_cpu(smp_processor_id()))
157 : : bucket = BUCKETS/2;
158 : :
159 [ + + ]: 5637597 : if (duration < 10)
160 : : return bucket;
161 [ + + ]: 5633297 : if (duration < 100)
162 : 136981 : return bucket + 1;
163 [ + + ]: 5496316 : if (duration < 1000)
164 : 284891 : return bucket + 2;
165 [ + + ]: 5211425 : if (duration < 10000)
166 : 3078426 : return bucket + 3;
167 [ + + ]: 2132999 : if (duration < 100000)
168 : 927511 : return bucket + 4;
169 : 1205488 : return bucket + 5;
170 : : }
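
The bucket index therefore encodes the decade of the expected duration (one
step per power of ten from 10 us up, capped at 100 ms), offset by BUCKETS/2
when IO is pending. A user-space sketch of the equivalent mapping, with
io_pending standing in for the nr_iowait_cpu() check:

    #include <stdio.h>

    #define BUCKETS 12

    static int which_bucket(unsigned int duration_us, int io_pending)
    {
        int bucket = io_pending ? BUCKETS / 2 : 0;
        unsigned int bound = 10;

        /* one extra bucket per decade, capped at bucket + 5 */
        while (bound <= 100000 && duration_us >= bound) {
            bucket++;
            bound *= 10;
        }
        return bucket;
    }

    int main(void)
    {
        /* 250 us expected idle with IO pending: 6 + 2 = 8 */
        printf("%d\n", which_bucket(250, 1));
        return 0;
    }
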
171 : :
172 : : /*
173 : : * Return a multiplier for the exit latency that is intended
174 : : * to take performance requirements into account.
175 : : * The more performance critical we estimate the system
176 : : * to be, the higher this multiplier, and thus the higher
177 : : * the barrier to go to an expensive C state.
178 : : */
179 : : static inline int performance_multiplier(void)
180 : : {
181 : : int mult = 1;
182 : :
183 : : /* for higher loadavg, we are more reluctant */
184 : :
185 : : /*
186 : : * this doesn't work as intended - it is almost always 0, but can
187 : : * sometimes, depending on workload, spike very high into the hundreds
188 : : * even when the average cpu load is under 10%.
189 : : */
190 : : /* mult += 2 * get_loadavg(); */
191 : :
     192                 :            :         /* for IO wait tasks (per cpu!) we add 10 points each */
193 : 5637597 : mult += 10 * nr_iowait_cpu(smp_processor_id());
194 : :
195 : : return mult;
196 : : }
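
The multiplier gates state selection in menu_select() below: a state is
skipped when its exit latency times the multiplier exceeds the predicted
idle time. A small sketch with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        /* two tasks in iowait on this CPU: mult = 1 + 10 * 2 = 21 */
        int mult = 1 + 10 * 2;
        unsigned int exit_latency = 100;        /* us, hypothetical state */
        unsigned int predicted_us = 1500;

        if (exit_latency * mult > predicted_us) /* 2100 > 1500 */
            printf("state rejected under load\n");
        return 0;
    }
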
197 : :
198 : : static DEFINE_PER_CPU(struct menu_device, menu_devices);
199 : :
200 : : static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
201 : :
202 : : /* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
203 : 0 : static u64 div_round64(u64 dividend, u32 divisor)
204 : : {
205 : 11274798 : return div_u64(dividend + (divisor / 2), divisor);
206 : : }
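
menu_select() below feeds div_round64() the product of the expected time and
the fixed-point correction factor. A user-space sketch of that scaling
(sample values are illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    #define RESOLUTION 1024
    #define DECAY 8

    /* DIV_ROUND_CLOSEST; the kernel's div_u64() avoids a slow 64-bit
     * division on 32-bit platforms */
    static uint64_t div_round64(uint64_t dividend, uint32_t divisor)
    {
        return (dividend + divisor / 2) / divisor;
    }

    int main(void)
    {
        uint32_t expected_us = 1000;
        /* factors are stored scaled by RESOLUTION * DECAY, so 4096
         * encodes a historic measured/expected ratio of 0.5 */
        uint32_t factor = RESOLUTION * DECAY / 2;

        printf("predicted %llu us\n",   /* prints "predicted 500 us" */
               (unsigned long long)div_round64(
                       (uint64_t)expected_us * factor, RESOLUTION * DECAY));
        return 0;
    }
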
207 : :
208 : : /*
209 : : * Try detecting repeating patterns by keeping track of the last 8
210 : : * intervals, and checking if the standard deviation of that set
211 : : * of points is below a threshold. If it is... then use the
212 : : * average of these 8 points as the estimated value.
213 : : */
214 : 5635678 : static void get_typical_interval(struct menu_device *data)
215 : : {
216 : : int i, divisor;
217 : : unsigned int max, thresh;
218 : : uint64_t avg, stddev;
219 : :
220 : : thresh = UINT_MAX; /* Discard outliers above this value */
221 : :
222 : : again:
223 : :
224 : : /* First calculate the average of past intervals */
225 : : max = 0;
226 : : avg = 0;
227 : : divisor = 0;
228 [ + + ]: 111751569 : for (i = 0; i < INTERVALS; i++) {
229 : 99334112 : unsigned int value = data->intervals[i];
230 [ + + ]: 99334112 : if (value <= thresh) {
231 : 89058523 : avg += value;
232 : 89058523 : divisor++;
233 [ + + ]: 89058523 : if (value > max)
234 : : max = value;
235 : : }
236 : : }
237 [ - + ][ # # ]: 12417457 : do_div(avg, divisor);
238 : :
239 : : /* Then try to determine standard deviation */
240 : : stddev = 0;
241 [ + + ]: 111752953 : for (i = 0; i < INTERVALS; i++) {
242 : 99335496 : unsigned int value = data->intervals[i];
243 [ + + ]: 99335496 : if (value <= thresh) {
244 : 89069444 : int64_t diff = value - avg;
245 : 89069444 : stddev += diff * diff;
246 : : }
247 : : }
248 [ - + ][ # # ]: 12417457 : do_div(stddev, divisor);
249 : : /*
250 : : * The typical interval is obtained when standard deviation is small
251 : : * or standard deviation is small compared to the average interval.
252 : : *
253 : : * int_sqrt() formal parameter type is unsigned long. When the
254 : : * greatest difference to an outlier exceeds ~65 ms * sqrt(divisor)
255 : : * the resulting squared standard deviation exceeds the input domain
256 : : * of int_sqrt on platforms where unsigned long is 32 bits in size.
257 : : * In such case reject the candidate average.
258 : : *
259 : : * Use this result only if there is no timer to wake us up sooner.
260 : : */
261 [ + + ]: 12417457 : if (likely(stddev <= ULONG_MAX)) {
262 : 11989865 : stddev = int_sqrt(stddev);
263 [ + + ][ + ]: 11989864 : if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
264 [ + + ]: 15686520 : || stddev <= 20) {
265 [ + ]: 0 : if (data->expected_us > avg)
266 : 2213534 : data->predicted_us = avg;
267 : : return;
268 : : }
269 : : }
270 : :
271 : : /*
272 : : * If we have outliers to the upside in our distribution, discard
273 : : * those by setting the threshold to exclude these outliers, then
274 : : * calculate the average and standard deviation again. Once we get
275 : : * down to the bottom 3/4 of our samples, stop excluding samples.
276 : : *
277 : : * This can deal with workloads that have long pauses interspersed
278 : : * with sporadic activity with a bunch of short pauses.
279 : : */
280 [ + + ]: 10018535 : if ((divisor * 4) <= INTERVALS * 3)
281 : : return;
282 : :
283 : 6781779 : thresh = max - 1;
284 : 6781779 : goto again;
285 : : }
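
To see the outlier rejection converge, consider seven ~500 us intervals plus
one 80 ms outlier: the first pass fails the variance test, the outlier is
discarded via the threshold, and the second pass detects 500 us. A
simplified user-space model of the same loop (it drops the kernel-specific
do_div/int_sqrt and expected_us handling):

    #include <limits.h>
    #include <math.h>
    #include <stdio.h>

    #define INTERVALS 8

    int main(void)
    {
        unsigned int intervals[INTERVALS] = {500, 510, 490, 505,
                                             495, 500, 80000, 505};
        unsigned int thresh = UINT_MAX, max, divisor, i;
        unsigned long long avg, stddev;

        for (;;) {
            max = 0; avg = 0; divisor = 0;
            for (i = 0; i < INTERVALS; i++)
                if (intervals[i] <= thresh) {
                    avg += intervals[i];
                    divisor++;
                    if (intervals[i] > max)
                        max = intervals[i];
                }
            avg /= divisor;

            stddev = 0;
            for (i = 0; i < INTERVALS; i++)
                if (intervals[i] <= thresh) {
                    long long diff = (long long)intervals[i] - (long long)avg;
                    stddev += diff * diff;
                }
            stddev = (unsigned long long)sqrt((double)(stddev / divisor));

            if ((avg > stddev * 6 && divisor * 4 >= INTERVALS * 3) ||
                stddev <= 20) {
                printf("typical interval: %llu us\n", avg); /* 500 us */
                return 0;
            }
            if (divisor * 4 <= INTERVALS * 3)
                return 1;       /* too few samples left; no pattern */
            thresh = max - 1;   /* discard the largest outlier */
        }
    }
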
286 : :
287 : : /**
288 : : * menu_select - selects the next idle state to enter
289 : : * @drv: cpuidle driver containing state data
290 : : * @dev: the CPU
291 : : */
292 : 0 : static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
293 : : {
294 : 11275260 : struct menu_device *data = &__get_cpu_var(menu_devices);
295 : 5637630 : int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
296 : : int i;
297 : : int multiplier;
298 : : struct timespec t;
299 : :
300 [ + + ]: 5637405 : if (data->needs_update) {
301 : 5637342 : menu_update(drv, dev);
302 : 5637488 : data->needs_update = 0;
303 : : }
304 : :
305 : 5637551 : data->last_state_idx = 0;
306 : 5637551 : data->exit_us = 0;
307 : :
308 : : /* Special case when user has set very strict latency requirement */
309 [ + ]: 5637551 : if (unlikely(latency_req == 0))
310 : : return 0;
311 : :
312 : : /* determine the expected residency time, round up */
313 : 5637580 : t = ktime_to_timespec(tick_nohz_get_sleep_length());
314 : 5637432 : data->expected_us =
315 : 5637432 : t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC;
316 : :
317 : :
318 : 5637597 : data->bucket = which_bucket(data->expected_us);
319 : :
320 : : multiplier = performance_multiplier();
321 : :
322 : : /*
     323                 :            :          * if the correction factor is 0 (e.g. first-time init or cpu
     324                 :            :          * hotplug), we actually want to start out with a unity factor.
325 : : */
326 [ + + ]: 5637603 : if (data->correction_factor[data->bucket] == 0)
327 : 1 : data->correction_factor[data->bucket] = RESOLUTION * DECAY;
328 : :
329 : : /*
330 : : * Force the result of multiplication to be 64 bits even if both
331 : : * operands are 32 bits.
332 : : * Make sure to round up for half microseconds.
333 : : */
334 : 5637603 : data->predicted_us = div_round64((uint64_t)data->expected_us *
335 : 5637603 : data->correction_factor[data->bucket],
336 : : RESOLUTION * DECAY);
337 : :
338 : 5637290 : get_typical_interval(data);
339 : :
340 : : /*
341 : : * We want to default to C1 (hlt), not to busy polling
342 : : * unless the timer is happening really really soon.
343 : : */
344 [ + + ][ + + ]: 11275169 : if (data->expected_us > 5 &&
345 [ + ]: 5635389 : !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
346 : 5635389 : dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
347 : 11275169 : data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
348 : :
349 : : /*
350 : : * Find the idle state with the lowest power while satisfying
351 : : * our constraints.
352 : : */
353 [ + + ]: 16911946 : for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
354 : 11274407 : struct cpuidle_state *s = &drv->states[i];
355 : 11274407 : struct cpuidle_state_usage *su = &dev->states_usage[i];
356 : :
357 [ + + ][ + ]: 11274407 : if (s->disabled || su->disable)
358 : 0 : continue;
359 [ + + ]: 11275035 : if (s->target_residency > data->predicted_us)
360 : 4254106 : continue;
361 [ - + ]: 7020929 : if (s->exit_latency > latency_req)
362 : 0 : continue;
363 [ + + ]: 7020929 : if (s->exit_latency * multiplier > data->predicted_us)
364 : 190796 : continue;
365 : :
366 : 6830133 : data->last_state_idx = i;
367 : 6830133 : data->exit_us = s->exit_latency;
368 : : }
369 : :
370 : 5637539 : return data->last_state_idx;
371 : : }
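
Summarizing the loop above: a deeper state is accepted only while all three
filters pass, and the deepest survivor wins. A compact user-space sketch
with a hypothetical two-state table:

    #include <stdio.h>

    struct state { unsigned int target_residency, exit_latency; };

    int main(void)
    {
        struct state states[2] = { {2, 1}, {200, 100} }; /* C1/C6-like */
        unsigned int predicted_us = 1500, latency_req = 1000;
        int multiplier = 1, last = 0, i;

        for (i = 0; i < 2; i++) {
            if (states[i].target_residency > predicted_us)
                continue;   /* won't sleep long enough to break even */
            if (states[i].exit_latency > latency_req)
                continue;   /* violates the latency tolerance */
            if (states[i].exit_latency * multiplier > predicted_us)
                continue;   /* performance impact too high */
            last = i;
        }
        printf("selected state %d\n", last);    /* prints 1 */
        return 0;
    }
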
372 : :
373 : : /**
374 : : * menu_reflect - records that data structures need update
375 : : * @dev: the CPU
376 : : * @index: the index of actual entered state
377 : : *
378 : : * NOTE: it's important to be fast here because this operation will add to
379 : : * the overall exit latency.
380 : : */
381 : 0 : static void menu_reflect(struct cpuidle_device *dev, int index)
382 : : {
383 : 11275448 : struct menu_device *data = &__get_cpu_var(menu_devices);
384 : 5637724 : data->last_state_idx = index;
385 [ + ]: 5637724 : if (index >= 0)
386 : 5637727 : data->needs_update = 1;
387 : 0 : }
388 : :
389 : : /**
390 : : * menu_update - attempts to guess what happened after entry
391 : : * @drv: cpuidle driver containing state data
392 : : * @dev: the CPU
393 : : */
394 : 0 : static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
395 : : {
396 : 11275288 : struct menu_device *data = &__get_cpu_var(menu_devices);
397 : 5637644 : int last_idx = data->last_state_idx;
398 : 5637644 : unsigned int last_idle_us = cpuidle_get_last_residency(dev);
399 : 5637644 : struct cpuidle_state *target = &drv->states[last_idx];
400 : : unsigned int measured_us;
401 : : unsigned int new_factor;
402 : :
403 : : /*
404 : : * Ugh, this idle state doesn't support residency measurements, so we
405 : : * are basically lost in the dark. As a compromise, assume we slept
406 : : * for the whole expected time.
407 : : */
408 [ - + ]: 5637644 : if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID)))
409 : 0 : last_idle_us = data->expected_us;
410 : :
411 : :
412 : : measured_us = last_idle_us;
413 : :
414 : : /*
415 : : * We correct for the exit latency; we are assuming here that the
416 : : * exit latency happens after the event that we're interested in.
417 : : */
418 [ + + ]: 5637644 : if (measured_us > data->exit_us)
419 : 5451200 : measured_us -= data->exit_us;
420 : :
421 : :
422 : : /* Update our correction ratio */
423 : 5637644 : new_factor = data->correction_factor[data->bucket];
424 : 5637644 : new_factor -= new_factor / DECAY;
425 : :
426 [ + + ][ + + ]: 5637644 : if (data->expected_us > 0 && measured_us < MAX_INTERESTING)
427 : 5240352 : new_factor += RESOLUTION * measured_us / data->expected_us;
428 : : else
429 : : /*
430 : : * we were idle so long that we count it as a perfect
431 : : * prediction
432 : : */
433 : 397292 : new_factor += RESOLUTION;
434 : :
435 : : /*
436 : : * We don't want 0 as factor; we always want at least
437 : : * a tiny bit of estimated time. Fortunately, due to rounding,
438 : : * new_factor will stay nonzero regardless of measured_us values
439 : : * and the compiler can eliminate this test as long as DECAY > 1.
440 : : */
441 : : if (DECAY == 1 && unlikely(new_factor == 0))
442 : : new_factor = 1;
443 : :
444 : 5637644 : data->correction_factor[data->bucket] = new_factor;
445 : :
446 : : /* update the repeating-pattern data */
447 : 5637644 : data->intervals[data->interval_ptr++] = last_idle_us;
448 [ + + ]: 5637644 : if (data->interval_ptr >= INTERVALS)
449 : 704717 : data->interval_ptr = 0;
450 : 0 : }
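
The correction factor update above is an exponentially decaying average:
each new observation replaces 1/DECAY of the stored value, so at steady
state the factor settles near RESOLUTION * DECAY times the measured/expected
ratio. A user-space sketch of the convergence for a constant ratio of 0.5:

    #include <stdio.h>

    #define RESOLUTION 1024
    #define DECAY 8

    int main(void)
    {
        unsigned int factor = RESOLUTION * DECAY;   /* unity start */
        int i;

        for (i = 0; i < 64; i++) {
            factor -= factor / DECAY;
            factor += RESOLUTION / 2;   /* RESOLUTION * measured / expected */
        }
        /* prints a value within rounding of 4096 = RESOLUTION * DECAY / 2 */
        printf("%u\n", factor);
        return 0;
    }
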
451 : :
452 : : /**
453 : : * menu_enable_device - scans a CPU's states and does setup
454 : : * @drv: cpuidle driver
455 : : * @dev: the CPU
456 : : */
457 : 0 : static int menu_enable_device(struct cpuidle_driver *drv,
458 : : struct cpuidle_device *dev)
459 : : {
460 : 0 : struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
461 : :
462 : 0 : memset(data, 0, sizeof(struct menu_device));
463 : :
464 : 0 : return 0;
465 : : }
466 : :
467 : : static struct cpuidle_governor menu_governor = {
468 : : .name = "menu",
469 : : .rating = 20,
470 : : .enable = menu_enable_device,
471 : : .select = menu_select,
472 : : .reflect = menu_reflect,
473 : : .owner = THIS_MODULE,
474 : : };
475 : :
476 : : /**
477 : : * init_menu - initializes the governor
478 : : */
479 : 0 : static int __init init_menu(void)
480 : : {
481 : 0 : return cpuidle_register_governor(&menu_governor);
482 : : }
483 : :
484 : : postcore_initcall(init_menu);