From 069ce2ef1a6dd84cbd4d897b333e30f825e021f0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Oct 2019 23:32:17 +0200 Subject: [PATCH 1/4] cpuidle: teo: Ignore disabled idle states that are too deep Prevent disabled CPU idle state with target residencies beyond the anticipated idle duration from being taken into account by the TEO governor. Fixes: b26bf6ab716f ("cpuidle: New timer events oriented governor for tickless systems") Signed-off-by: Rafael J. Wysocki Cc: 5.1+ # 5.1+ --- drivers/cpuidle/governors/teo.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index b5a0e498f798..8806db95a913 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -257,6 +257,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct cpuidle_state_usage *su = &dev->states_usage[i]; if (s->disabled || su->disable) { + /* + * Ignore disabled states with target residencies beyond + * the anticipated idle duration. + */ + if (s->target_residency > duration_us) + continue; + /* * If the "early hits" metric of a disabled state is * greater than the current maximum, it should be taken From 4f690bb8ce4cc5d3fabe3a8e9c2401de1554cdc1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Oct 2019 23:32:59 +0200 Subject: [PATCH 2/4] cpuidle: teo: Rename local variable in teo_select() Rename a local variable in teo_select() in preparation for subsequent code modifications, no intentional impact. Signed-off-by: Rafael J. Wysocki Cc: 5.1+ # 5.1+ --- drivers/cpuidle/governors/teo.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 8806db95a913..de3139b17a50 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -233,7 +233,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); int latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int duration_us, count; + unsigned int duration_us, early_hits; int max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; @@ -247,7 +247,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); duration_us = ktime_to_us(cpu_data->sleep_length_ns); - count = 0; + early_hits = 0; max_early_idx = -1; constraint_idx = drv->state_count; idx = -1; @@ -270,12 +270,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * into account, because it would be a mistake to select * a deeper state with lower "early hits" metric. The * index cannot be changed to point to it, however, so - * just increase the max count alone and let the index - * still point to a shallower idle state. + * just increase the "early hits" count alone and let + * the index still point to a shallower idle state. */ if (max_early_idx >= 0 && - count < cpu_data->states[i].early_hits) - count = cpu_data->states[i].early_hits; + early_hits < cpu_data->states[i].early_hits) + early_hits = cpu_data->states[i].early_hits; continue; } @@ -291,10 +291,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = i; - if (count < cpu_data->states[i].early_hits && + if (early_hits < cpu_data->states[i].early_hits && !(tick_nohz_tick_stopped() && drv->states[i].target_residency < TICK_USEC)) { - count = cpu_data->states[i].early_hits; + early_hits = cpu_data->states[i].early_hits; max_early_idx = i; } } @@ -323,10 +323,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx < 0) { idx = 0; /* No states enabled. Must use 0. */ } else if (idx > 0) { + unsigned int count = 0; u64 sum = 0; - count = 0; - /* * Count and sum the most recent idle duration values less than * the current expected idle duration value. From e43dcf20215f0287ea113102617ca04daa76b70e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Oct 2019 23:36:15 +0200 Subject: [PATCH 3/4] cpuidle: teo: Consider hits and misses metrics of disabled states The TEO governor uses idle duration "bins" defined in accordance with the CPU idle states table provided by the driver, so that each "bin" covers the idle duration range between the target residency of the idle state corresponding to it and the target residency of the closest deeper idle state. The governor collects statistics for each bin regardless of whether or not the idle state corresponding to it is currently enabled. In particular, the "hits" and "misses" metrics measure the likelihood of a situation in which both the time till the next timer (sleep length) and the idle duration measured after wakeup fall into the given bin. Namely, if the "hits" value is greater than the "misses" one, that situation is more likely than the one in which the sleep length falls into the given bin, but the idle duration measured after wakeup falls into a bin corresponding to one of the shallower idle states. If the idle state corresponding to the given bin is disabled, it cannot be selected and if it turns out to be the one that should be selected, a shallower idle state needs to be used instead of it. Nevertheless, the metrics collected for the bin corresponding to it are still valid and need to be taken into account as though that state had not been disabled. For this reason, make teo_select() always use the "hits" and "misses" values of the idle duration range that the sleep length falls into even if the specific idle state corresponding to it is disabled and if the "hits" values is greater than the "misses" one, select the closest enabled shallower idle state in that case. Fixes: b26bf6ab716f ("cpuidle: New timer events oriented governor for tickless systems") Signed-off-by: Rafael J. Wysocki Cc: 5.1+ # 5.1+ --- drivers/cpuidle/governors/teo.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index de3139b17a50..5a0f60ea4ab9 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -233,7 +233,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); int latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int duration_us, early_hits; + unsigned int duration_us, hits, misses, early_hits; int max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; @@ -247,6 +247,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); duration_us = ktime_to_us(cpu_data->sleep_length_ns); + hits = 0; + misses = 0; early_hits = 0; max_early_idx = -1; constraint_idx = drv->state_count; @@ -264,6 +266,17 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (s->target_residency > duration_us) continue; + /* + * This state is disabled, so the range of idle duration + * values corresponding to it is covered by the current + * candidate state, but still the "hits" and "misses" + * metrics of the disabled state need to be used to + * decide whether or not the state covering the range in + * question is good enough. + */ + hits = cpu_data->states[i].hits; + misses = cpu_data->states[i].misses; + /* * If the "early hits" metric of a disabled state is * greater than the current maximum, it should be taken @@ -280,8 +293,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; } - if (idx < 0) + if (idx < 0) { idx = i; /* first enabled state */ + hits = cpu_data->states[i].hits; + misses = cpu_data->states[i].misses; + } if (s->target_residency > duration_us) break; @@ -290,6 +306,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, constraint_idx = i; idx = i; + hits = cpu_data->states[i].hits; + misses = cpu_data->states[i].misses; if (early_hits < cpu_data->states[i].early_hits && !(tick_nohz_tick_stopped() && @@ -307,8 +325,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * "early hits" metric, but if that cannot be determined, just use the * state selected so far. */ - if (cpu_data->states[idx].hits <= cpu_data->states[idx].misses && - max_early_idx >= 0) { + if (hits <= misses && max_early_idx >= 0) { idx = max_early_idx; duration_us = drv->states[idx].target_residency; } From 159e48560f51d9c2aa02d762a18cd24f7868ab27 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Oct 2019 23:37:39 +0200 Subject: [PATCH 4/4] cpuidle: teo: Fix "early hits" handling for disabled idle states The TEO governor uses idle duration "bins" defined in accordance with the CPU idle states table provided by the driver, so that each "bin" covers the idle duration range between the target residency of the idle state corresponding to it and the target residency of the closest deeper idle state. The governor collects statistics for each bin regardless of whether or not the idle state corresponding to it is currently enabled. In particular, the "early hits" metric measures the likelihood of a situation in which the idle duration measured after wakeup falls into to given bin, but the time till the next timer (sleep length) falls into a bin corresponding to one of the deeper idle states. It is used when the "hits" and "misses" metrics indicate that the state "matching" the sleep length should not be selected, so that the state with the maximum "early hits" value is selected instead of it. If the idle state corresponding to the given bin is disabled, it cannot be selected and if it turns out to be the one that should be selected, a shallower idle state needs to be used instead of it. Nevertheless, the metrics collected for the bin corresponding to it are still valid and need to be taken into account as though that state had not been disabled. As far as the "early hits" metric is concerned, teo_select() tries to take disabled states into account, but the state index corresponding to the maximum "early hits" value computed by it may be incorrect. Namely, it always uses the index of the previous maximum "early hits" state then, but there may be enabled idle states closer to the disabled one in question. In particular, if the current candidate state (whose index is the idx value) is closer to the disabled one and the "early hits" value of the disabled state is greater than the current maximum, the index of the current candidate state (idx) should replace the "maximum early hits state" index. Modify the code to handle that case correctly. Fixes: b26bf6ab716f ("cpuidle: New timer events oriented governor for tickless systems") Reported-by: Doug Smythies Signed-off-by: Rafael J. Wysocki Cc: 5.1+ # 5.1+ --- drivers/cpuidle/governors/teo.c | 35 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 5a0f60ea4ab9..b9b9156618e6 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -277,18 +277,35 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, hits = cpu_data->states[i].hits; misses = cpu_data->states[i].misses; + if (early_hits >= cpu_data->states[i].early_hits || + idx < 0) + continue; + /* - * If the "early hits" metric of a disabled state is - * greater than the current maximum, it should be taken - * into account, because it would be a mistake to select - * a deeper state with lower "early hits" metric. The - * index cannot be changed to point to it, however, so - * just increase the "early hits" count alone and let - * the index still point to a shallower idle state. + * If the current candidate state has been the one with + * the maximum "early hits" metric so far, the "early + * hits" metric of the disabled state replaces the + * current "early hits" count to avoid selecting a + * deeper state with lower "early hits" metric. */ - if (max_early_idx >= 0 && - early_hits < cpu_data->states[i].early_hits) + if (max_early_idx == idx) { early_hits = cpu_data->states[i].early_hits; + continue; + } + + /* + * The current candidate state is closer to the disabled + * one than the current maximum "early hits" state, so + * replace the latter with it, but in case the maximum + * "early hits" state index has not been set so far, + * check if the current candidate state is not too + * shallow for that role. + */ + if (!(tick_nohz_tick_stopped() && + drv->states[idx].target_residency < TICK_USEC)) { + early_hits = cpu_data->states[i].early_hits; + max_early_idx = idx; + } continue; }