From fb398de362e0dd83013990ceebe79d9b4e2438bd Mon Sep 17 00:00:00 2001
From: Scott B
Date: Thu, 10 Feb 2022 05:48:41 -0800
Subject: [PATCH] Revert "XANMOD: fair: Remove all energy efficiency functions"

This reverts commit 05bdfcdb4122ff03c18c3f3e9ba5c59684484ef8.
---
 kernel/sched/fair.c | 273 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 273 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b9f607aeee97..069e01772d92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6609,6 +6609,271 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 	return min_t(unsigned long, util, capacity_orig_of(cpu));
 }
 
+/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	/*
+	 * If @p migrates from @cpu to another, remove its contribution. Or,
+	 * if @p migrates from another CPU to @cpu, add its contribution. In
+	 * the other cases, @cpu is not impacted by the migration, so the
+	 * util_avg should already be correct.
+	 */
+	if (task_cpu(p) == cpu && dst_cpu != cpu)
+		lsub_positive(&util, task_util(p));
+	else if (task_cpu(p) != cpu && dst_cpu == cpu)
+		util += task_util(p);
+
+	if (sched_feat(UTIL_EST)) {
+		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * During wake-up, the task isn't enqueued yet and doesn't
+		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+		 * so just add it (if needed) to "simulate" what will be
+		 * cpu_util() after the task has been enqueued.
+		 */
+		if (dst_cpu == cpu)
+			util_est += _task_util_est(p);
+
+		util = max(util, util_est);
+	}
+
+	return min(util, capacity_orig_of(cpu));
+}
+
+/*
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+	struct cpumask *pd_mask = perf_domain_span(pd);
+	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+	unsigned long max_util = 0, sum_util = 0;
+	unsigned long _cpu_cap = cpu_cap;
+	int cpu;
+
+	_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+
+	/*
+	 * The capacity state of CPUs of the current rd can be driven by CPUs
+	 * of another rd if they belong to the same pd. So, account for the
+	 * utilization of these CPUs too by masking pd with cpu_online_mask
+	 * instead of the rd span.
+	 *
+	 * If an entire pd is outside of the current rd, it will not appear in
+	 * its pd list and will not be accounted by compute_energy().
+	 */
+	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+		unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
+		unsigned long cpu_util, util_running = util_freq;
+		struct task_struct *tsk = NULL;
+
+		/*
+		 * When @p is placed on @cpu:
+		 *
+		 *   util_running = max(cpu_util, cpu_util_est) +
+		 *		    max(task_util, _task_util_est)
+		 *
+		 * while cpu_util_next is: max(cpu_util + task_util,
+		 *			       cpu_util_est + _task_util_est)
+		 */
+		if (cpu == dst_cpu) {
+			tsk = p;
+			util_running =
+				cpu_util_next(cpu, p, -1) + task_util_est(p);
+		}
+
+		/*
+		 * Busy time computation: utilization clamping is not
+		 * required since the ratio (sum_util / cpu_capacity)
+		 * is already enough to scale the EM reported power
+		 * consumption at the (eventually clamped) cpu_capacity.
+		 */
+		cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
+					      ENERGY_UTIL, NULL);
+
+		sum_util += min(cpu_util, _cpu_cap);
+
+		/*
+		 * Performance domain frequency: utilization clamping
+		 * must be considered since it affects the selection
+		 * of the performance domain frequency.
+		 * NOTE: in case RT tasks are running, by default the
+		 * FREQUENCY_UTIL's utilization can be max OPP.
+		 */
+		cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
+					      FREQUENCY_UTIL, tsk);
+		max_util = max(max_util, min(cpu_util, _cpu_cap));
+	}
+
+	return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
+}
+
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases.
+ * The alternative would be to bias new tasks towards specific types of
+ * CPUs first, or to try to infer their util_avg from the parent task, but
+ * those heuristics could hurt other use-cases too. So, until someone finds
+ * a better way to solve this, let's keep things simple by re-using the
+ * existing slow path.
+ */
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+{
+	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	int cpu, best_energy_cpu = prev_cpu, target = -1;
+	unsigned long cpu_cap, util, base_energy = 0;
+	struct sched_domain *sd;
+	struct perf_domain *pd;
+
+	rcu_read_lock();
+	pd = rcu_dereference(rd->pd);
+	if (!pd || READ_ONCE(rd->overutilized))
+		goto unlock;
+
+	/*
+	 * Energy-aware wake-up happens on the lowest sched_domain starting
+	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+	 */
+	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+		sd = sd->parent;
+	if (!sd)
+		goto unlock;
+
+	target = prev_cpu;
+
+	sync_entity_load_avg(&p->se);
+	if (!task_util_est(p))
+		goto unlock;
+
+	for (; pd; pd = pd->next) {
+		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+		bool compute_prev_delta = false;
+		unsigned long base_energy_pd;
+		int max_spare_cap_cpu = -1;
+
+		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+				continue;
+
+			util = cpu_util_next(cpu, p, cpu);
+			cpu_cap = capacity_of(cpu);
+			spare_cap = cpu_cap;
+			lsub_positive(&spare_cap, util);
+
+			/*
+			 * Skip CPUs that cannot satisfy the capacity request.
+			 * IOW, placing the task there would make the CPU
+			 * overutilized. Take uclamp into account to see how
+			 * much capacity we can get out of the CPU; this is
+			 * aligned with sched_cpu_util().
+			 */
+			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
+			if (!fits_capacity(util, cpu_cap))
+				continue;
+
+			if (cpu == prev_cpu) {
+				/* Always use prev_cpu as a candidate. */
+				compute_prev_delta = true;
+			} else if (spare_cap > max_spare_cap) {
+				/*
+				 * Find the CPU with the maximum spare capacity
+				 * in the performance domain.
+				 */
+				max_spare_cap = spare_cap;
+				max_spare_cap_cpu = cpu;
+			}
+		}
+
+		if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+			continue;
+
+		/* Compute the 'base' energy of the pd, without @p */
+		base_energy_pd = compute_energy(p, -1, pd);
+		base_energy += base_energy_pd;
+
+		/* Evaluate the energy impact of using prev_cpu. */
+		if (compute_prev_delta) {
+			prev_delta = compute_energy(p, prev_cpu, pd);
+			if (prev_delta < base_energy_pd)
+				goto unlock;
+			prev_delta -= base_energy_pd;
+			best_delta = min(best_delta, prev_delta);
+		}
+
+		/* Evaluate the energy impact of using max_spare_cap_cpu. */
+		if (max_spare_cap_cpu >= 0) {
+			cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+			if (cur_delta < base_energy_pd)
+				goto unlock;
+			cur_delta -= base_energy_pd;
+			if (cur_delta < best_delta) {
+				best_delta = cur_delta;
+				best_energy_cpu = max_spare_cap_cpu;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+	 * least 6% of the energy used by prev_cpu.
+ */ + if ((prev_delta == ULONG_MAX) || + (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) + target = best_energy_cpu; + + return target; + +unlock: + rcu_read_unlock(); + + return target; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, @@ -6636,6 +6901,14 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) lockdep_assert_held(&p->pi_lock); if (wake_flags & WF_TTWU) { record_wakee(p); + + if (sched_energy_enabled()) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) + return new_cpu; + new_cpu = prev_cpu; + } + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } -- 2.35.1