diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 68a9bdde6604..1f9c2c734b20 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -63,6 +63,7 @@ extern unsigned int sysctl_sched_new_task_windows;
 extern unsigned int sysctl_sched_pred_alert_freq;
 extern unsigned int sysctl_sched_freq_aggregate;
 extern unsigned int sysctl_sched_enable_thread_grouping;
+extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
 
 #else /* CONFIG_SCHED_HMP */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce58e2245b4b..e893b0fcac6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2992,7 +2992,7 @@ static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
 static inline int wake_to_idle(struct task_struct *p)
 {
         return (current->flags & PF_WAKE_UP_IDLE) ||
-               (p->flags & PF_WAKE_UP_IDLE);
+               (p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
 }
 
 static inline bool
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 8da0147b4f89..447f3880f645 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -821,6 +821,9 @@ static DEFINE_RWLOCK(related_thread_group_lock);
 static __read_mostly unsigned int sched_freq_aggregate;
 __read_mostly unsigned int sysctl_sched_freq_aggregate;
 
+unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
+static unsigned int __read_mostly sched_freq_aggregate_threshold;
+
 /* Initial task load. Newly created tasks are assigned this load. */
 unsigned int __read_mostly sched_init_task_load_windows;
 unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
@@ -959,6 +962,9 @@ void set_hmp_defaults(void)
         sched_big_waker_task_load =
                 div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
                           (u64)sched_ravg_window, 100);
+
+        sched_freq_aggregate_threshold =
+                pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
 }
 
 u32 sched_get_init_task_load(struct task_struct *p)
@@ -1475,7 +1481,18 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
         if (write && (old_val == *data))
                 goto done;
 
-        if (data != &sysctl_sched_select_prev_cpu_us) {
+        /*
+         * Special handling for sched_freq_aggregate_threshold_pct
+         * which can be greater than 100. Use 1000 as an upper bound
+         * value which works for all practical use cases.
+         */
+        if (data == &sysctl_sched_freq_aggregate_threshold_pct) {
+                if (*data > 1000) {
+                        *data = old_val;
+                        ret = -EINVAL;
+                        goto done;
+                }
+        } else if (data != &sysctl_sched_select_prev_cpu_us) {
                 /*
                  * all tunables other than sched_select_prev_cpu_us are
                  * in percentage.
@@ -2947,6 +2964,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
         u64 max_prev_sum = 0;
         int max_busy_cpu = cpumask_first(query_cpus);
         struct related_thread_group *grp;
+        u64 total_group_load = 0, total_ngload = 0;
+        bool aggregate_load = false;
 
         if (unlikely(cpus == 0))
                 return;
@@ -3006,6 +3025,11 @@
                 }
         }
 
+        group_load_in_freq_domain(
+                        &cpu_rq(max_busy_cpu)->freq_domain_cpumask,
+                        &total_group_load, &total_ngload);
+        aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
+
         i = 0;
         for_each_cpu(cpu, query_cpus) {
                 group_load[i] = 0;
@@ -3015,11 +3039,11 @@
                         goto skip_early;
 
                 rq = cpu_rq(cpu);
-                if (!notifier_sent) {
-                        if (cpu == max_busy_cpu)
-                                group_load_in_freq_domain(
-                                        &rq->freq_domain_cpumask,
-                                        &group_load[i], &ngload[i]);
+                if (aggregate_load) {
+                        if (cpu == max_busy_cpu) {
+                                group_load[i] = total_group_load;
+                                ngload[i] = total_ngload;
+                        }
                 } else {
                         _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
                 }
@@ -3056,7 +3080,19 @@ skip_early:
                 goto exit_early;
         }
 
-        if (!notifier_sent) {
+        /*
+         * When the load aggregation is controlled by
+         * sched_freq_aggregate_threshold, allow reporting loads
+         * greater than 100 @ Fcur to ramp up the frequency
+         * faster.
+         */
+        if (notifier_sent || (aggregate_load &&
+                              sched_freq_aggregate_threshold)) {
+                load[i] = scale_load_to_freq(load[i], max_freq[i],
+                                cpu_max_possible_freq(cpu));
+                nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+                                cpu_max_possible_freq(cpu));
+        } else {
                 load[i] = scale_load_to_freq(load[i], max_freq[i],
                                              cur_freq[i]);
                 nload[i] = scale_load_to_freq(nload[i], max_freq[i],
@@ -3070,11 +3106,6 @@ skip_early:
                                 cpu_max_possible_freq(cpu));
                 nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
                                 cpu_max_possible_freq(cpu));
-        } else {
-                load[i] = scale_load_to_freq(load[i], max_freq[i],
-                                cpu_max_possible_freq(cpu));
-                nload[i] = scale_load_to_freq(nload[i], max_freq[i],
-                                cpu_max_possible_freq(cpu));
         }
         pload[i] = scale_load_to_freq(pload[i], max_freq[i],
                         rq->cluster->max_possible_freq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ac34212f6881..07fef40d1274 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -434,6 +434,13 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = sched_window_update_handler,
         },
+        {
+                .procname       = "sched_freq_aggregate_threshold",
+                .data           = &sysctl_sched_freq_aggregate_threshold_pct,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = sched_hmp_proc_update_handler,
+        },
         {
                 .procname       = "sched_boost",
                 .data           = &sysctl_sched_boost,
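Note: pct_to_real() is not shown in this diff. Below is a minimal userspace
sketch of the threshold conversion and the aggregation decision, assuming
pct_to_real() scales the percentage against sched_ravg_window the same way
the div64_u64() conversion for sched_big_waker_task_load does in the
set_hmp_defaults() hunk above; the window length and load values are
made-up example numbers, not kernel defaults.

/* Sketch only: mirrors the assumed pct_to_real() conversion and the
 * aggregate_load decision from sched_get_cpus_busy(); not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static const uint64_t sched_ravg_window = 20000000ULL; /* example: 20 ms in ns */

/* Assumed shape: percent of one full window, expressed in window (time) units. */
static uint64_t pct_to_real(unsigned int pct)
{
        return (uint64_t)pct * sched_ravg_window / 100;
}

int main(void)
{
        /* The tunable may legitimately exceed 100; the handler caps it at 1000. */
        unsigned int sysctl_sched_freq_aggregate_threshold_pct = 125;
        uint64_t sched_freq_aggregate_threshold =
                pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);

        /* Hypothetical summed group load for the frequency domain. */
        uint64_t total_group_load = 26000000ULL;
        bool aggregate_load = total_group_load > sched_freq_aggregate_threshold;

        printf("threshold=%llu aggregate_load=%d\n",
               (unsigned long long)sched_freq_aggregate_threshold,
               (int)aggregate_load);
        return 0;
}

With the tunable at its zero-initialized default, any nonzero group load makes
aggregate_load true, but the reporting branch in the patch additionally checks
sched_freq_aggregate_threshold itself, so the ">100 @ Fcur" reporting path only
engages once the tunable is explicitly set.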
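For reference, here is a sketch of the load re-normalization the two branches
perform, assuming scale_load_to_freq(load, src_freq, dst_freq) re-expresses a
demand sampled against src_freq relative to dst_freq, i.e. load * src_freq /
dst_freq (the helper itself is not part of this diff, so this shape is an
assumption); all frequencies and loads are hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Assumed shape of the helper: demand sampled against src_freq,
 * re-expressed relative to dst_freq. */
static uint64_t scale_load_to_freq(uint64_t load, uint64_t src_freq,
                                   uint64_t dst_freq)
{
        return load * src_freq / dst_freq;
}

int main(void)
{
        uint64_t load = 130;             /* aggregated demand, % at max_freq */
        uint64_t max_freq = 1800000;     /* kHz, hypothetical */
        uint64_t cur_freq = 1200000;     /* kHz, hypothetical */
        uint64_t max_possible = 2200000; /* kHz, hypothetical */

        /* else-branch: load relative to the current frequency */
        printf("@Fcur:  %llu\n", (unsigned long long)
               scale_load_to_freq(load, max_freq, cur_freq));
        /* notifier/aggregate branch: relative to the max possible frequency,
         * which per the patch comment lets aggregated loads be reported above
         * 100 @ Fcur so the governor ramps up faster */
        printf("@Fmaxp: %llu\n", (unsigned long long)
               scale_load_to_freq(load, max_freq, max_possible));
        return 0;
}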
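Since the new entry is added to kern_table, the tunable surfaces as
/proc/sys/kernel/sched_freq_aggregate_threshold. A minimal sketch of setting
it from userspace (root required; the value 90 is only an example, and the
handler above rejects anything over 1000 with -EINVAL):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/sched_freq_aggregate_threshold", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* Percent of one window; values above 100 are allowed, up to 1000. */
        fprintf(f, "%u\n", 90u);
        fclose(f);
        return 0;
}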