diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 091d49ea80cf..766c01d321b5 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -726,6 +726,16 @@ d. /proc/sys/kernel/sched_select_prev_cpu_us
 Default value of sched_select_prev_cpu_us is 2000 (2ms). This can be
 turned off by setting it to 0.
 
+e. /proc/sys/kernel/sched_short_burst_ns
+ This threshold controls whether a task is considered "short-burst" or
+ not. "short-burst" tasks are eligible for packing to avoid the overhead
+ associated with waking up an idle CPU. "non-idle" CPUs that are not
+ loaded with IRQs and can accommodate the waking task without exceeding
+ spill limits are considered. Ties are broken by load, followed by the
+ previous CPU. This tunable does not affect cluster selection; it only
+ affects CPU selection within a given cluster. Packing is skipped for
+ tasks that are eligible for "wake-up-idle" and "boost".
+
 **** 5.2.4 Wakeup Logic for Task "p"
 
 Wakeup task placement logic is as follows:
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5d0899df64ff..e4aff5e6e17f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -68,6 +68,7 @@ extern unsigned int sysctl_sched_freq_aggregate;
 extern unsigned int sysctl_sched_enable_thread_grouping;
 extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
 extern unsigned int sysctl_sched_prefer_sync_wakee_to_waker;
+extern unsigned int sysctl_sched_short_burst;
 
 #else /* CONFIG_SCHED_HMP */
 
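For illustration only (not part of the patch): a minimal user-space sketch of
exercising the new tunable once a CONFIG_SCHED_HMP kernel with this change is
running. The 5 ms value is an arbitrary example threshold, not a recommended
setting, and writing the file requires root.

        /* Illustrative only: write an example threshold (5 ms, in nanoseconds)
         * to the new sysctl and read it back. Error handling kept minimal.
         */
        #include <stdio.h>
        #include <stdlib.h>

        int main(void)
        {
                const char *path = "/proc/sys/kernel/sched_short_burst_ns";
                unsigned int val = 0;
                FILE *f;

                f = fopen(path, "w");
                if (!f || fprintf(f, "%u\n", 5000000U) < 0) {   /* 5 ms */
                        perror(path);
                        return EXIT_FAILURE;
                }
                fclose(f);

                f = fopen(path, "r");
                if (!f || fscanf(f, "%u", &val) != 1) {
                        perror(path);
                        return EXIT_FAILURE;
                }
                fclose(f);

                printf("sched_short_burst_ns = %u\n", val);
                return 0;
        }

Setting the value to 0 disables packing entirely, since no avg_burst can be
below a zero threshold.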
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3db77aff2433..95b961dc7b14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2591,6 +2591,7 @@ static u32 __compute_runnable_contrib(u64 n)
 #define SBC_FLAG_CSTATE_LOAD		0x100
 #define SBC_FLAG_BEST_SIBLING		0x200
 #define SBC_FLAG_WAKER_CPU		0x400
+#define SBC_FLAG_PACK_TASK		0x800
 
 /* Cluster selection flag */
 #define SBC_FLAG_COLOC_CLUSTER		0x10000
@@ -2607,6 +2608,7 @@ struct cpu_select_env {
 	u8 sync:1;
 	u8 ignore_prev_cpu:1;
 	enum sched_boost_policy boost_policy;
+	u8 pack_task:1;
 	int prev_cpu;
 	DECLARE_BITMAP(candidate_list, NR_CPUS);
 	DECLARE_BITMAP(backup_list, NR_CPUS);
@@ -2958,8 +2960,17 @@ static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
 {
 	int cpu_cost;
 
-	cpu_cost = power_cost(cpu, task_load(env->p) +
+	/*
+	 * We try to find the least loaded *busy* CPU irrespective
+	 * of the power cost.
+	 */
+	if (env->pack_task)
+		cpu_cost = cpu_min_power_cost(cpu);
+
+	else
+		cpu_cost = power_cost(cpu, task_load(env->p) +
 				cpu_cravg_sync(cpu, env->sync));
+
 	if (cpu_cost <= stats->min_cost)
 		__update_cluster_stats(cpu, stats, env, cpu_cost);
 }
@@ -3034,6 +3045,15 @@ static inline int wake_to_idle(struct task_struct *p)
 	       (p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
 }
 
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+	if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+	    env->reason)
+		return true;
+
+	return false;
+}
+
 static inline bool
 bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 {
@@ -3041,9 +3061,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 	struct task_struct *task = env->p;
 	struct sched_cluster *cluster;
 
-	if (env->boost_policy != SCHED_BOOST_NONE || env->reason ||
-	    !task->ravg.mark_start ||
-	    env->need_idle || !sched_short_sleep_task_threshold)
+	if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
 		return false;
 
 	prev_cpu = env->prev_cpu;
@@ -3092,8 +3110,7 @@
 
 static inline bool wake_to_waker_cluster(struct cpu_select_env *env)
 {
-	return env->boost_policy == SCHED_BOOST_NONE &&
-	       !env->need_idle && !env->reason && env->sync &&
+	return env->sync &&
 	       task_load(current) > sched_big_waker_task_load &&
 	       task_load(env->p) < sched_small_wakee_task_load;
 }
@@ -3118,7 +3135,6 @@ cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
 	return !cpumask_empty(&tmp_mask);
 }
 
-
 /* return cheapest cpu that can fit this task */
 static int select_best_cpu(struct task_struct *p, int target, int reason,
 			   int sync)
@@ -3128,6 +3144,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 	struct related_thread_group *grp;
 	unsigned int sbc_flag = 0;
 	int cpu = raw_smp_processor_id();
+	bool special;
 
 	struct cpu_select_env env = {
 		.p = p,
@@ -3140,6 +3157,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 		.rtg = NULL,
 		.sbc_best_flag = 0,
 		.sbc_best_cluster_flag = 0,
+		.pack_task = false,
 	};
 
 	env.boost_policy = task_sched_boost(p) ?
@@ -3149,6 +3167,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 	bitmap_zero(env.backup_list, NR_CPUS);
 
 	init_cluster_cpu_stats(&stats);
+	special = env_has_special_flags(&env);
 
 	rcu_read_lock();
 
@@ -3160,7 +3179,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 			clear_bit(pref_cluster->id, env.candidate_list);
 		else
 			env.rtg = grp;
-	} else {
+	} else if (!special) {
 		cluster = cpu_rq(cpu)->cluster;
 		if (wake_to_waker_cluster(&env)) {
 			if (bias_to_waker_cpu(p, cpu)) {
@@ -3181,6 +3200,10 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 		}
 	}
 
+	if (!special && is_short_burst_task(p)) {
+		env.pack_task = true;
+		sbc_flag = SBC_FLAG_PACK_TASK;
+	}
 retry:
 	cluster = select_least_power_cluster(&env);
 
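For illustration only (not part of the patch): a compact, stand-alone model of
the gating logic this change adds to select_best_cpu(). The struct, the helper
names and the 50 us threshold below are simplified stand-ins, not the kernel's
types; the point is only that packing applies when no "special" condition
(need_idle, boost, a non-wakeup reason) is set and the wakee's average burst is
below the threshold, in which case candidate CPUs are compared by the cluster's
minimum power cost instead of the task-size-dependent power cost.

        /* Stand-alone model (not kernel code) of the packing eligibility check. */
        #include <stdbool.h>
        #include <stdio.h>

        struct env_model {
                bool need_idle;                 /* wake-to-idle requested       */
                bool boosted;                   /* any sched_boost policy active */
                bool reason;                    /* non-wakeup placement reason  */
                unsigned long long avg_burst;   /* ns, mirrors p->ravg.avg_burst */
        };

        static unsigned long long short_burst_ns = 50000;       /* example threshold */

        static bool has_special_flags(const struct env_model *env)
        {
                return env->need_idle || env->boosted || env->reason;
        }

        static bool pack_eligible(const struct env_model *env)
        {
                /* Mirrors: !special && is_short_burst_task(p) */
                return !has_special_flags(env) && env->avg_burst < short_burst_ns;
        }

        int main(void)
        {
                struct env_model wakee = { .avg_burst = 20000 };

                printf("pack short-burst wakee: %s\n",
                       pack_eligible(&wakee) ? "yes" : "no");
                wakee.boosted = true;
                printf("pack when boosted:      %s\n",
                       pack_eligible(&wakee) ? "yes" : "no");
                return 0;
        }

This mirrors why the boost/need_idle/reason checks could be hoisted out of
bias_to_prev_cpu() and wake_to_waker_cluster(): once "special" is computed up
front, those paths are only reached when it is false.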
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index b2f3013bfe31..95e618ee1124 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -961,6 +961,13 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
 
 unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
 
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds, it is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+
 static void
 _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
 {
@@ -1553,7 +1560,13 @@ void init_new_task_load(struct task_struct *p, bool idle_task)
 	memset(&p->ravg, 0, sizeof(struct ravg));
 	p->cpu_cycles = 0;
 	p->ravg.curr_burst = 0;
-	p->ravg.avg_burst = 0;
+	/*
+	 * Initialize the avg_burst to twice the threshold, so that
+	 * a task would not be classified as short burst right away
+	 * after fork. It takes at least 6 sleep-wakeup cycles for
+	 * the avg_burst to go below the threshold.
+	 */
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
 
 	p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
 	p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
@@ -2987,6 +3000,8 @@ void reset_task_stats(struct task_struct *p)
 	p->ravg.curr_window_cpu = curr_window_ptr;
 	p->ravg.prev_window_cpu = prev_window_ptr;
 
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
 	/* Retain EXITING_TASK marker */
 	p->ravg.sum_history[0] = sum;
 }
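For illustration only (not part of the patch): the "at least 6 sleep-wakeup
cycles" remark can be sanity-checked numerically if one assumes avg_burst is
maintained with the scheduler's usual 1/8-weight running average,
avg += (sample - avg) / 8. That update rule is an assumption here, it is not
shown in this diff.

        /* Assumes an update_avg()-style EWMA: avg += (sample - avg) / 8.
         * Starting from 2 * threshold and feeding near-zero bursts, the average
         * first drops below the threshold on the 6th update: 2 * (7/8)^6 ~= 0.90.
         */
        #include <stdio.h>

        int main(void)
        {
                const double threshold = 1.0;   /* normalized sched_short_burst_ns */
                double avg = 2.0 * threshold;   /* init value used by this patch  */
                int cycle = 0;

                while (avg >= threshold) {
                        avg += (0.0 - avg) / 8.0;       /* burst of ~0 ns this cycle */
                        cycle++;
                }
                printf("below threshold after %d cycles (avg = %.3f)\n", cycle, avg);
                return 0;
        }

Under that assumption the program prints 6, matching the comment: a freshly
forked task has to behave as short-burst for several wakeups before it starts
being packed, which filters out one-off short sleeps.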
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 52edd6b158ed..624bededfb85 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1679,6 +1679,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 	int i;
 	int restrict_cluster;
 	int boost_on_big;
+	int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
 
 	boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
 		       sched_boost_policy() == SCHED_BOOST_ON_BIG;
@@ -1695,6 +1696,8 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return best_cpu; /* No targets found */
 
+	pack_task = is_short_burst_task(task);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
@@ -1720,6 +1723,20 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 			if (!restrict_cluster)
 				cpu_load = scale_load_to_cpu(cpu_load, i);
 
+			if (pack_task) {
+				wakeup_latency = cpu_rq(i)->wakeup_latency;
+
+				if (wakeup_latency > least_wakeup_latency)
+					continue;
+
+				if (wakeup_latency < least_wakeup_latency) {
+					least_wakeup_latency = wakeup_latency;
+					min_load = cpu_load;
+					best_cpu = i;
+					continue;
+				}
+			}
+
 			if (cpu_load < min_load ||
 				(cpu_load == min_load &&
 				(i == prev_cpu || (best_cpu != prev_cpu &&
@@ -1728,6 +1745,7 @@
 				best_cpu = i;
 			}
 		}
+
 		if (restrict_cluster && best_cpu != -1)
 			break;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6b9f11d9a47c..b9a109e5ef94 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1188,6 +1188,11 @@ static inline int cpu_max_power_cost(int cpu)
 	return cpu_rq(cpu)->cluster->max_power_cost;
 }
 
+static inline int cpu_min_power_cost(int cpu)
+{
+	return cpu_rq(cpu)->cluster->min_power_cost;
+}
+
 static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
 {
 	return div64_u64(cycles, period);
 }
@@ -1385,6 +1390,11 @@ static inline u64 cpu_cravg_sync(int cpu, int sync)
 	return load;
 }
 
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+	return p->ravg.avg_burst < sysctl_sched_short_burst;
+}
+
 extern void check_for_migration(struct rq *rq, struct task_struct *p);
 extern void pre_big_task_count_change(const struct cpumask *cpus);
 extern void post_big_task_count_change(const struct cpumask *cpus);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b7cbd7940f7b..d4682d0cdeb1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -507,6 +507,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &three,
 	},
+	{
+		.procname	= "sched_short_burst_ns",
+		.data		= &sysctl_sched_short_burst,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif	/* CONFIG_SCHED_HMP */
 #ifdef CONFIG_SCHED_DEBUG
 	{
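For illustration only (not part of the patch): a stand-alone sketch of the
per-CPU scan that find_lowest_rq_hmp() gains for packed RT tasks. Among the
candidate CPUs it prefers the one with the smallest wakeup_latency (roughly,
the cheapest idle-state exit), and falls back to load only when latencies are
equal; the prev_cpu and cluster-spill tie-breaks of the real code are omitted,
and the per-CPU arrays below are made-up sample data.

        /* Stand-alone sketch (sample data, not kernel structures) of the RT
         * packing scan: lowest wakeup latency first, load breaks latency ties.
         */
        #include <limits.h>
        #include <stdio.h>

        int main(void)
        {
                /* hypothetical per-CPU snapshot */
                const int wakeup_latency[] = { 40, 0, 0, 40 };  /* cost to leave idle */
                const unsigned int load[]  = { 10, 60, 30, 5 };
                const int ncpu = 4;

                int best_cpu = -1, least_latency = INT_MAX;
                unsigned int min_load = UINT_MAX;

                for (int i = 0; i < ncpu; i++) {
                        if (wakeup_latency[i] > least_latency)
                                continue;
                        if (wakeup_latency[i] < least_latency || load[i] < min_load) {
                                least_latency = wakeup_latency[i];
                                min_load = load[i];
                                best_cpu = i;
                        }
                }
                printf("best_cpu = %d\n", best_cpu);     /* CPU 2: lowest latency, lighter load */
                return 0;
        }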