diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 6848454c5447..5d0899df64ff 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -53,6 +53,8 @@ extern unsigned int sysctl_sched_spill_nr_run; extern unsigned int sysctl_sched_spill_load_pct; extern unsigned int sysctl_sched_upmigrate_pct; extern unsigned int sysctl_sched_downmigrate_pct; +extern unsigned int sysctl_sched_group_upmigrate_pct; +extern unsigned int sysctl_sched_group_downmigrate_pct; extern unsigned int sysctl_early_detection_duration; extern unsigned int sysctl_sched_boost; extern unsigned int sysctl_sched_small_wakee_task_load_pct; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7778ff3947de..72bbed9ad5db 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -133,6 +133,7 @@ TRACE_EVENT(sched_task_load, __field( u32, flags ) __field( int, best_cpu ) __field( u64, latency ) + __field( int, grp_id ) ), TP_fast_assign( @@ -148,12 +149,13 @@ TRACE_EVENT(sched_task_load, __entry->latency = p->state == TASK_WAKING ? sched_ktime_clock() - p->ravg.mark_start : 0; + __entry->grp_id = p->grp ? p->grp->id : 0; ), - TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x best_cpu=%d latency=%llu", + TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu", __entry->pid, __entry->comm, __entry->demand, __entry->boost, __entry->reason, __entry->sync, - __entry->need_idle, __entry->flags, + __entry->need_idle, __entry->flags, __entry->grp_id, __entry->best_cpu, __entry->latency) ); @@ -164,9 +166,12 @@ TRACE_EVENT(sched_set_preferred_cluster, TP_ARGS(grp, total_demand), TP_STRUCT__entry( - __field( int, id ) - __field( u64, demand ) - __field( int, cluster_first_cpu ) + __field( int, id ) + __field( u64, demand ) + __field( int, cluster_first_cpu ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, task_demand ) ), TP_fast_assign( @@ -245,19 +250,19 @@ DEFINE_EVENT(sched_cpu_load, sched_cpu_load_cgroup, TRACE_EVENT(sched_set_boost, - TP_PROTO(int ref_count), + TP_PROTO(int type), - TP_ARGS(ref_count), + TP_ARGS(type), TP_STRUCT__entry( - __field(unsigned int, ref_count ) + __field(int, type ) ), TP_fast_assign( - __entry->ref_count = ref_count; + __entry->type = type; ), - TP_printk("ref_count=%d", __entry->ref_count) + TP_printk("type %d", __entry->type) ); #if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7d0d34c53e08..7c0382a3eace 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,7 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o -obj-$(CONFIG_SCHED_HMP) += hmp.o +obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c new file mode 100644 index 000000000000..fcfda385b74a --- /dev/null +++ b/kernel/sched/boost.c @@ -0,0 +1,226 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "sched.h" +#include +#include +#include + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ + +unsigned int sysctl_sched_boost; +static enum sched_boost_policy boost_policy; +static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE; +static DEFINE_MUTEX(boost_mutex); +static unsigned int freq_aggr_threshold_backup; + +static inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +static void boost_kick_cpus(void) +{ + int i; + struct cpumask kick_mask; + + if (boost_policy != SCHED_BOOST_ON_BIG) + return; + + cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask); + + for_each_cpu(i, &kick_mask) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +/* + * Scheduler boost type and boost policy might at first seem unrelated; + * however, there exists a connection between them that will allow us + * to use them interchangeably during placement decisions. We'll explain + * the connection here in one possible way so that the implications are + * clear when looking at placement policies. + * + * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED. + * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can + * neither be none nor RESTRAINED. + */ +static void set_boost_policy(int type) +{ + if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) { + boost_policy = SCHED_BOOST_NONE; + return; + } + + if (boost_policy_dt) { + boost_policy = boost_policy_dt; + return; + } + + if (min_possible_efficiency != max_possible_efficiency) { + boost_policy = SCHED_BOOST_ON_BIG; + return; + } + + boost_policy = SCHED_BOOST_ON_ALL; +} + +enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +static bool verify_boost_params(int old_val, int new_val) +{ + /* + * Boost can only be turned on or off. There is no possibility of + * switching from one boost type to another or of setting the same + * kind of boost several times.
+ */ + return !(!!old_val == !!new_val); +} + +static void _sched_set_boost(int old_val, int type) +{ + switch (type) { + case NO_BOOST: + if (old_val == FULL_THROTTLE_BOOST) + core_ctl_set_boost(false); + else if (old_val == CONSERVATIVE_BOOST) + restore_cgroup_boost_settings(); + else + update_freq_aggregate_threshold( + freq_aggr_threshold_backup); + break; + + case FULL_THROTTLE_BOOST: + core_ctl_set_boost(true); + boost_kick_cpus(); + break; + + case CONSERVATIVE_BOOST: + update_cgroup_boost_settings(); + boost_kick_cpus(); + break; + + case RESTRAINED_BOOST: + freq_aggr_threshold_backup = + update_freq_aggregate_threshold(1); + break; + + default: + WARN_ON(1); + return; + } + + set_boost_policy(type); + sysctl_sched_boost = type; + trace_sched_set_boost(type); +} + +void sched_boost_parse_dt(void) +{ + struct device_node *sn; + const char *boost_policy; + + if (!sched_enable_hmp) + return; + + sn = of_find_node_by_path("/sched-hmp"); + if (!sn) + return; + + if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { + if (!strcmp(boost_policy, "boost-on-big")) + boost_policy_dt = SCHED_BOOST_ON_BIG; + else if (!strcmp(boost_policy, "boost-on-all")) + boost_policy_dt = SCHED_BOOST_ON_ALL; + } +} + +int sched_set_boost(int type) +{ + int ret = 0; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&boost_mutex); + + if (verify_boost_params(sysctl_sched_boost, type)) + _sched_set_boost(sysctl_sched_boost, type); + else + ret = -EINVAL; + + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&boost_mutex); + + old_val = *data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (verify_boost_params(old_val, *data)) { + _sched_set_boost(old_val, *data); + } else { + *data = old_val; + ret = -EINVAL; + } + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost(void) +{ + return sysctl_sched_boost; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84563da000cf..a5d101e8a5f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7846,7 +7846,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); update_cluster_topology(); - init_sched_hmp_boost_policy(); init_hrtick(); @@ -7895,7 +7894,7 @@ void __init sched_init(void) BUG_ON(num_possible_cpus() > BITS_PER_LONG); - sched_hmp_parse_dt(); + sched_boost_parse_dt(); init_clusters(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1674b1054f83..3db77aff2433 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2596,6 +2596,7 @@ static u32 __compute_runnable_contrib(u64 n) #define SBC_FLAG_COLOC_CLUSTER 0x10000 #define SBC_FLAG_WAKER_CLUSTER 0x20000 #define SBC_FLAG_BACKUP_CLUSTER 0x40000 +#define SBC_FLAG_BOOST_CLUSTER 0x80000 struct cpu_select_env { struct task_struct *p; @@ -2605,7 +2606,7 @@ struct cpu_select_env { u8 need_waker_cluster:1; u8 sync:1; u8 ignore_prev_cpu:1; - enum sched_boost_type boost_type; + enum sched_boost_policy boost_policy; int prev_cpu; DECLARE_BITMAP(candidate_list, NR_CPUS); DECLARE_BITMAP(backup_list, NR_CPUS); @@ -2705,10 +2706,38 @@ select_least_power_cluster(struct cpu_select_env *env) struct sched_cluster *cluster; if (env->rtg) { - env->task_load = 
scale_load_to_cpu(task_load(env->p), - cluster_first_cpu(env->rtg->preferred_cluster)); - env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; - return env->rtg->preferred_cluster; + int cpu = cluster_first_cpu(env->rtg->preferred_cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), cpu); + + if (task_load_will_fit(env->p, env->task_load, + cpu, env->boost_policy)) { + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_policy == SCHED_BOOST_NONE) + return env->rtg->preferred_cluster; + + for_each_sched_cluster(cluster) { + if (cluster != env->rtg->preferred_cluster) { + __set_bit(cluster->id, + env->backup_list); + __clear_bit(cluster->id, + env->candidate_list); + } + } + + return env->rtg->preferred_cluster; + } + + /* + * Since the task load does not fit on the preferred + * cluster anymore, pretend that the task does not + * have any preferred cluster. This allows the waking + * task to get the appropriate CPU it needs as per the + * non co-location placement policy without having to + * wait until the preferred cluster is updated. + */ + env->rtg = NULL; } for_each_sched_cluster(cluster) { @@ -2718,7 +2747,7 @@ select_least_power_cluster(struct cpu_select_env *env) env->task_load = scale_load_to_cpu(task_load(env->p), cpu); if (task_load_will_fit(env->p, env->task_load, cpu, - env->boost_type) + env->boost_policy)) return cluster; __set_bit(cluster->id, env->backup_list); @@ -2961,7 +2990,14 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, update_spare_capacity(stats, env, i, c->capacity, env->cpu_load); - if (env->boost_type == SCHED_BOOST_ON_ALL || + /* + * need_idle takes precedence over sched boost, but when both + * are set, the idlest CPU within all the clusters is selected + * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the + * big cluster is selected with boost_policy = BOOST_ON_BIG.
+ */ + if ((!env->need_idle && + env->boost_policy != SCHED_BOOST_NONE) || env->need_waker_cluster || sched_cpu_high_irqload(i) || spill_threshold_crossed(env, cpu_rq(i))) @@ -3005,7 +3041,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) struct task_struct *task = env->p; struct sched_cluster *cluster; - if (env->boost_type != SCHED_BOOST_NONE || env->reason || + if (env->boost_policy != SCHED_BOOST_NONE || env->reason || !task->ravg.mark_start || env->need_idle || !sched_short_sleep_task_threshold) return false; @@ -3034,7 +3070,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) cluster = cpu_rq(prev_cpu)->cluster; if (!task_load_will_fit(task, env->task_load, prev_cpu, - sched_boost_type())) { + sched_boost_policy())) { __set_bit(cluster->id, env->backup_list); __clear_bit(cluster->id, env->candidate_list); @@ -3056,7 +3092,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) static inline bool wake_to_waker_cluster(struct cpu_select_env *env) { - return env->boost_type == SCHED_BOOST_NONE && + return env->boost_policy == SCHED_BOOST_NONE && !env->need_idle && !env->reason && env->sync && task_load(current) > sched_big_waker_task_load && task_load(env->p) < sched_small_wakee_task_load; @@ -3098,7 +3134,6 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .reason = reason, .need_idle = wake_to_idle(p), .need_waker_cluster = 0, - .boost_type = sched_boost_type(), .sync = sync, .prev_cpu = target, .ignore_prev_cpu = 0, @@ -3107,6 +3142,9 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .sbc_best_cluster_flag = 0, }; + env.boost_policy = task_sched_boost(p) ? + sched_boost_policy() : SCHED_BOOST_NONE; + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); bitmap_zero(env.backup_list, NR_CPUS); @@ -3178,12 +3216,23 @@ retry: sbc_flag |= env.sbc_best_flag; target = stats.best_cpu; } else { - if (env.rtg) { + if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) { env.rtg = NULL; goto retry; } - find_backup_cluster(&env, &stats); + /* + * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with + * backup_list = little cluster, candidate_list = none and + * stats->best_capacity_cpu points the best spare capacity + * CPU among the CPUs in the big cluster. 
+ */ + if (env.boost_policy == SCHED_BOOST_ON_BIG && + stats.best_capacity_cpu >= 0) + sbc_flag |= SBC_FLAG_BOOST_CLUSTER; + else + find_backup_cluster(&env, &stats); + if (stats.best_capacity_cpu >= 0) { target = stats.best_capacity_cpu; sbc_flag |= SBC_FLAG_BEST_CAP_CPU; @@ -3193,8 +3242,8 @@ retry: out: sbc_flag |= env.sbc_best_cluster_flag; rcu_read_unlock(); - trace_sched_task_load(p, sched_boost(), env.reason, env.sync, - env.need_idle, sbc_flag, target); + trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p), + env.reason, env.sync, env.need_idle, sbc_flag, target); return target; } @@ -3402,11 +3451,9 @@ static inline int migration_needed(struct task_struct *p, int cpu) if (task_will_be_throttled(p)) return 0; - if (sched_boost_type() == SCHED_BOOST_ON_BIG) { - if (cpu_capacity(cpu) != max_capacity) - return UP_MIGRATION; - return 0; - } + if (sched_boost_policy() == SCHED_BOOST_ON_BIG && + cpu_capacity(cpu) != max_capacity && task_sched_boost(p)) + return UP_MIGRATION; if (sched_cpu_high_irqload(cpu)) return IRQLOAD_MIGRATION; @@ -3420,7 +3467,7 @@ static inline int migration_needed(struct task_struct *p, int cpu) return DOWN_MIGRATION; } - if (!grp && !task_will_fit(p, cpu)) { + if (!task_will_fit(p, cpu)) { rcu_read_unlock(); return UP_MIGRATION; } @@ -6648,10 +6695,7 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 -#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \ - LBF_BIG_TASK_ACTIVE_BALANCE) #define LBF_IGNORE_BIG_TASKS 0x100 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 @@ -6682,6 +6726,7 @@ struct lb_env { enum fbq_type fbq_type; struct list_head tasks; + enum sched_boost_policy boost_policy; }; /* @@ -6826,9 +6871,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) && - nr_big_tasks(env->src_rq) && !is_big_task(p)) - return 0; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) { + if (nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + if (env->boost_policy == SCHED_BOOST_ON_BIG && + !task_sched_boost(p)) + return 0; + } twf = task_will_fit(p, env->dst_cpu); @@ -6951,8 +7001,7 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0; - if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) && - !sched_boost()) + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) env->flags |= LBF_IGNORE_BIG_TASKS; else if (!same_cluster(env->dst_cpu, env->src_cpu)) env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; @@ -7255,8 +7304,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) int local_capacity, busiest_capacity; int local_pwr_cost, busiest_pwr_cost; int nr_cpus; + int boost = sched_boost(); - if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + if (!sysctl_sched_restrict_cluster_spill || + boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST) return 0; local_cpu = group_first_cpu(sds->local); @@ -7628,11 +7679,6 @@ static bool update_sd_pick_busiest_active_balance(struct lb_env *env, { if (env->idle != CPU_NOT_IDLE && cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { - if (sched_boost() && !sds->busiest && sgs->sum_nr_running) { - env->flags |= 
LBF_SCHED_BOOST_ACTIVE_BALANCE; - return true; - } - if (sgs->sum_nr_big_tasks > sds->busiest_stat.sum_nr_big_tasks) { env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; @@ -8045,7 +8091,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - if (env->flags & LBF_HMP_ACTIVE_BALANCE) + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) goto force_balance; if (bail_inter_cluster_balance(env, &sds)) @@ -8257,7 +8303,7 @@ static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; - if (env->flags & LBF_HMP_ACTIVE_BALANCE) + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) return 1; if (env->idle == CPU_NEWLY_IDLE) { @@ -8348,20 +8394,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), - .imbalance = 0, - .flags = 0, - .loop = 0, + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, .busiest_nr_running = 0, .busiest_grp_capacity = 0, + .boost_policy = sched_boost_policy(), }; /* @@ -8510,7 +8557,7 @@ more_balance: no_move: if (!ld_moved) { - if (!(env.flags & LBF_HMP_ACTIVE_BALANCE)) + if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) schedstat_inc(sd, lb_failed[idle]); /* @@ -8520,7 +8567,7 @@ no_move: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE && - !(env.flags & LBF_HMP_ACTIVE_BALANCE)) + !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -8797,6 +8844,7 @@ static int active_load_balance_cpu_stop(void *data) .busiest_grp_capacity = 0, .flags = 0, .loop = 0, + .boost_policy = sched_boost_policy(), }; bool moved = false; @@ -9272,7 +9320,8 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) if (rq->nr_running < 2) return 0; - if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + if (!sysctl_sched_restrict_cluster_spill || + sched_boost_policy() == SCHED_BOOST_ON_ALL) return 1; if (cpu_max_power_cost(cpu) == max_power_cost) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 30391aae0822..968a41e0e81e 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include "sched.h" @@ -231,52 +229,6 @@ fail: return ret; } -/* - * It is possible that CPUs of the same micro architecture can have slight - * difference in the efficiency due to other factors like cache size. The - * BOOST_ON_BIG policy may not be optimial for such systems. The required - * boost policy can be specified via device tree to handle this. - */ -static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE; - -/* - * This should be called after clusters are populated and - * the respective efficiency values are initialized. - */ -void init_sched_hmp_boost_policy(void) -{ - /* - * Initialize the boost type here if it is not passed from - * device tree. 
- */ - if (sched_boost_policy == SCHED_BOOST_NONE) { - if (max_possible_efficiency != min_possible_efficiency) - sched_boost_policy = SCHED_BOOST_ON_BIG; - else - sched_boost_policy = SCHED_BOOST_ON_ALL; - } -} - -void sched_hmp_parse_dt(void) -{ - struct device_node *sn; - const char *boost_policy; - - if (!sched_enable_hmp) - return; - - sn = of_find_node_by_path("/sched-hmp"); - if (!sn) - return; - - if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { - if (!strcmp(boost_policy, "boost-on-big")) - sched_boost_policy = SCHED_BOOST_ON_BIG; - else if (!strcmp(boost_policy, "boost-on-all")) - sched_boost_policy = SCHED_BOOST_ON_ALL; - } -} - unsigned int max_possible_efficiency = 1; unsigned int min_possible_efficiency = UINT_MAX; @@ -680,29 +632,6 @@ int __init set_sched_enable_hmp(char *str) early_param("sched_enable_hmp", set_sched_enable_hmp); -int got_boost_kick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - return test_bit(BOOST_KICK, &rq->hmp_flags); -} - -inline void clear_boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - clear_bit(BOOST_KICK, &rq->hmp_flags); -} - -inline void boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) - smp_send_reschedule(cpu); -} - /* Clear any HMP scheduler related requests pending from or on cpu */ void clear_hmp_request(int cpu) { @@ -840,6 +769,9 @@ min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Window size (in ns) */ __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; +/* Maximum allowed threshold before freq aggregation must be enabled */ +#define MAX_FREQ_AGGR_THRESH 1000 + /* Temporarily disable window-stats activity on all cpus */ unsigned int __read_mostly sched_disable_window_stats; @@ -919,8 +851,8 @@ static const unsigned int top_tasks_bitmap_size = * C1 busy time = 5 + 5 + 6 = 16ms * */ -static __read_mostly unsigned int sched_freq_aggregate; -__read_mostly unsigned int sysctl_sched_freq_aggregate; +static __read_mostly unsigned int sched_freq_aggregate = 1; +__read_mostly unsigned int sysctl_sched_freq_aggregate = 1; unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; static unsigned int __read_mostly sched_freq_aggregate_threshold; @@ -937,14 +869,6 @@ unsigned int max_task_load(void) /* Use this knob to turn on or off HMP-aware task placement logic */ unsigned int __read_mostly sched_enable_hmp; -/* - * Scheduler boost is a mechanism to temporarily place tasks on CPUs - * with higher capacity than those where a task would have normally - * ended up with their load characteristics. Any entity enabling - * boost is responsible for disabling it as well. - */ -unsigned int sysctl_sched_boost; - /* A cpu can no longer accommodate more tasks if: * * rq->nr_running > sysctl_sched_spill_nr_run || @@ -995,6 +919,21 @@ unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; unsigned int __read_mostly sched_downmigrate; unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; +/* + * Task groups whose aggregate demand on a cpu is more than + * sched_group_upmigrate need to be up-migrated if possible. + */ +unsigned int __read_mostly sched_group_upmigrate; +unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100; + +/* + * Task groups, once up-migrated, will need to drop their aggregate + * demand to less than sched_group_downmigrate before they are "down" + * migrated. 
+ */ +unsigned int __read_mostly sched_group_downmigrate; +unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95; + /* * The load scale factor of a CPU gets boosted when its max frequency * is restricted due to which the tasks are migrating to higher capacity @@ -1017,33 +956,46 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; -void update_up_down_migrate(void) +static void +_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) { - unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); - unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); unsigned int delta; if (up_down_migrate_scale_factor == 1024) - goto done; + return; - delta = up_migrate - down_migrate; + delta = *up_migrate - *down_migrate; - up_migrate /= NSEC_PER_USEC; - up_migrate *= up_down_migrate_scale_factor; - up_migrate >>= 10; - up_migrate *= NSEC_PER_USEC; + *up_migrate /= NSEC_PER_USEC; + *up_migrate *= up_down_migrate_scale_factor; + *up_migrate >>= 10; + *up_migrate *= NSEC_PER_USEC; - up_migrate = min(up_migrate, sched_ravg_window); + *up_migrate = min(*up_migrate, sched_ravg_window); - down_migrate /= NSEC_PER_USEC; - down_migrate *= up_down_migrate_scale_factor; - down_migrate >>= 10; - down_migrate *= NSEC_PER_USEC; + *down_migrate /= NSEC_PER_USEC; + *down_migrate *= up_down_migrate_scale_factor; + *down_migrate >>= 10; + *down_migrate *= NSEC_PER_USEC; - down_migrate = min(down_migrate, up_migrate - delta); -done: + *down_migrate = min(*down_migrate, *up_migrate - delta); +} + +static void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate); sched_upmigrate = up_migrate; sched_downmigrate = down_migrate; + + up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); + down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate); + sched_group_upmigrate = up_migrate; + sched_group_downmigrate = down_migrate; } void set_hmp_defaults(void) @@ -1134,82 +1086,6 @@ u64 cpu_load_sync(int cpu, int sync) return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); } -static int boost_refcount; -static DEFINE_SPINLOCK(boost_lock); -static DEFINE_MUTEX(boost_mutex); - -static void boost_kick_cpus(void) -{ - int i; - - for_each_online_cpu(i) { - if (cpu_capacity(i) != max_capacity) - boost_kick(i); - } -} - -int sched_boost(void) -{ - return boost_refcount > 0; -} - -int sched_set_boost(int enable) -{ - unsigned long flags; - int ret = 0; - int old_refcount; - - if (!sched_enable_hmp) - return -EINVAL; - - spin_lock_irqsave(&boost_lock, flags); - - old_refcount = boost_refcount; - - if (enable == 1) { - boost_refcount++; - } else if (!enable) { - if (boost_refcount >= 1) - boost_refcount--; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (!old_refcount && boost_refcount) - boost_kick_cpus(); - - if (boost_refcount <= 1) - core_ctl_set_boost(boost_refcount == 1); - trace_sched_set_boost(boost_refcount); - spin_unlock_irqrestore(&boost_lock, flags); - - return ret; -} - -int sched_boost_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - mutex_lock(&boost_mutex); - if (!write) - sysctl_sched_boost = sched_boost(); - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if 
(ret || !write) - goto done; - - ret = (sysctl_sched_boost <= 1) ? - sched_set_boost(sysctl_sched_boost) : -EINVAL; - -done: - mutex_unlock(&boost_mutex); - return ret; -} - /* * Task will fit on a cpu if it's bandwidth consumption on that cpu * will be less than sched_upmigrate. A big task that was previously @@ -1219,60 +1095,63 @@ done: * tasks with load close to the upmigrate threshold */ int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type) + enum sched_boost_policy boost_policy) { - int upmigrate; + int upmigrate = sched_upmigrate; if (cpu_capacity(cpu) == max_capacity) return 1; - if (boost_type != SCHED_BOOST_ON_BIG) { + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (boost_policy != SCHED_BOOST_ON_BIG) { if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) return 1; - upmigrate = sched_upmigrate; - if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) - upmigrate = sched_downmigrate; - if (task_load < upmigrate) return 1; + } else { + if (task_sched_boost(p) || task_load >= upmigrate) + return 0; + + return 1; } return 0; } -enum sched_boost_type sched_boost_type(void) -{ - if (sched_boost()) - return sched_boost_policy; - - return SCHED_BOOST_NONE; -} - int task_will_fit(struct task_struct *p, int cpu) { u64 tload = scale_load_to_cpu(task_load(p), cpu); - return task_load_will_fit(p, tload, cpu, sched_boost_type()); + return task_load_will_fit(p, tload, cpu, sched_boost_policy()); } -int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand) +static int +group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp, + u64 demand, bool group_boost) { int cpu = cluster_first_cpu(cluster); int prev_capacity = 0; - unsigned int threshold = sched_upmigrate; + unsigned int threshold = sched_group_upmigrate; u64 load; if (cluster->capacity == max_capacity) return 1; + if (group_boost) + return 0; + + if (!demand) + return 1; + if (grp->preferred_cluster) prev_capacity = grp->preferred_cluster->capacity; if (cluster->capacity < prev_capacity) - threshold = sched_downmigrate; + threshold = sched_group_downmigrate; load = scale_load_to_cpu(demand, cpu); if (load < threshold) @@ -1495,6 +1374,23 @@ void post_big_task_count_change(const struct cpumask *cpus) DEFINE_MUTEX(policy_mutex); +unsigned int update_freq_aggregate_threshold(unsigned int threshold) +{ + unsigned int old_threshold; + + mutex_lock(&policy_mutex); + + old_threshold = sysctl_sched_freq_aggregate_threshold_pct; + + sysctl_sched_freq_aggregate_threshold_pct = threshold; + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); + + mutex_unlock(&policy_mutex); + + return old_threshold; +} + static inline int invalid_value_freq_input(unsigned int *data) { if (data == &sysctl_sched_freq_aggregate) @@ -1578,7 +1474,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, if (write && (old_val == *data)) goto done; - if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) { + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct || + sysctl_sched_group_downmigrate_pct > + sysctl_sched_group_upmigrate_pct) { *data = old_val; ret = -EINVAL; goto done; @@ -3110,37 +3008,9 @@ static void reset_all_task_stats(void) { struct task_struct *g, *p; - read_lock(&tasklist_lock); do_each_thread(g, p) { - raw_spin_lock_irq(&p->pi_lock); reset_task_stats(p); - raw_spin_unlock_irq(&p->pi_lock); } 
while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -static void disable_window_stats(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - sched_disable_window_stats = 1; - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_restore(flags); -} - -/* Called with all cpu's rq->lock held */ -static void enable_window_stats(void) -{ - sched_disable_window_stats = 0; - } enum reset_reason_code { @@ -3166,16 +3036,21 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) unsigned int old = 0, new = 0; struct related_thread_group *grp; + local_irq_save(flags); + + read_lock(&tasklist_lock); + read_lock(&related_thread_group_lock); - disable_window_stats(); + /* Taking all runqueue locks prevents race with sched_exit(). */ + for_each_possible_cpu(cpu) + raw_spin_lock(&cpu_rq(cpu)->lock); + + sched_disable_window_stats = 1; reset_all_task_stats(); - local_irq_save(flags); - - for_each_possible_cpu(cpu) - raw_spin_lock(&cpu_rq(cpu)->lock); + read_unlock(&tasklist_lock); list_for_each_entry(grp, &related_thread_groups, list) { int j; @@ -3196,7 +3071,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES; } - enable_window_stats(); + sched_disable_window_stats = 0; for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); @@ -3239,10 +3114,10 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) for_each_possible_cpu(cpu) raw_spin_unlock(&cpu_rq(cpu)->lock); - local_irq_restore(flags); - read_unlock(&related_thread_group_lock); + local_irq_restore(flags); + trace_sched_reset_all_window_stats(window_start, window_size, sched_ktime_clock() - start_ts, reason, old, new); } @@ -3824,13 +3699,13 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus) } /* Return cluster which can offer required capacity for group */ -static struct sched_cluster * -best_cluster(struct related_thread_group *grp, u64 total_demand) +static struct sched_cluster *best_cluster(struct related_thread_group *grp, + u64 total_demand, bool group_boost) { struct sched_cluster *cluster = NULL; for_each_sched_cluster(cluster) { - if (group_will_fit(cluster, grp, total_demand)) + if (group_will_fit(cluster, grp, total_demand, group_boost)) return cluster; } @@ -3841,6 +3716,9 @@ static void _set_preferred_cluster(struct related_thread_group *grp) { struct task_struct *p; u64 combined_demand = 0; + bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG; + bool group_boost = false; + u64 wallclock; if (!sysctl_sched_enable_colocation) { grp->last_update = sched_ktime_clock(); @@ -3848,31 +3726,43 @@ static void _set_preferred_cluster(struct related_thread_group *grp) return; } + if (list_empty(&grp->tasks)) + return; + + wallclock = sched_ktime_clock(); + /* * wakeup of two or more related tasks could race with each other and * could result in multiple calls to _set_preferred_cluster being issued * at same time. 
Avoid overhead in such cases of rechecking preferred * cluster */ - if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) + if (wallclock - grp->last_update < sched_ravg_window / 10) return; - list_for_each_entry(p, &grp->tasks, grp_list) + list_for_each_entry(p, &grp->tasks, grp_list) { + if (boost_on_big && task_sched_boost(p)) { + group_boost = true; + break; + } + + if (p->ravg.mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + combined_demand += p->ravg.demand; - grp->preferred_cluster = best_cluster(grp, combined_demand); + } + + grp->preferred_cluster = best_cluster(grp, + combined_demand, group_boost); grp->last_update = sched_ktime_clock(); trace_sched_set_preferred_cluster(grp, combined_demand); } void set_preferred_cluster(struct related_thread_group *grp) { - /* - * Prevent possible deadlock with update_children(). Not updating - * the preferred cluster once is not a big deal. - */ - if (!raw_spin_trylock(&grp->lock)) - return; + raw_spin_lock(&grp->lock); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); } @@ -3880,6 +3770,8 @@ void set_preferred_cluster(struct related_thread_group *grp) #define ADD_TASK 0 #define REM_TASK 1 +#define DEFAULT_CGROUP_COLOC_ID 1 + static inline void free_group_cputime(struct related_thread_group *grp) { free_percpu(grp->cpu_time); @@ -4116,64 +4008,19 @@ static void free_related_thread_group(struct rcu_head *rcu) kfree(grp); } -/* - * The thread group for a task can change while we are here. However, - * add_new_task_to_grp() will take care of any tasks that we miss here. - * When a parent exits, and a child thread is simultaneously exiting, - * sched_set_group_id() will synchronize those operations. - */ -static void update_children(struct task_struct *leader, - struct related_thread_group *grp, int event) -{ - struct task_struct *child; - struct rq *rq; - unsigned long flags; - - if (!thread_group_leader(leader)) - return; - - if (event == ADD_TASK && !sysctl_sched_enable_thread_grouping) - return; - - if (thread_group_empty(leader)) - return; - - child = next_thread(leader); - - do { - rq = task_rq_lock(child, &flags); - - if (event == REM_TASK && child->grp && grp == child->grp) { - transfer_busy_time(rq, grp, child, event); - list_del_init(&child->grp_list); - rcu_assign_pointer(child->grp, NULL); - } else if (event == ADD_TASK && !child->grp) { - transfer_busy_time(rq, grp, child, event); - list_add(&child->grp_list, &grp->tasks); - rcu_assign_pointer(child->grp, grp); - } - - task_rq_unlock(rq, child, &flags); - } while_each_thread(leader, child); - -} - static void remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; struct rq *rq; int empty_group = 1; - unsigned long flags; raw_spin_lock(&grp->lock); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); transfer_busy_time(rq, p->grp, p, REM_TASK); list_del_init(&p->grp_list); rcu_assign_pointer(p->grp, NULL); - task_rq_unlock(rq, p, &flags); - - update_children(p, grp, REM_TASK); + __task_rq_unlock(rq); if (!list_empty(&grp->tasks)) { empty_group = 0; @@ -4182,7 +4029,8 @@ static void remove_task_from_group(struct task_struct *p) raw_spin_unlock(&grp->lock); - if (empty_group) { + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) { list_del(&grp->list); call_rcu(&grp->rcu, free_related_thread_group); } @@ -4192,7 +4040,6 @@ static int add_task_to_group(struct task_struct *p, struct related_thread_group *grp) { struct rq *rq; - unsigned long 
flags; raw_spin_lock(&grp->lock); @@ -4200,13 +4047,11 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) * Change p->grp under rq->lock. Will prevent races with read-side * reference of p->grp in various hot-paths */ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); transfer_busy_time(rq, grp, p, ADD_TASK); list_add(&p->grp_list, &grp->tasks); rcu_assign_pointer(p->grp, grp); - task_rq_unlock(rq, p, &flags); - - update_children(p, grp, ADD_TASK); + __task_rq_unlock(rq); _set_preferred_cluster(grp); @@ -4219,23 +4064,33 @@ void add_new_task_to_grp(struct task_struct *new) { unsigned long flags; struct related_thread_group *grp; - struct task_struct *parent; + struct task_struct *leader = new->group_leader; + unsigned int leader_grp_id = sched_get_group_id(leader); - if (!sysctl_sched_enable_thread_grouping) + if (!sysctl_sched_enable_thread_grouping && + leader_grp_id != DEFAULT_CGROUP_COLOC_ID) return; if (thread_group_leader(new)) return; - parent = new->group_leader; + if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) { + if (!same_schedtune(new, leader)) + return; + } write_lock_irqsave(&related_thread_group_lock, flags); rcu_read_lock(); - grp = task_related_thread_group(parent); + grp = task_related_thread_group(leader); rcu_read_unlock(); - /* Its possible that update_children() already added us to the group */ + /* + * It's possible that someone already added the new task to the + * group. A leader's thread group is updated prior to calling + * this function. It's also possible that the leader has exited + * the group. In either case, there is nothing else to do. + */ if (!grp || new->grp) { write_unlock_irqrestore(&related_thread_group_lock, flags); return; @@ -4250,14 +4105,55 @@ void add_new_task_to_grp(struct task_struct *new) write_unlock_irqrestore(&related_thread_group_lock, flags); } +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +/* + * We create a default colocation group at boot. There is no need to + * synchronize tasks between cgroups at creation time because the + * correct cgroup hierarchy is not available at boot. Therefore cgroup + * colocation is turned off by default even though the colocation group + * itself has been allocated. Furthermore, this colocation group cannot + * be destroyed once it has been created. All of this is done as part + * of runtime optimizations. + * + * The job of synchronizing tasks to the colocation group is done when + * the colocation flag in the cgroup is turned on. + */ +static int __init create_default_coloc_group(void) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + if (IS_ERR(grp)) { + WARN_ON(1); + return -ENOMEM; + } + + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + + update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH); + return 0; +} +late_initcall(create_default_coloc_group); + +int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; + + return sched_set_group_id(p, grp_id); +} +#endif + int sched_set_group_id(struct task_struct *p, unsigned int group_id) { int rc = 0; unsigned long flags; struct related_thread_group *grp = NULL; - /* Prevents tasks from exiting while we are managing groups.
*/ - write_lock_irqsave(&related_thread_group_lock, flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); + write_lock(&related_thread_group_lock); /* Switching from one group to another directly is not permitted */ if ((current != p && p->flags & PF_EXITING) || @@ -4272,6 +4168,12 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) grp = lookup_related_thread_group(group_id); if (!grp) { + /* This is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) { + rc = -EINVAL; + goto done; + } + grp = alloc_related_thread_group(group_id); if (IS_ERR(grp)) { rc = -ENOMEM; @@ -4281,10 +4183,10 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) list_add(&grp->list, &related_thread_groups); } - BUG_ON(!grp); rc = add_task_to_group(p, grp); done: - write_unlock_irqrestore(&related_thread_group_lock, flags); + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return rc; } @@ -4529,7 +4431,7 @@ bool early_detection_notify(struct rq *rq, u64 wallclock) struct task_struct *p; int loop_max = 10; - if (!sched_boost() || !rq->cfs.h_nr_running) + if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running) return 0; rq->ed_task = NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ba4403e910d8..12a04f30ef77 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1677,8 +1677,13 @@ static int find_lowest_rq_hmp(struct task_struct *task) int prev_cpu = task_cpu(task); u64 cpu_load, min_load = ULLONG_MAX; int i; - int restrict_cluster = sched_boost() ? 0 : - sysctl_sched_restrict_cluster_spill; + int restrict_cluster; + int boost_on_big; + + boost_on_big = sched_boost() == FULL_THROTTLE_BOOST && + sched_boost_policy() == SCHED_BOOST_ON_BIG; + + restrict_cluster = sysctl_sched_restrict_cluster_spill; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1697,6 +1702,9 @@ static int find_lowest_rq_hmp(struct task_struct *task) */ for_each_sched_cluster(cluster) { + if (boost_on_big && cluster->capacity != max_possible_capacity) + continue; + cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); cpumask_andnot(&candidate_mask, &candidate_mask, cpu_isolated_mask); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4289bf6cd642..30838bb9b442 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1061,8 +1061,6 @@ extern unsigned int max_load_scale_factor; extern unsigned int max_possible_capacity; extern unsigned int min_max_possible_capacity; extern unsigned int max_power_cost; -extern unsigned int sched_upmigrate; -extern unsigned int sched_downmigrate; extern unsigned int sched_init_task_load_windows; extern unsigned int up_down_migrate_scale_factor; extern unsigned int sysctl_sched_restrict_cluster_spill; @@ -1106,18 +1104,23 @@ extern void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock); extern unsigned int cpu_temp(int cpu); extern unsigned int nr_eligible_big_tasks(int cpu); -extern void update_up_down_migrate(void); extern int update_preferred_cluster(struct related_thread_group *grp, struct task_struct *p, u32 old_load); extern void set_preferred_cluster(struct related_thread_group *grp); extern void add_new_task_to_grp(struct task_struct *new); +extern unsigned int update_freq_aggregate_threshold(unsigned int threshold); -enum sched_boost_type { +enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, SCHED_BOOST_ON_ALL, }; +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define 
RESTRAINED_BOOST 3 + static inline struct sched_cluster *cpu_cluster(int cpu) { return cpu_rq(cpu)->cluster; } @@ -1387,14 +1390,11 @@ extern void set_hmp_defaults(void); extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost); extern unsigned int power_cost(int cpu, u64 demand); extern void reset_all_window_stats(u64 window_start, unsigned int window_size); -extern void boost_kick(int cpu); extern int sched_boost(void); extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type); -extern enum sched_boost_type sched_boost_type(void); + enum sched_boost_policy boost_policy); +extern enum sched_boost_policy sched_boost_policy(void); extern int task_will_fit(struct task_struct *p, int cpu); -extern int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand); extern u64 cpu_load(int cpu); extern u64 cpu_load_sync(int cpu, int sync); extern int preferred_cluster(struct sched_cluster *cluster, @@ -1422,10 +1422,32 @@ extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, struct cftype *cft); extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 upmigrate_discourage); -extern void sched_hmp_parse_dt(void); -extern void init_sched_hmp_boost_policy(void); +extern void sched_boost_parse_dt(void); extern void clear_top_tasks_bitmap(unsigned long *bitmap); +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +extern bool task_sched_boost(struct task_struct *p); +extern int sync_cgroup_colocation(struct task_struct *p, bool insert); +extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2); +extern void update_cgroup_boost_settings(void); +extern void restore_cgroup_boost_settings(void); + +#else +static inline bool +same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return true; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} + +static inline void update_cgroup_boost_settings(void) { } +static inline void restore_cgroup_boost_settings(void) { } +#endif + #else /* CONFIG_SCHED_HMP */ struct hmp_sched_stats; @@ -1615,8 +1637,7 @@ static inline void post_big_task_count_change(void) { } static inline void set_hmp_defaults(void) { } static inline void clear_reserved(int cpu) { } -static inline void sched_hmp_parse_dt(void) {} -static inline void init_sched_hmp_boost_policy(void) {} +static inline void sched_boost_parse_dt(void) {} #define trace_sched_cpu_load(...) #define trace_sched_cpu_load_lb(...) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4f8182302e5e..ee2af8e0b5ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -25,6 +25,33 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; +#ifdef CONFIG_SCHED_HMP + /* Toggle ability to override sched boost enabled */ + bool sched_boost_no_override; + + /* + * Controls whether a cgroup is eligible for sched boost or not. This + * can temporarily be disabled by the kernel based on the no_override + * flag above. + */ + bool sched_boost_enabled; + + /* + * This tracks the default value of sched_boost_enabled and is used to + * restore the value following any temporary changes to that flag. + */ + bool sched_boost_enabled_backup; + + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on.
+ */ + bool colocate; + + /* Controls whether further updates are allowed to the colocate flag */ + bool colocate_update_disabled; +#endif + }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -54,6 +81,13 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, +#ifdef CONFIG_SCHED_HMP + .sched_boost_no_override = false, + .sched_boost_enabled = true, + .sched_boost_enabled_backup = true, + .colocate = false, + .colocate_update_disabled = false, +#endif }; /* @@ -97,6 +131,121 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +#ifdef CONFIG_SCHED_HMP +static inline void init_sched_boost(struct schedtune *st) +{ + st->sched_boost_no_override = false; + st->sched_boost_enabled = true; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + st->colocate = false; + st->colocate_update_disabled = false; +} + +bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return task_schedtune(tsk1) == task_schedtune(tsk2); +} + +void update_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + if (allocated_group[i]->sched_boost_no_override) + continue; + + allocated_group[i]->sched_boost_enabled = false; + } +} + +void restore_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + allocated_group[i]->sched_boost_enabled = + allocated_group[i]->sched_boost_enabled_backup; + } +} + +bool task_sched_boost(struct task_struct *p) +{ + struct schedtune *st = task_schedtune(p); + + return st->sched_boost_enabled; +} + +static u64 +sched_boost_override_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_no_override; +} + +static int sched_boost_override_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 override) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_no_override = !!override; + + return 0; +} + +static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_enabled; +} + +static int sched_boost_enabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 enable) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_enabled = !!enable; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + + return 0; +} + +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct schedtune *st = css_st(css); + + if (st->colocate_update_disabled) + return -EPERM; + + st->colocate = !!colocate; + st->colocate_update_disabled = true; + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_sched_boost(struct schedtune *st) { } + +#endif /* CONFIG_SCHED_HMP */ + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -121,12 +270,45 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, return 0; } +static void schedtune_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct schedtune *st; + bool colocate; + + cgroup_taskset_first(tset, &css); 
+ st = css_st(css); + + colocate = st->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + static struct cftype files[] = { { .name = "boost", .read_u64 = boost_read, .write_u64 = boost_write, }, +#ifdef CONFIG_SCHED_HMP + { + .name = "sched_boost_no_override", + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "sched_boost_enabled", + .read_u64 = sched_boost_enabled_read, + .write_u64 = sched_boost_enabled_write, + }, + { + .name = "colocate", + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif { } /* terminate */ }; @@ -189,6 +371,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) /* Initialize per CPUs boost group support */ st->idx = idx; + init_sched_boost(st); if (schedtune_boostgroup_init(st)) goto release; @@ -222,6 +405,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .legacy_cftypes = files, .early_init = 1, .allow_attach = subsys_cgroup_allow_attach, + .attach = schedtune_attach, }; #endif /* CONFIG_CGROUP_SCHEDTUNE */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 574316f1c344..b7cbd7940f7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,6 +124,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; @@ -376,6 +377,22 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "sched_group_upmigrate", + .data = &sysctl_sched_group_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { + .procname = "sched_group_downmigrate", + .data = &sysctl_sched_group_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, { .procname = "sched_init_task_load", .data = &sysctl_sched_init_task_load_pct, @@ -487,6 +504,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_boost_handler, + .extra1 = &zero, + .extra2 = &three, }, #endif /* CONFIG_SCHED_HMP */ #ifdef CONFIG_SCHED_DEBUG
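Note on the new boost interface: sysctl_sched_boost now takes four discrete values (NO_BOOST=0, FULL_THROTTLE_BOOST=1, CONSERVATIVE_BOOST=2, RESTRAINED_BOOST=3), and verify_boost_params() only accepts transitions between the off state and one of the boost types; on-to-on or off-to-off writes fail with -EINVAL. The following is a minimal host-side sketch of that rule, not kernel code; it simply mirrors the expression used in boost.c above.

/* Illustration only: mirrors verify_boost_params() from kernel/sched/boost.c. */
#include <stdio.h>
#include <stdbool.h>

enum { NO_BOOST, FULL_THROTTLE_BOOST, CONSERVATIVE_BOOST, RESTRAINED_BOOST };

static bool verify_boost_params(int old_val, int new_val)
{
	/* only off <-> on transitions are permitted */
	return !(!!old_val == !!new_val);
}

int main(void)
{
	int cases[][2] = {
		{ NO_BOOST, FULL_THROTTLE_BOOST },		/* accepted */
		{ NO_BOOST, RESTRAINED_BOOST },			/* accepted */
		{ CONSERVATIVE_BOOST, NO_BOOST },		/* accepted */
		{ FULL_THROTTLE_BOOST, CONSERVATIVE_BOOST },	/* rejected */
		{ NO_BOOST, NO_BOOST },				/* rejected */
	};
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("%d -> %d: %s\n", cases[i][0], cases[i][1],
		       verify_boost_params(cases[i][0], cases[i][1]) ?
		       "accepted" : "rejected (-EINVAL)");
	return 0;
}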
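Entering CONSERVATIVE_BOOST clears sched_boost_enabled for every schedtune group that has not opted out through sched_boost_no_override, and returning to NO_BOOST restores the saved defaults. Below is a compact model of that bookkeeping with made-up group names; it parallels update_cgroup_boost_settings()/restore_cgroup_boost_settings() but is only a sketch under those assumptions.

/* Host-side model of the per-cgroup boost override bookkeeping. */
#include <stdbool.h>
#include <stdio.h>

struct st_model {
	const char *name;			/* hypothetical cgroup names */
	bool sched_boost_no_override;
	bool sched_boost_enabled;
	bool sched_boost_enabled_backup;
};

static struct st_model groups[] = {
	{ "top-app",    true,  true, true },
	{ "background", false, true, true },
};

#define NR_GROUPS (sizeof(groups) / sizeof(groups[0]))

static void enter_conservative_boost(void)	/* like update_cgroup_boost_settings() */
{
	unsigned int i;

	for (i = 0; i < NR_GROUPS; i++)
		if (!groups[i].sched_boost_no_override)
			groups[i].sched_boost_enabled = false;
}

static void exit_boost(void)			/* like restore_cgroup_boost_settings() */
{
	unsigned int i;

	for (i = 0; i < NR_GROUPS; i++)
		groups[i].sched_boost_enabled =
			groups[i].sched_boost_enabled_backup;
}

int main(void)
{
	enter_conservative_boost();
	printf("conservative boost: top-app=%d background=%d\n",
	       groups[0].sched_boost_enabled, groups[1].sched_boost_enabled);
	exit_boost();
	printf("boost off:          top-app=%d background=%d\n",
	       groups[0].sched_boost_enabled, groups[1].sched_boost_enabled);
	return 0;
}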
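The new sched_group_upmigrate/sched_group_downmigrate pair gives colocated groups their own hysteresis band, separate from the per-task thresholds: group_will_fit() compares the group's scaled demand against the lower threshold only when considering a smaller cluster than the current preferred one. The sketch below models that decision; the flat threshold units and helper names are simplifications for illustration, not the kernel's pct_to_real()/scale_load_to_cpu() arithmetic.

/* Simplified model of the group placement hysteresis in group_will_fit(). */
#include <stdbool.h>
#include <stdio.h>

struct cluster_model { unsigned int capacity; };

static const unsigned int max_capacity = 1024;
static unsigned int group_upmigrate = 100;	/* illustrative units */
static unsigned int group_downmigrate = 95;

static bool group_fits(const struct cluster_model *cl,
		       const struct cluster_model *prev,
		       unsigned long long demand, bool group_boost)
{
	unsigned int threshold = group_upmigrate;

	if (cl->capacity == max_capacity)
		return true;		/* biggest cluster always fits */
	if (group_boost)
		return false;		/* a boosted member forces the big cluster */
	if (!demand)
		return true;		/* idle group can stay where it is */
	if (prev && cl->capacity < prev->capacity)
		threshold = group_downmigrate;	/* hysteresis on the way down */

	return demand < threshold;
}

int main(void)
{
	struct cluster_model little = { 512 }, big = { 1024 };

	/* demand of 97 stays on the little cluster only if it is already there */
	printf("little, coming from big:    %d\n", group_fits(&little, &big, 97, false));
	printf("little, already preferred:  %d\n", group_fits(&little, &little, 97, false));
	printf("big:                        %d\n", group_fits(&big, &little, 97, false));
	return 0;
}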
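_set_preferred_cluster() now skips group members whose window statistics are stale (mark_start older than sched_ravg_window * sched_ravg_hist_size) and treats the whole group as boosted as soon as one member is sched-boosted while boost-on-big is in effect. The sketch below walks through that aggregation with invented numbers; the field names follow the patch, the constants are purely illustrative.

/* Sketch of the demand aggregation done by _set_preferred_cluster(). */
#include <stdbool.h>
#include <stdio.h>

struct member_model {
	unsigned long long demand;	/* p->ravg.demand */
	unsigned long long mark_start;	/* p->ravg.mark_start */
	bool sched_boosted;		/* task_sched_boost(p) */
};

int main(void)
{
	const unsigned long long window = 20000000ULL;	/* 20ms, illustrative */
	const unsigned int hist_size = 5;
	const unsigned long long now = 100 * window;	/* stand-in wallclock */
	const bool boost_on_big = false;

	struct member_model m[] = {
		{ 8000000ULL, now - window,      false },	/* counted */
		{ 6000000ULL, now - 50 * window, false },	/* stale, skipped */
		{ 1000000ULL, now - 2 * window,  false },	/* counted */
	};
	unsigned long long combined = 0;
	bool group_boost = false;
	unsigned int i;

	for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) {
		if (boost_on_big && m[i].sched_boosted) {
			group_boost = true;
			break;
		}
		if (m[i].mark_start < now - window * hist_size)
			continue;	/* ignore members not seen recently */
		combined += m[i].demand;
	}
	printf("combined demand=%llu group_boost=%d\n", combined, group_boost);
	return 0;
}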
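Finally, the schedtune.colocate attribute ties a cgroup to the reserved colocation group (DEFAULT_CGROUP_COLOC_ID): tasks attached to the cgroup after the flag is set are synced into group id 1 by schedtune_attach()/sync_cgroup_colocation(), and the flag can be written only once because colocate_update_disabled latches. A hedged user-space example follows; the cgroup mount point and group name are assumptions, only the attribute name comes from the patch.

/* Example only: enable colocation for one schedtune cgroup. */
#include <stdio.h>

int main(void)
{
	/* Mount point and group name are assumed; adjust for the target system. */
	const char *path = "/dev/stune/top-app/schedtune.colocate";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/*
	 * Tasks attached to this cgroup from now on are placed in the reserved
	 * colocation group. A second write is expected to fail because the
	 * flag can only be updated once.
	 */
	if (fputs("1\n", f) == EOF)
		perror("write");
	fclose(f);
	return 0;
}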