From 8b74c7eb5f0b8badc7d76122d6e359bd43115b58 Mon Sep 17 00:00:00 2001 From: Syed Rameez Mustafa Date: Wed, 9 Nov 2016 17:22:23 -0800 Subject: [PATCH 1/3] sched: Remove thread group iteration from colocation

Iterating over a leader task's thread group in order to add its threads to a colocation group involves a complex locking chain that ends up causing a deadlock. The deadlock unfolds as follows when the same task is being referenced on three different CPUs:

-----                         -----                          -----
CPU 0                         CPU 1                          CPU 2
-----                         -----                          -----
                              add_task_to_group(p)
__schedule(prev = p)          write_lock(                    ttwu(p)
                                related_thread_grp_lock)     lock(pi_lock)
idle_balance()                                               wait for p->on_cpu
load_balance()                unable to acquire p->pi_lock
send_notification()
wait for read_lock(
  related_thread_grp_lock)
unable to set p->on_cpu

There are a couple of ways to resolve this deadlock in the kernel; however, none of them is trivial. For the sake of simplicity, move the responsibility of thread group iteration back to userspace. This applies to both adding the leader task to a colocation group and removing it from one. The kernel would continue to automatically add newly forked children of the colocated leader to the colocation group.

This still leaves an issue with the locking order of the pi_lock and the related_thread_group_lock. To solve all deadlocks, we need to avoid taking the pi_lock in reset_all_task_stats() and instead rely on a more heavy-handed approach of taking all rq locks. The pi_lock was taken to avoid a race between reset_all_task_stats() and sched_exit(); the race can be avoided with rq locks as well.

Change-Id: I15323e3ef91401142d3841db59c18fd8fee753fd Signed-off-by: Syed Rameez Mustafa --- kernel/sched/hmp.c | 126 ++++++++++----------------------------------- 1 file changed, 28 insertions(+), 98 deletions(-) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 30391aae0822..c1c35bb582fb 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -3110,37 +3110,9 @@ static void reset_all_task_stats(void) { struct task_struct *g, *p; - read_lock(&tasklist_lock); do_each_thread(g, p) { - raw_spin_lock_irq(&p->pi_lock); reset_task_stats(p); - raw_spin_unlock_irq(&p->pi_lock); } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -static void disable_window_stats(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - sched_disable_window_stats = 1; - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_restore(flags); -} - -/* Called with all cpu's rq->lock held */ -static void enable_window_stats(void) -{ - sched_disable_window_stats = 0; - } enum reset_reason_code { @@ -3166,16 +3138,21 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) unsigned int old = 0, new = 0; struct related_thread_group *grp; + local_irq_save(flags); + + read_lock(&tasklist_lock); + read_lock(&related_thread_group_lock); - disable_window_stats(); + /* Taking all runqueue locks prevents race with sched_exit(). 
*/ + for_each_possible_cpu(cpu) + raw_spin_lock(&cpu_rq(cpu)->lock); + + sched_disable_window_stats = 1; reset_all_task_stats(); - local_irq_save(flags); - - for_each_possible_cpu(cpu) - raw_spin_lock(&cpu_rq(cpu)->lock); + read_unlock(&tasklist_lock); list_for_each_entry(grp, &related_thread_groups, list) { int j; @@ -3196,7 +3173,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES; } - enable_window_stats(); + sched_disable_window_stats = 0; for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); @@ -3239,10 +3216,10 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) for_each_possible_cpu(cpu) raw_spin_unlock(&cpu_rq(cpu)->lock); - local_irq_restore(flags); - read_unlock(&related_thread_group_lock); + local_irq_restore(flags); + trace_sched_reset_all_window_stats(window_start, window_size, sched_ktime_clock() - start_ts, reason, old, new); } @@ -3867,12 +3844,7 @@ static void _set_preferred_cluster(struct related_thread_group *grp) void set_preferred_cluster(struct related_thread_group *grp) { - /* - * Prevent possible deadlock with update_children(). Not updating - * the preferred cluster once is not a big deal. - */ - if (!raw_spin_trylock(&grp->lock)) - return; + raw_spin_lock(&grp->lock); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); } @@ -4116,64 +4088,19 @@ static void free_related_thread_group(struct rcu_head *rcu) kfree(grp); } -/* - * The thread group for a task can change while we are here. However, - * add_new_task_to_grp() will take care of any tasks that we miss here. - * When a parent exits, and a child thread is simultaneously exiting, - * sched_set_group_id() will synchronize those operations. - */ -static void update_children(struct task_struct *leader, - struct related_thread_group *grp, int event) -{ - struct task_struct *child; - struct rq *rq; - unsigned long flags; - - if (!thread_group_leader(leader)) - return; - - if (event == ADD_TASK && !sysctl_sched_enable_thread_grouping) - return; - - if (thread_group_empty(leader)) - return; - - child = next_thread(leader); - - do { - rq = task_rq_lock(child, &flags); - - if (event == REM_TASK && child->grp && grp == child->grp) { - transfer_busy_time(rq, grp, child, event); - list_del_init(&child->grp_list); - rcu_assign_pointer(child->grp, NULL); - } else if (event == ADD_TASK && !child->grp) { - transfer_busy_time(rq, grp, child, event); - list_add(&child->grp_list, &grp->tasks); - rcu_assign_pointer(child->grp, grp); - } - - task_rq_unlock(rq, child, &flags); - } while_each_thread(leader, child); - -} - static void remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; struct rq *rq; int empty_group = 1; - unsigned long flags; raw_spin_lock(&grp->lock); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); transfer_busy_time(rq, p->grp, p, REM_TASK); list_del_init(&p->grp_list); rcu_assign_pointer(p->grp, NULL); - task_rq_unlock(rq, p, &flags); - - update_children(p, grp, REM_TASK); + __task_rq_unlock(rq); if (!list_empty(&grp->tasks)) { empty_group = 0; @@ -4192,7 +4119,6 @@ static int add_task_to_group(struct task_struct *p, struct related_thread_group *grp) { struct rq *rq; - unsigned long flags; raw_spin_lock(&grp->lock); @@ -4200,13 +4126,11 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) * Change p->grp under rq->lock. 
Will prevent races with read-side * reference of p->grp in various hot-paths */ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); transfer_busy_time(rq, grp, p, ADD_TASK); list_add(&p->grp_list, &grp->tasks); rcu_assign_pointer(p->grp, grp); - task_rq_unlock(rq, p, &flags); - - update_children(p, grp, ADD_TASK); + __task_rq_unlock(rq); _set_preferred_cluster(grp); @@ -4235,7 +4159,12 @@ void add_new_task_to_grp(struct task_struct *new) grp = task_related_thread_group(parent); rcu_read_unlock(); - /* Its possible that update_children() already added us to the group */ + /* + * It's possible that someone already added the new task to the + * group. A leader's thread group is updated prior to calling + * this function. It's also possible that the leader has exited + * the group. In either case, there is nothing else to do. + */ if (!grp || new->grp) { write_unlock_irqrestore(&related_thread_group_lock, flags); return; } @@ -4256,8 +4185,8 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) unsigned long flags; struct related_thread_group *grp = NULL; - /* Prevents tasks from exiting while we are managing groups. */ - write_lock_irqsave(&related_thread_group_lock, flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); + write_lock(&related_thread_group_lock); /* Switching from one group to another directly is not permitted */ if ((current != p && p->flags & PF_EXITING) || @@ -4284,7 +4213,8 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) BUG_ON(!grp); rc = add_task_to_group(p, grp); done: - write_unlock_irqrestore(&related_thread_group_lock, flags); + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return rc; }

From fd5b530593795c9aa145c7f9aa3eb817d3841af3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 23 Sep 2016 12:21:55 -0700 Subject: [PATCH 2/3] sched: revise boost logic when boost_type is SCHED_BOOST_ON_BIG

At present, HMP scheduler boost tends to pack tasks by taking power cost and C-states into account. This is suboptimal for performance as it can lead to preemption and higher latency. Revise the logic to prefer the least loaded CPU among the big cluster CPUs when the boost type is SCHED_BOOST_ON_BIG. The new logic still honours the behaviour that the scheduler can place tasks on the little CPUs when the big CPUs are all overcommitted.

Also, it was found that need_idle with boost can easily return the previous CPU when no idle CPU is found. Fix this issue by making the need_idle flag take precedence over sched_boost. 
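To summarise the intended precedence, here is a minimal C sketch (illustrative only; the function and parameter names are hypothetical and the inputs are assumed to be precomputed candidate CPUs, this is not the actual kernel code):

  #include <stdbool.h>

  enum boost_type { BOOST_NONE, BOOST_ON_BIG, BOOST_ON_ALL };

  int pick_cpu(bool need_idle, enum boost_type boost,
               int idlest_cpu_all, int idlest_cpu_big,
               int least_loaded_big_cpu, int least_power_cpu)
  {
          if (need_idle)                  /* need_idle wins over boost */
                  return boost == BOOST_ON_BIG ? idlest_cpu_big
                                               : idlest_cpu_all;

          if (boost == BOOST_ON_BIG)      /* least loaded big CPU */
                  return least_loaded_big_cpu;

          return least_power_cpu;         /* default placement policy */
  }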
CRs-fixed: 1074879 Change-Id: I470bcd0588e038b4a540d337fe6a412f2fa74920 Signed-off-by: Joonwoo Park Signed-off-by: Syed Rameez Mustafa --- kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1674b1054f83..40c2e7ac69e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2596,6 +2596,7 @@ static u32 __compute_runnable_contrib(u64 n) #define SBC_FLAG_COLOC_CLUSTER 0x10000 #define SBC_FLAG_WAKER_CLUSTER 0x20000 #define SBC_FLAG_BACKUP_CLUSTER 0x40000 +#define SBC_FLAG_BOOST_CLUSTER 0x80000 struct cpu_select_env { struct task_struct *p; @@ -2708,6 +2709,18 @@ select_least_power_cluster(struct cpu_select_env *env) env->task_load = scale_load_to_cpu(task_load(env->p), cluster_first_cpu(env->rtg->preferred_cluster)); env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_type != SCHED_BOOST_NONE) { + for_each_sched_cluster(cluster) { + if (cluster != env->rtg->preferred_cluster) { + __set_bit(cluster->id, + env->backup_list); + __clear_bit(cluster->id, + env->candidate_list); + } + } + } + return env->rtg->preferred_cluster; } @@ -2961,7 +2974,13 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, update_spare_capacity(stats, env, i, c->capacity, env->cpu_load); - if (env->boost_type == SCHED_BOOST_ON_ALL || + /* + * need_idle takes precedence over sched boost but when both + * are set, idlest CPU with in all the clusters is selected + * when boost_type = BOOST_ON_ALL whereas idlest CPU in the + * big cluster is selected within boost_type = BOOST_ON_BIG. + */ + if ((!env->need_idle && env->boost_type != SCHED_BOOST_NONE) || env->need_waker_cluster || sched_cpu_high_irqload(i) || spill_threshold_crossed(env, cpu_rq(i))) @@ -3178,12 +3197,23 @@ retry: sbc_flag |= env.sbc_best_flag; target = stats.best_cpu; } else { - if (env.rtg) { + if (env.rtg && env.boost_type == SCHED_BOOST_NONE) { env.rtg = NULL; goto retry; } - find_backup_cluster(&env, &stats); + /* + * With boost_type == SCHED_BOOST_ON_BIG, we reach here with + * backup_list = little cluster, candidate_list = none and + * stats->best_capacity_cpu points the best spare capacity + * CPU among the CPUs in the big cluster. + */ + if (env.boost_type == SCHED_BOOST_ON_BIG && + stats.best_capacity_cpu >= 0) + sbc_flag |= SBC_FLAG_BOOST_CLUSTER; + else + find_backup_cluster(&env, &stats); + if (stats.best_capacity_cpu >= 0) { target = stats.best_capacity_cpu; sbc_flag |= SBC_FLAG_BEST_CAP_CPU; From 30fc7742350f45a6b3667880de0a72c68509ccc5 Mon Sep 17 00:00:00 2001 From: Syed Rameez Mustafa Date: Wed, 31 Aug 2016 16:54:12 -0700 Subject: [PATCH 3/3] sched/hmp: Enhance co-location and scheduler boost features The recent introduction of the schedtune cgroup controller has provided the scheduler with added flexibility in terms of some of it's placement features. In particular each cgroup under the schedtune controller can now specify: 1) Whether it needs co-location along with other cgroups 2) Whether it is eligible for scheduler boost (sched_boost_enabled) 3) Whether the kernel can override the boost eligibility when necessary (sched_boost_no_override) The scheduler now creates a reserved co-location group at boot. This group is used to co-locate all tasks that form part of any one of the cgroups that have co-location enabled. This reserved group can neither be destroyed nor reused for other purposes. Furthermore, cgroups are only allowed to indicate their co-location preference once at boot. 
Further updates are disallowed. Since we are now creating co-location groups for an extended period of time, there are a few other factors to consider when determining the preferred cluster for the group. We first exclude any tasks in the group that have not been observed to be running for a significant amount of time. Secondly we introduce the notion of group up and down migrate tunables to allow different migration policies than individual tasks. Lastly we break co-location if a single task in a group exceeds up-migrate but the total load of the group does not exceed group up-migrate. In terms of sched_boost, the scheduler now supports multiple types of boost. These are: 1) FULL_THROTTLE : Force up-migrate tasks belonging any cgroup that has the sched_boost_enabled flag turned on. Little CPUs will only be used when big CPUs can no longer accommodate tasks. Also up-migrate all RT tasks. 2) CONSERVATIVE : Override the sched_boost_enabled flag for all cgroups except those that have the sched_boost_no_override flag set. Force up-migrate all tasks belonging to only those cgroups that still remain eligible for boost. RT tasks do not get force up migrated. 3) RESTRAINED : Start frequency aggregation for co-located tasks. This type of boost does not force up-migrate any task. Finally the boost API removes ref-counting. This means that there can only be a single entity using boost at any given time. If multiple entities are managing boost, they are required to be well behaved so that they don't interfere with one another. Even for a single client, it is not possible to switch directly from one boost type to another. Boost must be first turned off before switching over to a new type. Change-Id: I8d224a70cbef162f27078b62b73acaa22670861d Signed-off-by: Syed Rameez Mustafa --- include/linux/sched/sysctl.h | 2 + include/trace/events/sched.h | 25 ++- kernel/sched/Makefile | 2 +- kernel/sched/boost.c | 226 ++++++++++++++++++++ kernel/sched/core.c | 3 +- kernel/sched/fair.c | 131 +++++++----- kernel/sched/hmp.c | 386 ++++++++++++++++------------------- kernel/sched/rt.c | 12 +- kernel/sched/sched.h | 47 +++-- kernel/sched/tune.c | 184 +++++++++++++++++ kernel/sysctl.c | 19 ++ 11 files changed, 746 insertions(+), 291 deletions(-) create mode 100644 kernel/sched/boost.c diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 6848454c5447..5d0899df64ff 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -53,6 +53,8 @@ extern unsigned int sysctl_sched_spill_nr_run; extern unsigned int sysctl_sched_spill_load_pct; extern unsigned int sysctl_sched_upmigrate_pct; extern unsigned int sysctl_sched_downmigrate_pct; +extern unsigned int sysctl_sched_group_upmigrate_pct; +extern unsigned int sysctl_sched_group_downmigrate_pct; extern unsigned int sysctl_early_detection_duration; extern unsigned int sysctl_sched_boost; extern unsigned int sysctl_sched_small_wakee_task_load_pct; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7778ff3947de..72bbed9ad5db 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -133,6 +133,7 @@ TRACE_EVENT(sched_task_load, __field( u32, flags ) __field( int, best_cpu ) __field( u64, latency ) + __field( int, grp_id ) ), TP_fast_assign( @@ -148,12 +149,13 @@ TRACE_EVENT(sched_task_load, __entry->latency = p->state == TASK_WAKING ? sched_ktime_clock() - p->ravg.mark_start : 0; + __entry->grp_id = p->grp ? 
p->grp->id : 0; ), - TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x best_cpu=%d latency=%llu", + TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu", __entry->pid, __entry->comm, __entry->demand, __entry->boost, __entry->reason, __entry->sync, - __entry->need_idle, __entry->flags, + __entry->need_idle, __entry->flags, __entry->grp_id, __entry->best_cpu, __entry->latency) ); @@ -164,9 +166,12 @@ TRACE_EVENT(sched_set_preferred_cluster, TP_ARGS(grp, total_demand), TP_STRUCT__entry( - __field( int, id ) - __field( u64, demand ) - __field( int, cluster_first_cpu ) + __field( int, id ) + __field( u64, demand ) + __field( int, cluster_first_cpu ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, task_demand ) ), TP_fast_assign( @@ -245,19 +250,19 @@ DEFINE_EVENT(sched_cpu_load, sched_cpu_load_cgroup, TRACE_EVENT(sched_set_boost, - TP_PROTO(int ref_count), + TP_PROTO(int type), - TP_ARGS(ref_count), + TP_ARGS(type), TP_STRUCT__entry( - __field(unsigned int, ref_count ) + __field(int, type ) ), TP_fast_assign( - __entry->ref_count = ref_count; + __entry->type = type; ), - TP_printk("ref_count=%d", __entry->ref_count) + TP_printk("type %d", __entry->type) ); #if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7d0d34c53e08..7c0382a3eace 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,7 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o -obj-$(CONFIG_SCHED_HMP) += hmp.o +obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c new file mode 100644 index 000000000000..fcfda385b74a --- /dev/null +++ b/kernel/sched/boost.c @@ -0,0 +1,226 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "sched.h" +#include +#include +#include + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. 
+ */ + +unsigned int sysctl_sched_boost; +static enum sched_boost_policy boost_policy; +static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE; +static DEFINE_MUTEX(boost_mutex); +static unsigned int freq_aggr_threshold_backup; + +static inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +static void boost_kick_cpus(void) +{ + int i; + struct cpumask kick_mask; + + if (boost_policy != SCHED_BOOST_ON_BIG) + return; + + cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask); + + for_each_cpu(i, &kick_mask) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +/* + * Scheduler boost type and boost policy might at first seem unrelated, + * however, there exists a connection between them that will allow us + * to use them interchangeably during placement decisions. We'll explain + * the connection here in one possible way so that the implications are + * clear when looking at placement policies. + * + * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED + * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can + * neither be none nor RESTRAINED. + */ +static void set_boost_policy(int type) +{ + if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) { + boost_policy = SCHED_BOOST_NONE; + return; + } + + if (boost_policy_dt) { + boost_policy = boost_policy_dt; + return; + } + + if (min_possible_efficiency != max_possible_efficiency) { + boost_policy = SCHED_BOOST_ON_BIG; + return; + } + + boost_policy = SCHED_BOOST_ON_ALL; +} + +enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +static bool verify_boost_params(int old_val, int new_val) +{ + /* + * Boost can only be turned on or off. There is no possiblity of + * switching from one boost type to another or to set the same + * kind of boost several times. 
+ */ + return !(!!old_val == !!new_val); +} + +static void _sched_set_boost(int old_val, int type) +{ + switch (type) { + case NO_BOOST: + if (old_val == FULL_THROTTLE_BOOST) + core_ctl_set_boost(false); + else if (old_val == CONSERVATIVE_BOOST) + restore_cgroup_boost_settings(); + else + update_freq_aggregate_threshold( + freq_aggr_threshold_backup); + break; + + case FULL_THROTTLE_BOOST: + core_ctl_set_boost(true); + boost_kick_cpus(); + break; + + case CONSERVATIVE_BOOST: + update_cgroup_boost_settings(); + boost_kick_cpus(); + break; + + case RESTRAINED_BOOST: + freq_aggr_threshold_backup = + update_freq_aggregate_threshold(1); + break; + + default: + WARN_ON(1); + return; + } + + set_boost_policy(type); + sysctl_sched_boost = type; + trace_sched_set_boost(type); +} + +void sched_boost_parse_dt(void) +{ + struct device_node *sn; + const char *boost_policy; + + if (!sched_enable_hmp) + return; + + sn = of_find_node_by_path("/sched-hmp"); + if (!sn) + return; + + if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { + if (!strcmp(boost_policy, "boost-on-big")) + boost_policy_dt = SCHED_BOOST_ON_BIG; + else if (!strcmp(boost_policy, "boost-on-all")) + boost_policy_dt = SCHED_BOOST_ON_ALL; + } +} + +int sched_set_boost(int type) +{ + int ret = 0; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&boost_mutex); + + if (verify_boost_params(sysctl_sched_boost, type)) + _sched_set_boost(sysctl_sched_boost, type); + else + ret = -EINVAL; + + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&boost_mutex); + + old_val = *data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (verify_boost_params(old_val, *data)) { + _sched_set_boost(old_val, *data); + } else { + *data = old_val; + ret = -EINVAL; + } + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost(void) +{ + return sysctl_sched_boost; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84563da000cf..a5d101e8a5f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7846,7 +7846,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); update_cluster_topology(); - init_sched_hmp_boost_policy(); init_hrtick(); @@ -7895,7 +7894,7 @@ void __init sched_init(void) BUG_ON(num_possible_cpus() > BITS_PER_LONG); - sched_hmp_parse_dt(); + sched_boost_parse_dt(); init_clusters(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40c2e7ac69e0..3db77aff2433 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2606,7 +2606,7 @@ struct cpu_select_env { u8 need_waker_cluster:1; u8 sync:1; u8 ignore_prev_cpu:1; - enum sched_boost_type boost_type; + enum sched_boost_policy boost_policy; int prev_cpu; DECLARE_BITMAP(candidate_list, NR_CPUS); DECLARE_BITMAP(backup_list, NR_CPUS); @@ -2706,11 +2706,17 @@ select_least_power_cluster(struct cpu_select_env *env) struct sched_cluster *cluster; if (env->rtg) { - env->task_load = scale_load_to_cpu(task_load(env->p), - cluster_first_cpu(env->rtg->preferred_cluster)); - env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + int cpu = cluster_first_cpu(env->rtg->preferred_cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), cpu); + + if 
(task_load_will_fit(env->p, env->task_load, + cpu, env->boost_policy)) { + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_policy == SCHED_BOOST_NONE) + return env->rtg->preferred_cluster; - if (env->boost_type != SCHED_BOOST_NONE) { for_each_sched_cluster(cluster) { if (cluster != env->rtg->preferred_cluster) { __set_bit(cluster->id, @@ -2719,9 +2725,19 @@ select_least_power_cluster(struct cpu_select_env *env) env->candidate_list); } } + + return env->rtg->preferred_cluster; } - return env->rtg->preferred_cluster; + /* + * Since the task load does not fit on the preferred + * cluster anymore, pretend that the task does not + * have any preferred cluster. This allows the waking + * task to get the appropriate CPU it needs as per the + * non co-location placement policy without having to + * wait until the preferred cluster is updated. + */ + env->rtg = NULL; } for_each_sched_cluster(cluster) { @@ -2731,7 +2747,7 @@ select_least_power_cluster(struct cpu_select_env *env) env->task_load = scale_load_to_cpu(task_load(env->p), cpu); if (task_load_will_fit(env->p, env->task_load, cpu, - env->boost_type)) + env->boost_policy)) return cluster; __set_bit(cluster->id, env->backup_list); @@ -2977,10 +2993,11 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, /* * need_idle takes precedence over sched boost but when both * are set, idlest CPU with in all the clusters is selected - * when boost_type = BOOST_ON_ALL whereas idlest CPU in the - * big cluster is selected within boost_type = BOOST_ON_BIG. + * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the + * big cluster is selected within boost_policy = BOOST_ON_BIG. */ - if ((!env->need_idle && env->boost_type != SCHED_BOOST_NONE) || + if ((!env->need_idle && + env->boost_policy != SCHED_BOOST_NONE) || env->need_waker_cluster || sched_cpu_high_irqload(i) || spill_threshold_crossed(env, cpu_rq(i))) @@ -3024,7 +3041,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) struct task_struct *task = env->p; struct sched_cluster *cluster; - if (env->boost_type != SCHED_BOOST_NONE || env->reason || + if (env->boost_policy != SCHED_BOOST_NONE || env->reason || !task->ravg.mark_start || env->need_idle || !sched_short_sleep_task_threshold) return false; @@ -3053,7 +3070,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) cluster = cpu_rq(prev_cpu)->cluster; if (!task_load_will_fit(task, env->task_load, prev_cpu, - sched_boost_type())) { + sched_boost_policy())) { __set_bit(cluster->id, env->backup_list); __clear_bit(cluster->id, env->candidate_list); @@ -3075,7 +3092,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) static inline bool wake_to_waker_cluster(struct cpu_select_env *env) { - return env->boost_type == SCHED_BOOST_NONE && + return env->boost_policy == SCHED_BOOST_NONE && !env->need_idle && !env->reason && env->sync && task_load(current) > sched_big_waker_task_load && task_load(env->p) < sched_small_wakee_task_load; @@ -3117,7 +3134,6 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .reason = reason, .need_idle = wake_to_idle(p), .need_waker_cluster = 0, - .boost_type = sched_boost_type(), .sync = sync, .prev_cpu = target, .ignore_prev_cpu = 0, @@ -3126,6 +3142,9 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .sbc_best_cluster_flag = 0, }; + env.boost_policy = task_sched_boost(p) ? 
+ sched_boost_policy() : SCHED_BOOST_NONE; + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); bitmap_zero(env.backup_list, NR_CPUS); @@ -3197,18 +3216,18 @@ retry: sbc_flag |= env.sbc_best_flag; target = stats.best_cpu; } else { - if (env.rtg && env.boost_type == SCHED_BOOST_NONE) { + if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) { env.rtg = NULL; goto retry; } /* - * With boost_type == SCHED_BOOST_ON_BIG, we reach here with + * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with * backup_list = little cluster, candidate_list = none and * stats->best_capacity_cpu points the best spare capacity * CPU among the CPUs in the big cluster. */ - if (env.boost_type == SCHED_BOOST_ON_BIG && + if (env.boost_policy == SCHED_BOOST_ON_BIG && stats.best_capacity_cpu >= 0) sbc_flag |= SBC_FLAG_BOOST_CLUSTER; else @@ -3223,8 +3242,8 @@ retry: out: sbc_flag |= env.sbc_best_cluster_flag; rcu_read_unlock(); - trace_sched_task_load(p, sched_boost(), env.reason, env.sync, - env.need_idle, sbc_flag, target); + trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p), + env.reason, env.sync, env.need_idle, sbc_flag, target); return target; } @@ -3432,11 +3451,9 @@ static inline int migration_needed(struct task_struct *p, int cpu) if (task_will_be_throttled(p)) return 0; - if (sched_boost_type() == SCHED_BOOST_ON_BIG) { - if (cpu_capacity(cpu) != max_capacity) - return UP_MIGRATION; - return 0; - } + if (sched_boost_policy() == SCHED_BOOST_ON_BIG && + cpu_capacity(cpu) != max_capacity && task_sched_boost(p)) + return UP_MIGRATION; if (sched_cpu_high_irqload(cpu)) return IRQLOAD_MIGRATION; @@ -3450,7 +3467,7 @@ static inline int migration_needed(struct task_struct *p, int cpu) return DOWN_MIGRATION; } - if (!grp && !task_will_fit(p, cpu)) { + if (!task_will_fit(p, cpu)) { rcu_read_unlock(); return UP_MIGRATION; } @@ -6678,10 +6695,7 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 -#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \ - LBF_BIG_TASK_ACTIVE_BALANCE) #define LBF_IGNORE_BIG_TASKS 0x100 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 @@ -6712,6 +6726,7 @@ struct lb_env { enum fbq_type fbq_type; struct list_head tasks; + enum sched_boost_policy boost_policy; }; /* @@ -6856,9 +6871,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) && - nr_big_tasks(env->src_rq) && !is_big_task(p)) - return 0; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) { + if (nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + if (env->boost_policy == SCHED_BOOST_ON_BIG && + !task_sched_boost(p)) + return 0; + } twf = task_will_fit(p, env->dst_cpu); @@ -6981,8 +7001,7 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0; - if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) && - !sched_boost()) + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) env->flags |= LBF_IGNORE_BIG_TASKS; else if (!same_cluster(env->dst_cpu, env->src_cpu)) env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; @@ -7285,8 +7304,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) int local_capacity, busiest_capacity; int 
local_pwr_cost, busiest_pwr_cost; int nr_cpus; + int boost = sched_boost(); - if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + if (!sysctl_sched_restrict_cluster_spill || + boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST) return 0; local_cpu = group_first_cpu(sds->local); @@ -7658,11 +7679,6 @@ static bool update_sd_pick_busiest_active_balance(struct lb_env *env, { if (env->idle != CPU_NOT_IDLE && cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { - if (sched_boost() && !sds->busiest && sgs->sum_nr_running) { - env->flags |= LBF_SCHED_BOOST_ACTIVE_BALANCE; - return true; - } - if (sgs->sum_nr_big_tasks > sds->busiest_stat.sum_nr_big_tasks) { env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; @@ -8075,7 +8091,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - if (env->flags & LBF_HMP_ACTIVE_BALANCE) + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) goto force_balance; if (bail_inter_cluster_balance(env, &sds)) @@ -8287,7 +8303,7 @@ static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; - if (env->flags & LBF_HMP_ACTIVE_BALANCE) + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) return 1; if (env->idle == CPU_NEWLY_IDLE) { @@ -8378,20 +8394,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), - .imbalance = 0, - .flags = 0, - .loop = 0, + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, .busiest_nr_running = 0, .busiest_grp_capacity = 0, + .boost_policy = sched_boost_policy(), }; /* @@ -8540,7 +8557,7 @@ more_balance: no_move: if (!ld_moved) { - if (!(env.flags & LBF_HMP_ACTIVE_BALANCE)) + if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) schedstat_inc(sd, lb_failed[idle]); /* @@ -8550,7 +8567,7 @@ no_move: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE && - !(env.flags & LBF_HMP_ACTIVE_BALANCE)) + !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -8827,6 +8844,7 @@ static int active_load_balance_cpu_stop(void *data) .busiest_grp_capacity = 0, .flags = 0, .loop = 0, + .boost_policy = sched_boost_policy(), }; bool moved = false; @@ -9302,7 +9320,8 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) if (rq->nr_running < 2) return 0; - if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + if (!sysctl_sched_restrict_cluster_spill || + sched_boost_policy() == SCHED_BOOST_ON_ALL) return 1; if (cpu_max_power_cost(cpu) == max_power_cost) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index c1c35bb582fb..968a41e0e81e 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include "sched.h" @@ -231,52 +229,6 @@ fail: return ret; } -/* - * It is possible that CPUs of the same micro architecture can have slight - * difference in the efficiency due to other factors like cache size. 
The - * BOOST_ON_BIG policy may not be optimial for such systems. The required - * boost policy can be specified via device tree to handle this. - */ -static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE; - -/* - * This should be called after clusters are populated and - * the respective efficiency values are initialized. - */ -void init_sched_hmp_boost_policy(void) -{ - /* - * Initialize the boost type here if it is not passed from - * device tree. - */ - if (sched_boost_policy == SCHED_BOOST_NONE) { - if (max_possible_efficiency != min_possible_efficiency) - sched_boost_policy = SCHED_BOOST_ON_BIG; - else - sched_boost_policy = SCHED_BOOST_ON_ALL; - } -} - -void sched_hmp_parse_dt(void) -{ - struct device_node *sn; - const char *boost_policy; - - if (!sched_enable_hmp) - return; - - sn = of_find_node_by_path("/sched-hmp"); - if (!sn) - return; - - if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { - if (!strcmp(boost_policy, "boost-on-big")) - sched_boost_policy = SCHED_BOOST_ON_BIG; - else if (!strcmp(boost_policy, "boost-on-all")) - sched_boost_policy = SCHED_BOOST_ON_ALL; - } -} - unsigned int max_possible_efficiency = 1; unsigned int min_possible_efficiency = UINT_MAX; @@ -680,29 +632,6 @@ int __init set_sched_enable_hmp(char *str) early_param("sched_enable_hmp", set_sched_enable_hmp); -int got_boost_kick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - return test_bit(BOOST_KICK, &rq->hmp_flags); -} - -inline void clear_boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - clear_bit(BOOST_KICK, &rq->hmp_flags); -} - -inline void boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) - smp_send_reschedule(cpu); -} - /* Clear any HMP scheduler related requests pending from or on cpu */ void clear_hmp_request(int cpu) { @@ -840,6 +769,9 @@ min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Window size (in ns) */ __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; +/* Maximum allowed threshold before freq aggregation must be enabled */ +#define MAX_FREQ_AGGR_THRESH 1000 + /* Temporarily disable window-stats activity on all cpus */ unsigned int __read_mostly sched_disable_window_stats; @@ -919,8 +851,8 @@ static const unsigned int top_tasks_bitmap_size = * C1 busy time = 5 + 5 + 6 = 16ms * */ -static __read_mostly unsigned int sched_freq_aggregate; -__read_mostly unsigned int sysctl_sched_freq_aggregate; +static __read_mostly unsigned int sched_freq_aggregate = 1; +__read_mostly unsigned int sysctl_sched_freq_aggregate = 1; unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; static unsigned int __read_mostly sched_freq_aggregate_threshold; @@ -937,14 +869,6 @@ unsigned int max_task_load(void) /* Use this knob to turn on or off HMP-aware task placement logic */ unsigned int __read_mostly sched_enable_hmp; -/* - * Scheduler boost is a mechanism to temporarily place tasks on CPUs - * with higher capacity than those where a task would have normally - * ended up with their load characteristics. Any entity enabling - * boost is responsible for disabling it as well. 
- */ -unsigned int sysctl_sched_boost; - /* A cpu can no longer accommodate more tasks if: * * rq->nr_running > sysctl_sched_spill_nr_run || @@ -995,6 +919,21 @@ unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; unsigned int __read_mostly sched_downmigrate; unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; +/* + * Task groups whose aggregate demand on a cpu is more than + * sched_group_upmigrate need to be up-migrated if possible. + */ +unsigned int __read_mostly sched_group_upmigrate; +unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100; + +/* + * Task groups, once up-migrated, will need to drop their aggregate + * demand to less than sched_group_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_group_downmigrate; +unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95; + /* * The load scale factor of a CPU gets boosted when its max frequency * is restricted due to which the tasks are migrating to higher capacity @@ -1017,33 +956,46 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; -void update_up_down_migrate(void) +static void +_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) { - unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); - unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); unsigned int delta; if (up_down_migrate_scale_factor == 1024) - goto done; + return; - delta = up_migrate - down_migrate; + delta = *up_migrate - *down_migrate; - up_migrate /= NSEC_PER_USEC; - up_migrate *= up_down_migrate_scale_factor; - up_migrate >>= 10; - up_migrate *= NSEC_PER_USEC; + *up_migrate /= NSEC_PER_USEC; + *up_migrate *= up_down_migrate_scale_factor; + *up_migrate >>= 10; + *up_migrate *= NSEC_PER_USEC; - up_migrate = min(up_migrate, sched_ravg_window); + *up_migrate = min(*up_migrate, sched_ravg_window); - down_migrate /= NSEC_PER_USEC; - down_migrate *= up_down_migrate_scale_factor; - down_migrate >>= 10; - down_migrate *= NSEC_PER_USEC; + *down_migrate /= NSEC_PER_USEC; + *down_migrate *= up_down_migrate_scale_factor; + *down_migrate >>= 10; + *down_migrate *= NSEC_PER_USEC; - down_migrate = min(down_migrate, up_migrate - delta); -done: + *down_migrate = min(*down_migrate, *up_migrate - delta); +} + +static void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate); sched_upmigrate = up_migrate; sched_downmigrate = down_migrate; + + up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); + down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate); + sched_group_upmigrate = up_migrate; + sched_group_downmigrate = down_migrate; } void set_hmp_defaults(void) @@ -1134,82 +1086,6 @@ u64 cpu_load_sync(int cpu, int sync) return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); } -static int boost_refcount; -static DEFINE_SPINLOCK(boost_lock); -static DEFINE_MUTEX(boost_mutex); - -static void boost_kick_cpus(void) -{ - int i; - - for_each_online_cpu(i) { - if (cpu_capacity(i) != max_capacity) - boost_kick(i); - } -} - -int sched_boost(void) -{ - return boost_refcount > 0; -} - -int sched_set_boost(int enable) -{ - unsigned long flags; - int ret = 0; - int old_refcount; - - if (!sched_enable_hmp) - return -EINVAL; - - 
spin_lock_irqsave(&boost_lock, flags); - - old_refcount = boost_refcount; - - if (enable == 1) { - boost_refcount++; - } else if (!enable) { - if (boost_refcount >= 1) - boost_refcount--; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (!old_refcount && boost_refcount) - boost_kick_cpus(); - - if (boost_refcount <= 1) - core_ctl_set_boost(boost_refcount == 1); - trace_sched_set_boost(boost_refcount); - spin_unlock_irqrestore(&boost_lock, flags); - - return ret; -} - -int sched_boost_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - mutex_lock(&boost_mutex); - if (!write) - sysctl_sched_boost = sched_boost(); - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write) - goto done; - - ret = (sysctl_sched_boost <= 1) ? - sched_set_boost(sysctl_sched_boost) : -EINVAL; - -done: - mutex_unlock(&boost_mutex); - return ret; -} - /* * Task will fit on a cpu if it's bandwidth consumption on that cpu * will be less than sched_upmigrate. A big task that was previously @@ -1219,60 +1095,63 @@ done: * tasks with load close to the upmigrate threshold */ int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type) + enum sched_boost_policy boost_policy) { - int upmigrate; + int upmigrate = sched_upmigrate; if (cpu_capacity(cpu) == max_capacity) return 1; - if (boost_type != SCHED_BOOST_ON_BIG) { + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (boost_policy != SCHED_BOOST_ON_BIG) { if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) return 1; - upmigrate = sched_upmigrate; - if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) - upmigrate = sched_downmigrate; - if (task_load < upmigrate) return 1; + } else { + if (task_sched_boost(p) || task_load >= upmigrate) + return 0; + + return 1; } return 0; } -enum sched_boost_type sched_boost_type(void) -{ - if (sched_boost()) - return sched_boost_policy; - - return SCHED_BOOST_NONE; -} - int task_will_fit(struct task_struct *p, int cpu) { u64 tload = scale_load_to_cpu(task_load(p), cpu); - return task_load_will_fit(p, tload, cpu, sched_boost_type()); + return task_load_will_fit(p, tload, cpu, sched_boost_policy()); } -int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand) +static int +group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp, + u64 demand, bool group_boost) { int cpu = cluster_first_cpu(cluster); int prev_capacity = 0; - unsigned int threshold = sched_upmigrate; + unsigned int threshold = sched_group_upmigrate; u64 load; if (cluster->capacity == max_capacity) return 1; + if (group_boost) + return 0; + + if (!demand) + return 1; + if (grp->preferred_cluster) prev_capacity = grp->preferred_cluster->capacity; if (cluster->capacity < prev_capacity) - threshold = sched_downmigrate; + threshold = sched_group_downmigrate; load = scale_load_to_cpu(demand, cpu); if (load < threshold) @@ -1495,6 +1374,23 @@ void post_big_task_count_change(const struct cpumask *cpus) DEFINE_MUTEX(policy_mutex); +unsigned int update_freq_aggregate_threshold(unsigned int threshold) +{ + unsigned int old_threshold; + + mutex_lock(&policy_mutex); + + old_threshold = sysctl_sched_freq_aggregate_threshold_pct; + + sysctl_sched_freq_aggregate_threshold_pct = threshold; + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); + + mutex_unlock(&policy_mutex); + + return 
old_threshold; +} + static inline int invalid_value_freq_input(unsigned int *data) { if (data == &sysctl_sched_freq_aggregate) @@ -1578,7 +1474,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, if (write && (old_val == *data)) goto done; - if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) { + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct || + sysctl_sched_group_downmigrate_pct > + sysctl_sched_group_upmigrate_pct) { *data = old_val; ret = -EINVAL; goto done; @@ -3801,13 +3699,13 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus) } /* Return cluster which can offer required capacity for group */ -static struct sched_cluster * -best_cluster(struct related_thread_group *grp, u64 total_demand) +static struct sched_cluster *best_cluster(struct related_thread_group *grp, + u64 total_demand, bool group_boost) { struct sched_cluster *cluster = NULL; for_each_sched_cluster(cluster) { - if (group_will_fit(cluster, grp, total_demand)) + if (group_will_fit(cluster, grp, total_demand, group_boost)) return cluster; } @@ -3818,6 +3716,9 @@ static void _set_preferred_cluster(struct related_thread_group *grp) { struct task_struct *p; u64 combined_demand = 0; + bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG; + bool group_boost = false; + u64 wallclock; if (!sysctl_sched_enable_colocation) { grp->last_update = sched_ktime_clock(); @@ -3825,19 +3726,36 @@ static void _set_preferred_cluster(struct related_thread_group *grp) return; } + if (list_empty(&grp->tasks)) + return; + + wallclock = sched_ktime_clock(); + /* * wakeup of two or more related tasks could race with each other and * could result in multiple calls to _set_preferred_cluster being issued * at same time. Avoid overhead in such cases of rechecking preferred * cluster */ - if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) + if (wallclock - grp->last_update < sched_ravg_window / 10) return; - list_for_each_entry(p, &grp->tasks, grp_list) + list_for_each_entry(p, &grp->tasks, grp_list) { + if (boost_on_big && task_sched_boost(p)) { + group_boost = true; + break; + } + + if (p->ravg.mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + combined_demand += p->ravg.demand; - grp->preferred_cluster = best_cluster(grp, combined_demand); + } + + grp->preferred_cluster = best_cluster(grp, + combined_demand, group_boost); grp->last_update = sched_ktime_clock(); trace_sched_set_preferred_cluster(grp, combined_demand); } @@ -3852,6 +3770,8 @@ void set_preferred_cluster(struct related_thread_group *grp) #define ADD_TASK 0 #define REM_TASK 1 +#define DEFAULT_CGROUP_COLOC_ID 1 + static inline void free_group_cputime(struct related_thread_group *grp) { free_percpu(grp->cpu_time); @@ -4109,7 +4029,8 @@ static void remove_task_from_group(struct task_struct *p) raw_spin_unlock(&grp->lock); - if (empty_group) { + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) { list_del(&grp->list); call_rcu(&grp->rcu, free_related_thread_group); } @@ -4143,20 +4064,25 @@ void add_new_task_to_grp(struct task_struct *new) { unsigned long flags; struct related_thread_group *grp; - struct task_struct *parent; + struct task_struct *leader = new->group_leader; + unsigned int leader_grp_id = sched_get_group_id(leader); - if (!sysctl_sched_enable_thread_grouping) + if (!sysctl_sched_enable_thread_grouping && + leader_grp_id != DEFAULT_CGROUP_COLOC_ID) return; if (thread_group_leader(new)) return; - 
parent = new->group_leader; + if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) { + if (!same_schedtune(new, leader)) + return; + } write_lock_irqsave(&related_thread_group_lock, flags); rcu_read_lock(); - grp = task_related_thread_group(parent); + grp = task_related_thread_group(leader); rcu_read_unlock(); /* @@ -4179,6 +4105,47 @@ void add_new_task_to_grp(struct task_struct *new) write_unlock_irqrestore(&related_thread_group_lock, flags); } +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +/* + * We create a default colocation group at boot. There is no need to + * synchronize tasks between cgroups at creation time because the + * correct cgroup hierarchy is not available at boot. Therefore cgroup + * colocation is turned off by default even though the colocation group + * itself has been allocated. Furthermore this colocation group cannot + * be destroyted once it has been created. All of this has been as part + * of runtime optimizations. + * + * The job of synchronizing tasks to the colocation group is done when + * the colocation flag in the cgroup is turned on. + */ +static int __init create_default_coloc_group(void) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + if (IS_ERR(grp)) { + WARN_ON(1); + return -ENOMEM; + } + + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + + update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH); + return 0; +} +late_initcall(create_default_coloc_group); + +int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; + + return sched_set_group_id(p, grp_id); +} +#endif + int sched_set_group_id(struct task_struct *p, unsigned int group_id) { int rc = 0; @@ -4201,6 +4168,12 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) grp = lookup_related_thread_group(group_id); if (!grp) { + /* This is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) { + rc = -EINVAL; + goto done; + } + grp = alloc_related_thread_group(group_id); if (IS_ERR(grp)) { rc = -ENOMEM; @@ -4210,7 +4183,6 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id) list_add(&grp->list, &related_thread_groups); } - BUG_ON(!grp); rc = add_task_to_group(p, grp); done: write_unlock(&related_thread_group_lock); @@ -4459,7 +4431,7 @@ bool early_detection_notify(struct rq *rq, u64 wallclock) struct task_struct *p; int loop_max = 10; - if (!sched_boost() || !rq->cfs.h_nr_running) + if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running) return 0; rq->ed_task = NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ba4403e910d8..12a04f30ef77 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1677,8 +1677,13 @@ static int find_lowest_rq_hmp(struct task_struct *task) int prev_cpu = task_cpu(task); u64 cpu_load, min_load = ULLONG_MAX; int i; - int restrict_cluster = sched_boost() ? 
0 : - sysctl_sched_restrict_cluster_spill; + int restrict_cluster; + int boost_on_big; + + boost_on_big = sched_boost() == FULL_THROTTLE_BOOST && + sched_boost_policy() == SCHED_BOOST_ON_BIG; + + restrict_cluster = sysctl_sched_restrict_cluster_spill; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1697,6 +1702,9 @@ static int find_lowest_rq_hmp(struct task_struct *task) */ for_each_sched_cluster(cluster) { + if (boost_on_big && cluster->capacity != max_possible_capacity) + continue; + cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); cpumask_andnot(&candidate_mask, &candidate_mask, cpu_isolated_mask); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4289bf6cd642..30838bb9b442 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1061,8 +1061,6 @@ extern unsigned int max_load_scale_factor; extern unsigned int max_possible_capacity; extern unsigned int min_max_possible_capacity; extern unsigned int max_power_cost; -extern unsigned int sched_upmigrate; -extern unsigned int sched_downmigrate; extern unsigned int sched_init_task_load_windows; extern unsigned int up_down_migrate_scale_factor; extern unsigned int sysctl_sched_restrict_cluster_spill; @@ -1106,18 +1104,23 @@ extern void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock); extern unsigned int cpu_temp(int cpu); extern unsigned int nr_eligible_big_tasks(int cpu); -extern void update_up_down_migrate(void); extern int update_preferred_cluster(struct related_thread_group *grp, struct task_struct *p, u32 old_load); extern void set_preferred_cluster(struct related_thread_group *grp); extern void add_new_task_to_grp(struct task_struct *new); +extern unsigned int update_freq_aggregate_threshold(unsigned int threshold); -enum sched_boost_type { +enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, SCHED_BOOST_ON_ALL, }; +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define RESTRAINED_BOOST 3 + static inline struct sched_cluster *cpu_cluster(int cpu) { return cpu_rq(cpu)->cluster; @@ -1387,14 +1390,11 @@ extern void set_hmp_defaults(void); extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost); extern unsigned int power_cost(int cpu, u64 demand); extern void reset_all_window_stats(u64 window_start, unsigned int window_size); -extern void boost_kick(int cpu); extern int sched_boost(void); extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type); -extern enum sched_boost_type sched_boost_type(void); + enum sched_boost_policy boost_policy); +extern enum sched_boost_policy sched_boost_policy(void); extern int task_will_fit(struct task_struct *p, int cpu); -extern int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand); extern u64 cpu_load(int cpu); extern u64 cpu_load_sync(int cpu, int sync); extern int preferred_cluster(struct sched_cluster *cluster, @@ -1422,10 +1422,32 @@ extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, struct cftype *cft); extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 upmigrate_discourage); -extern void sched_hmp_parse_dt(void); -extern void init_sched_hmp_boost_policy(void); +extern void sched_boost_parse_dt(void); extern void clear_top_tasks_bitmap(unsigned long *bitmap); +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +extern bool task_sched_boost(struct task_struct 
*p); +extern int sync_cgroup_colocation(struct task_struct *p, bool insert); +extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2); +extern void update_cgroup_boost_settings(void); +extern void restore_cgroup_boost_settings(void); + +#else +static inline bool +same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return true; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} + +static inline void update_cgroup_boost_settings(void) { } +static inline void restore_cgroup_boost_settings(void) { } +#endif + #else /* CONFIG_SCHED_HMP */ struct hmp_sched_stats; @@ -1615,8 +1637,7 @@ static inline void post_big_task_count_change(void) { } static inline void set_hmp_defaults(void) { } static inline void clear_reserved(int cpu) { } -static inline void sched_hmp_parse_dt(void) {} -static inline void init_sched_hmp_boost_policy(void) {} +static inline void sched_boost_parse_dt(void) {} #define trace_sched_cpu_load(...) #define trace_sched_cpu_load_lb(...) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4f8182302e5e..ee2af8e0b5ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -25,6 +25,33 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; +#ifdef CONFIG_SCHED_HMP + /* Toggle ability to override sched boost enabled */ + bool sched_boost_no_override; + + /* + * Controls whether a cgroup is eligible for sched boost or not. This + * can temporariliy be disabled by the kernel based on the no_override + * flag above. + */ + bool sched_boost_enabled; + + /* + * This tracks the default value of sched_boost_enabled and is used + * restore the value following any temporary changes to that flag. + */ + bool sched_boost_enabled_backup; + + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on. 
+ */ + bool colocate; + + /* Controls whether further updates are allowed to the colocate flag */ + bool colocate_update_disabled; +#endif + }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -54,6 +81,13 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, +#ifdef CONFIG_SCHED_HMP + .sched_boost_no_override = false, + .sched_boost_enabled = true, + .sched_boost_enabled_backup = true, + .colocate = false, + .colocate_update_disabled = false, +#endif }; /* @@ -97,6 +131,121 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +#ifdef CONFIG_SCHED_HMP +static inline void init_sched_boost(struct schedtune *st) +{ + st->sched_boost_no_override = false; + st->sched_boost_enabled = true; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + st->colocate = false; + st->colocate_update_disabled = false; +} + +bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return task_schedtune(tsk1) == task_schedtune(tsk2); +} + +void update_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + if (allocated_group[i]->sched_boost_no_override) + continue; + + allocated_group[i]->sched_boost_enabled = false; + } +} + +void restore_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + allocated_group[i]->sched_boost_enabled = + allocated_group[i]->sched_boost_enabled_backup; + } +} + +bool task_sched_boost(struct task_struct *p) +{ + struct schedtune *st = task_schedtune(p); + + return st->sched_boost_enabled; +} + +static u64 +sched_boost_override_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_no_override; +} + +static int sched_boost_override_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 override) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_no_override = !!override; + + return 0; +} + +static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_enabled; +} + +static int sched_boost_enabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 enable) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_enabled = !!enable; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + + return 0; +} + +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct schedtune *st = css_st(css); + + if (st->colocate_update_disabled) + return -EPERM; + + st->colocate = !!colocate; + st->colocate_update_disabled = true; + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_sched_boost(struct schedtune *st) { } + +#endif /* CONFIG_SCHED_HMP */ + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -121,12 +270,45 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, return 0; } +static void schedtune_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct schedtune *st; + bool colocate; + + cgroup_taskset_first(tset, &css); 
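+	/*
+	 * Use the colocate flag of the (first) destination cgroup for all
+	 * tasks being attached: it decides whether each task is added to
+	 * or removed from the reserved colocation group below.
+	 */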
+ st = css_st(css); + + colocate = st->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + static struct cftype files[] = { { .name = "boost", .read_u64 = boost_read, .write_u64 = boost_write, }, +#ifdef CONFIG_SCHED_HMP + { + .name = "sched_boost_no_override", + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "sched_boost_enabled", + .read_u64 = sched_boost_enabled_read, + .write_u64 = sched_boost_enabled_write, + }, + { + .name = "colocate", + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif { } /* terminate */ }; @@ -189,6 +371,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) /* Initialize per CPUs boost group support */ st->idx = idx; + init_sched_boost(st); if (schedtune_boostgroup_init(st)) goto release; @@ -222,6 +405,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .legacy_cftypes = files, .early_init = 1, .allow_attach = subsys_cgroup_allow_attach, + .attach = schedtune_attach, }; #endif /* CONFIG_CGROUP_SCHEDTUNE */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 574316f1c344..b7cbd7940f7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,6 +124,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; @@ -376,6 +377,22 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "sched_group_upmigrate", + .data = &sysctl_sched_group_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { + .procname = "sched_group_downmigrate", + .data = &sysctl_sched_group_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, { .procname = "sched_init_task_load", .data = &sysctl_sched_init_task_load_pct, @@ -487,6 +504,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_boost_handler, + .extra1 = &zero, + .extra2 = &three, }, #endif /* CONFIG_SCHED_HMP */ #ifdef CONFIG_SCHED_DEBUG