Merge "sched/hmp: Enhance co-location and scheduler boost features"

Authored by Linux Build Service Account on 2016-11-18 01:54:54 -08:00; committed by Gerrit - the friendly Code Review server
commit 54e5bae2ed
11 changed files with 800 additions and 385 deletions


@ -53,6 +53,8 @@ extern unsigned int sysctl_sched_spill_nr_run;
extern unsigned int sysctl_sched_spill_load_pct;
extern unsigned int sysctl_sched_upmigrate_pct;
extern unsigned int sysctl_sched_downmigrate_pct;
extern unsigned int sysctl_sched_group_upmigrate_pct;
extern unsigned int sysctl_sched_group_downmigrate_pct;
extern unsigned int sysctl_early_detection_duration;
extern unsigned int sysctl_sched_boost;
extern unsigned int sysctl_sched_small_wakee_task_load_pct;


@ -133,6 +133,7 @@ TRACE_EVENT(sched_task_load,
__field( u32, flags )
__field( int, best_cpu )
__field( u64, latency )
__field( int, grp_id )
),
TP_fast_assign(
@ -148,12 +149,13 @@ TRACE_EVENT(sched_task_load,
__entry->latency = p->state == TASK_WAKING ?
sched_ktime_clock() -
p->ravg.mark_start : 0;
__entry->grp_id = p->grp ? p->grp->id : 0;
),
TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x best_cpu=%d latency=%llu",
TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu",
__entry->pid, __entry->comm, __entry->demand,
__entry->boost, __entry->reason, __entry->sync,
__entry->need_idle, __entry->flags,
__entry->need_idle, __entry->flags, __entry->grp_id,
__entry->best_cpu, __entry->latency)
);
@ -164,9 +166,12 @@ TRACE_EVENT(sched_set_preferred_cluster,
TP_ARGS(grp, total_demand),
TP_STRUCT__entry(
__field( int, id )
__field( u64, demand )
__field( int, cluster_first_cpu )
__field( int, id )
__field( u64, demand )
__field( int, cluster_first_cpu )
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field(unsigned int, task_demand )
),
TP_fast_assign(
@ -245,19 +250,19 @@ DEFINE_EVENT(sched_cpu_load, sched_cpu_load_cgroup,
TRACE_EVENT(sched_set_boost,
TP_PROTO(int ref_count),
TP_PROTO(int type),
TP_ARGS(ref_count),
TP_ARGS(type),
TP_STRUCT__entry(
__field(unsigned int, ref_count )
__field(int, type )
),
TP_fast_assign(
__entry->ref_count = ref_count;
__entry->type = type;
),
TP_printk("ref_count=%d", __entry->ref_count)
TP_printk("type %d", __entry->type)
);
#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP)


@ -15,7 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o sched_avg.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_HMP) += hmp.o
obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o

kernel/sched/boost.c (new file, 226 lines)

@ -0,0 +1,226 @@
/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "sched.h"
#include <linux/of.h>
#include <linux/sched/core_ctl.h>
#include <trace/events/sched.h>
/*
* Scheduler boost is a mechanism to temporarily place tasks on CPUs
* with higher capacity than those where a task would normally have
* ended up given its load characteristics. Any entity enabling
* boost is responsible for disabling it as well.
*/
unsigned int sysctl_sched_boost;
static enum sched_boost_policy boost_policy;
static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
static DEFINE_MUTEX(boost_mutex);
static unsigned int freq_aggr_threshold_backup;
static inline void boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
smp_send_reschedule(cpu);
}
static void boost_kick_cpus(void)
{
int i;
struct cpumask kick_mask;
if (boost_policy != SCHED_BOOST_ON_BIG)
return;
cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask);
for_each_cpu(i, &kick_mask) {
if (cpu_capacity(i) != max_capacity)
boost_kick(i);
}
}
int got_boost_kick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
return test_bit(BOOST_KICK, &rq->hmp_flags);
}
void clear_boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
clear_bit(BOOST_KICK, &rq->hmp_flags);
}
/*
* Scheduler boost type and boost policy might at first seem unrelated;
* however, there exists a connection between them that will allow us
* to use them interchangeably during placement decisions. We'll explain
* the connection here in one possible way so that the implications are
* clear when looking at placement policies.
*
* When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
* When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
* neither be none nor RESTRAINED.
*/
static void set_boost_policy(int type)
{
if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) {
boost_policy = SCHED_BOOST_NONE;
return;
}
if (boost_policy_dt) {
boost_policy = boost_policy_dt;
return;
}
if (min_possible_efficiency != max_possible_efficiency) {
boost_policy = SCHED_BOOST_ON_BIG;
return;
}
boost_policy = SCHED_BOOST_ON_ALL;
}
enum sched_boost_policy sched_boost_policy(void)
{
return boost_policy;
}
static bool verify_boost_params(int old_val, int new_val)
{
/*
* Boost can only be turned on or off. There is no possibility of
* switching from one boost type to another or to set the same
* kind of boost several times.
*/
return !(!!old_val == !!new_val);
}
static void _sched_set_boost(int old_val, int type)
{
switch (type) {
case NO_BOOST:
if (old_val == FULL_THROTTLE_BOOST)
core_ctl_set_boost(false);
else if (old_val == CONSERVATIVE_BOOST)
restore_cgroup_boost_settings();
else
update_freq_aggregate_threshold(
freq_aggr_threshold_backup);
break;
case FULL_THROTTLE_BOOST:
core_ctl_set_boost(true);
boost_kick_cpus();
break;
case CONSERVATIVE_BOOST:
update_cgroup_boost_settings();
boost_kick_cpus();
break;
case RESTRAINED_BOOST:
freq_aggr_threshold_backup =
update_freq_aggregate_threshold(1);
break;
default:
WARN_ON(1);
return;
}
set_boost_policy(type);
sysctl_sched_boost = type;
trace_sched_set_boost(type);
}
void sched_boost_parse_dt(void)
{
struct device_node *sn;
const char *boost_policy;
if (!sched_enable_hmp)
return;
sn = of_find_node_by_path("/sched-hmp");
if (!sn)
return;
if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
if (!strcmp(boost_policy, "boost-on-big"))
boost_policy_dt = SCHED_BOOST_ON_BIG;
else if (!strcmp(boost_policy, "boost-on-all"))
boost_policy_dt = SCHED_BOOST_ON_ALL;
}
}
int sched_set_boost(int type)
{
int ret = 0;
if (!sched_enable_hmp)
return -EINVAL;
mutex_lock(&boost_mutex);
if (verify_boost_params(sysctl_sched_boost, type))
_sched_set_boost(sysctl_sched_boost, type);
else
ret = -EINVAL;
mutex_unlock(&boost_mutex);
return ret;
}
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
unsigned int old_val;
if (!sched_enable_hmp)
return -EINVAL;
mutex_lock(&boost_mutex);
old_val = *data;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
if (verify_boost_params(old_val, *data)) {
_sched_set_boost(old_val, *data);
} else {
*data = old_val;
ret = -EINVAL;
}
done:
mutex_unlock(&boost_mutex);
return ret;
}
int sched_boost(void)
{
return sysctl_sched_boost;
}
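
The new boost.c above replaces the old reference-counted boost with four discrete modes and only allows switching between "off" and one boost type at a time. Below is a minimal standalone sketch (not kernel code; the helper names and the big.LITTLE assumption are mine) of the transition rule enforced by verify_boost_params() and the type-to-policy mapping in set_boost_policy():

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the boost types introduced by the patch (values as in sched.h). */
enum { NO_BOOST, FULL_THROTTLE_BOOST, CONSERVATIVE_BOOST, RESTRAINED_BOOST };
enum policy { BOOST_NONE, BOOST_ON_BIG, BOOST_ON_ALL };

/*
 * Same rule as verify_boost_params(): only off->on or on->off is allowed,
 * so switching directly from one boost type to another is rejected.
 */
static bool transition_ok(int old_val, int new_val)
{
        return !!old_val != !!new_val;
}

/*
 * Same shape as set_boost_policy(): NO_BOOST and RESTRAINED_BOOST imply no
 * placement bias; for the other types a big.LITTLE system is assumed here,
 * so the policy becomes BOOST_ON_BIG (the kernel also honours a DT override).
 */
static enum policy policy_for(int type)
{
        if (type == NO_BOOST || type == RESTRAINED_BOOST)
                return BOOST_NONE;
        return BOOST_ON_BIG;
}

int main(void)
{
        printf("NO_BOOST -> CONSERVATIVE_BOOST allowed: %d\n",
               transition_ok(NO_BOOST, CONSERVATIVE_BOOST));          /* 1 */
        printf("FULL_THROTTLE -> RESTRAINED allowed:    %d\n",
               transition_ok(FULL_THROTTLE_BOOST, RESTRAINED_BOOST)); /* 0 */
        printf("policy for RESTRAINED_BOOST:            %d\n",
               policy_for(RESTRAINED_BOOST));                  /* BOOST_NONE */
        return 0;
}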


@ -7846,7 +7846,6 @@ void __init sched_init_smp(void)
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
update_cluster_topology();
init_sched_hmp_boost_policy();
init_hrtick();
@ -7895,7 +7894,7 @@ void __init sched_init(void)
BUG_ON(num_possible_cpus() > BITS_PER_LONG);
sched_hmp_parse_dt();
sched_boost_parse_dt();
init_clusters();
#ifdef CONFIG_FAIR_GROUP_SCHED


@ -2596,6 +2596,7 @@ static u32 __compute_runnable_contrib(u64 n)
#define SBC_FLAG_COLOC_CLUSTER 0x10000
#define SBC_FLAG_WAKER_CLUSTER 0x20000
#define SBC_FLAG_BACKUP_CLUSTER 0x40000
#define SBC_FLAG_BOOST_CLUSTER 0x80000
struct cpu_select_env {
struct task_struct *p;
@ -2605,7 +2606,7 @@ struct cpu_select_env {
u8 need_waker_cluster:1;
u8 sync:1;
u8 ignore_prev_cpu:1;
enum sched_boost_type boost_type;
enum sched_boost_policy boost_policy;
int prev_cpu;
DECLARE_BITMAP(candidate_list, NR_CPUS);
DECLARE_BITMAP(backup_list, NR_CPUS);
@ -2705,10 +2706,38 @@ select_least_power_cluster(struct cpu_select_env *env)
struct sched_cluster *cluster;
if (env->rtg) {
env->task_load = scale_load_to_cpu(task_load(env->p),
cluster_first_cpu(env->rtg->preferred_cluster));
env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
return env->rtg->preferred_cluster;
int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
if (task_load_will_fit(env->p, env->task_load,
cpu, env->boost_policy)) {
env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
if (env->boost_policy == SCHED_BOOST_NONE)
return env->rtg->preferred_cluster;
for_each_sched_cluster(cluster) {
if (cluster != env->rtg->preferred_cluster) {
__set_bit(cluster->id,
env->backup_list);
__clear_bit(cluster->id,
env->candidate_list);
}
}
return env->rtg->preferred_cluster;
}
/*
* Since the task load does not fit on the preferred
* cluster anymore, pretend that the task does not
* have any preferred cluster. This allows the waking
* task to get the appropriate CPU it needs as per the
* non co-location placement policy without having to
* wait until the preferred cluster is updated.
*/
env->rtg = NULL;
}
for_each_sched_cluster(cluster) {
@ -2718,7 +2747,7 @@ select_least_power_cluster(struct cpu_select_env *env)
env->task_load = scale_load_to_cpu(task_load(env->p),
cpu);
if (task_load_will_fit(env->p, env->task_load, cpu,
env->boost_type))
env->boost_policy))
return cluster;
__set_bit(cluster->id, env->backup_list);
@ -2961,7 +2990,14 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c,
update_spare_capacity(stats, env, i, c->capacity,
env->cpu_load);
if (env->boost_type == SCHED_BOOST_ON_ALL ||
/*
* need_idle takes precedence over sched boost, but when both
* are set, the idlest CPU among all clusters is selected
* when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in
* the big cluster is selected when boost_policy = BOOST_ON_BIG.
*/
if ((!env->need_idle &&
env->boost_policy != SCHED_BOOST_NONE) ||
env->need_waker_cluster ||
sched_cpu_high_irqload(i) ||
spill_threshold_crossed(env, cpu_rq(i)))
@ -3005,7 +3041,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
struct task_struct *task = env->p;
struct sched_cluster *cluster;
if (env->boost_type != SCHED_BOOST_NONE || env->reason ||
if (env->boost_policy != SCHED_BOOST_NONE || env->reason ||
!task->ravg.mark_start ||
env->need_idle || !sched_short_sleep_task_threshold)
return false;
@ -3034,7 +3070,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
cluster = cpu_rq(prev_cpu)->cluster;
if (!task_load_will_fit(task, env->task_load, prev_cpu,
sched_boost_type())) {
sched_boost_policy())) {
__set_bit(cluster->id, env->backup_list);
__clear_bit(cluster->id, env->candidate_list);
@ -3056,7 +3092,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
static inline bool
wake_to_waker_cluster(struct cpu_select_env *env)
{
return env->boost_type == SCHED_BOOST_NONE &&
return env->boost_policy == SCHED_BOOST_NONE &&
!env->need_idle && !env->reason && env->sync &&
task_load(current) > sched_big_waker_task_load &&
task_load(env->p) < sched_small_wakee_task_load;
@ -3098,7 +3134,6 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
.reason = reason,
.need_idle = wake_to_idle(p),
.need_waker_cluster = 0,
.boost_type = sched_boost_type(),
.sync = sync,
.prev_cpu = target,
.ignore_prev_cpu = 0,
@ -3107,6 +3142,9 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
.sbc_best_cluster_flag = 0,
};
env.boost_policy = task_sched_boost(p) ?
sched_boost_policy() : SCHED_BOOST_NONE;
bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
bitmap_zero(env.backup_list, NR_CPUS);
@ -3178,12 +3216,23 @@ retry:
sbc_flag |= env.sbc_best_flag;
target = stats.best_cpu;
} else {
if (env.rtg) {
if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
env.rtg = NULL;
goto retry;
}
find_backup_cluster(&env, &stats);
/*
* With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
* backup_list = little cluster, candidate_list = none and
* stats->best_capacity_cpu points to the best spare capacity
* CPU among the CPUs in the big cluster.
*/
if (env.boost_policy == SCHED_BOOST_ON_BIG &&
stats.best_capacity_cpu >= 0)
sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
else
find_backup_cluster(&env, &stats);
if (stats.best_capacity_cpu >= 0) {
target = stats.best_capacity_cpu;
sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
@ -3193,8 +3242,8 @@ retry:
out:
sbc_flag |= env.sbc_best_cluster_flag;
rcu_read_unlock();
trace_sched_task_load(p, sched_boost(), env.reason, env.sync,
env.need_idle, sbc_flag, target);
trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
env.reason, env.sync, env.need_idle, sbc_flag, target);
return target;
}
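
select_best_cpu() now resolves a per-task effective boost policy: the global policy only applies if the task's schedtune group has sched boost enabled. A standalone sketch of that resolution (the function name here is illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, SCHED_BOOST_ON_ALL };

/*
 * Mirrors: env.boost_policy = task_sched_boost(p) ?
 *                             sched_boost_policy() : SCHED_BOOST_NONE;
 * A task whose schedtune group has sched boost disabled is placed as if no
 * boost were active, even while a global boost is in effect.
 */
static enum sched_boost_policy
effective_boost_policy(bool task_boost_enabled, enum sched_boost_policy global)
{
        return task_boost_enabled ? global : SCHED_BOOST_NONE;
}

int main(void)
{
        printf("%d\n", effective_boost_policy(true, SCHED_BOOST_ON_BIG));  /* 1 */
        printf("%d\n", effective_boost_policy(false, SCHED_BOOST_ON_BIG)); /* 0 */
        return 0;
}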
@ -3402,11 +3451,9 @@ static inline int migration_needed(struct task_struct *p, int cpu)
if (task_will_be_throttled(p))
return 0;
if (sched_boost_type() == SCHED_BOOST_ON_BIG) {
if (cpu_capacity(cpu) != max_capacity)
return UP_MIGRATION;
return 0;
}
if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
return UP_MIGRATION;
if (sched_cpu_high_irqload(cpu))
return IRQLOAD_MIGRATION;
@ -3420,7 +3467,7 @@ static inline int migration_needed(struct task_struct *p, int cpu)
return DOWN_MIGRATION;
}
if (!grp && !task_will_fit(p, cpu)) {
if (!task_will_fit(p, cpu)) {
rcu_read_unlock();
return UP_MIGRATION;
}
@ -6648,10 +6695,7 @@ enum fbq_type { regular, remote, all };
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40
#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \
LBF_BIG_TASK_ACTIVE_BALANCE)
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
@ -6682,6 +6726,7 @@ struct lb_env {
enum fbq_type fbq_type;
struct list_head tasks;
enum sched_boost_policy boost_policy;
};
/*
@ -6826,9 +6871,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) &&
nr_big_tasks(env->src_rq) && !is_big_task(p))
return 0;
if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
if (nr_big_tasks(env->src_rq) && !is_big_task(p))
return 0;
if (env->boost_policy == SCHED_BOOST_ON_BIG &&
!task_sched_boost(p))
return 0;
}
twf = task_will_fit(p, env->dst_cpu);
@ -6951,8 +7001,7 @@ static int detach_tasks(struct lb_env *env)
if (env->imbalance <= 0)
return 0;
if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
!sched_boost())
if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
env->flags |= LBF_IGNORE_BIG_TASKS;
else if (!same_cluster(env->dst_cpu, env->src_cpu))
env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
@ -7255,8 +7304,10 @@ bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
int local_capacity, busiest_capacity;
int local_pwr_cost, busiest_pwr_cost;
int nr_cpus;
int boost = sched_boost();
if (!sysctl_sched_restrict_cluster_spill || sched_boost())
if (!sysctl_sched_restrict_cluster_spill ||
boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
return 0;
local_cpu = group_first_cpu(sds->local);
@ -7628,11 +7679,6 @@ static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
{
if (env->idle != CPU_NOT_IDLE &&
cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
if (sched_boost() && !sds->busiest && sgs->sum_nr_running) {
env->flags |= LBF_SCHED_BOOST_ACTIVE_BALANCE;
return true;
}
if (sgs->sum_nr_big_tasks >
sds->busiest_stat.sum_nr_big_tasks) {
env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
@ -8045,7 +8091,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
if (env->flags & LBF_HMP_ACTIVE_BALANCE)
if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
goto force_balance;
if (bail_inter_cluster_balance(env, &sds))
@ -8257,7 +8303,7 @@ static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
if (env->flags & LBF_HMP_ACTIVE_BALANCE)
if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
return 1;
if (env->idle == CPU_NEWLY_IDLE) {
@ -8348,20 +8394,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
.imbalance = 0,
.flags = 0,
.loop = 0,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
.imbalance = 0,
.flags = 0,
.loop = 0,
.busiest_nr_running = 0,
.busiest_grp_capacity = 0,
.boost_policy = sched_boost_policy(),
};
/*
@ -8510,7 +8557,7 @@ more_balance:
no_move:
if (!ld_moved) {
if (!(env.flags & LBF_HMP_ACTIVE_BALANCE))
if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
schedstat_inc(sd, lb_failed[idle]);
/*
@ -8520,7 +8567,7 @@ no_move:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE &&
!(env.flags & LBF_HMP_ACTIVE_BALANCE))
!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@ -8797,6 +8844,7 @@ static int active_load_balance_cpu_stop(void *data)
.busiest_grp_capacity = 0,
.flags = 0,
.loop = 0,
.boost_policy = sched_boost_policy(),
};
bool moved = false;
@ -9272,7 +9320,8 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
if (rq->nr_running < 2)
return 0;
if (!sysctl_sched_restrict_cluster_spill || sched_boost())
if (!sysctl_sched_restrict_cluster_spill ||
sched_boost_policy() == SCHED_BOOST_ON_ALL)
return 1;
if (cpu_max_power_cost(cpu) == max_power_cost)


@ -17,8 +17,6 @@
#include <linux/cpufreq.h>
#include <linux/list_sort.h>
#include <linux/syscore_ops.h>
#include <linux/of.h>
#include <linux/sched/core_ctl.h>
#include "sched.h"
@ -231,52 +229,6 @@ fail:
return ret;
}
/*
* It is possible that CPUs of the same micro architecture can have slight
* differences in efficiency due to other factors like cache size. The
* BOOST_ON_BIG policy may not be optimal for such systems. The required
* boost policy can be specified via device tree to handle this.
*/
static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE;
/*
* This should be called after clusters are populated and
* the respective efficiency values are initialized.
*/
void init_sched_hmp_boost_policy(void)
{
/*
* Initialize the boost type here if it is not passed from
* device tree.
*/
if (sched_boost_policy == SCHED_BOOST_NONE) {
if (max_possible_efficiency != min_possible_efficiency)
sched_boost_policy = SCHED_BOOST_ON_BIG;
else
sched_boost_policy = SCHED_BOOST_ON_ALL;
}
}
void sched_hmp_parse_dt(void)
{
struct device_node *sn;
const char *boost_policy;
if (!sched_enable_hmp)
return;
sn = of_find_node_by_path("/sched-hmp");
if (!sn)
return;
if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
if (!strcmp(boost_policy, "boost-on-big"))
sched_boost_policy = SCHED_BOOST_ON_BIG;
else if (!strcmp(boost_policy, "boost-on-all"))
sched_boost_policy = SCHED_BOOST_ON_ALL;
}
}
unsigned int max_possible_efficiency = 1;
unsigned int min_possible_efficiency = UINT_MAX;
@ -680,29 +632,6 @@ int __init set_sched_enable_hmp(char *str)
early_param("sched_enable_hmp", set_sched_enable_hmp);
int got_boost_kick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
return test_bit(BOOST_KICK, &rq->hmp_flags);
}
inline void clear_boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
clear_bit(BOOST_KICK, &rq->hmp_flags);
}
inline void boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
smp_send_reschedule(cpu);
}
/* Clear any HMP scheduler related requests pending from or on cpu */
void clear_hmp_request(int cpu)
{
@ -840,6 +769,9 @@ min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
/* Maximum allowed threshold before freq aggregation must be enabled */
#define MAX_FREQ_AGGR_THRESH 1000
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
@ -919,8 +851,8 @@ static const unsigned int top_tasks_bitmap_size =
* C1 busy time = 5 + 5 + 6 = 16ms
*
*/
static __read_mostly unsigned int sched_freq_aggregate;
__read_mostly unsigned int sysctl_sched_freq_aggregate;
static __read_mostly unsigned int sched_freq_aggregate = 1;
__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
static unsigned int __read_mostly sched_freq_aggregate_threshold;
@ -937,14 +869,6 @@ unsigned int max_task_load(void)
/* Use this knob to turn on or off HMP-aware task placement logic */
unsigned int __read_mostly sched_enable_hmp;
/*
* Scheduler boost is a mechanism to temporarily place tasks on CPUs
* with higher capacity than those where a task would have normally
* ended up with their load characteristics. Any entity enabling
* boost is responsible for disabling it as well.
*/
unsigned int sysctl_sched_boost;
/* A cpu can no longer accommodate more tasks if:
*
* rq->nr_running > sysctl_sched_spill_nr_run ||
@ -995,6 +919,21 @@ unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
unsigned int __read_mostly sched_downmigrate;
unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
/*
* Task groups whose aggregate demand on a cpu is more than
* sched_group_upmigrate need to be up-migrated if possible.
*/
unsigned int __read_mostly sched_group_upmigrate;
unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
/*
* Task groups, once up-migrated, will need to drop their aggregate
* demand to less than sched_group_downmigrate before they are "down"
* migrated.
*/
unsigned int __read_mostly sched_group_downmigrate;
unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
/*
* The load scale factor of a CPU gets boosted when its max frequency
* is restricted due to which the tasks are migrating to higher capacity
@ -1017,33 +956,46 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
void update_up_down_migrate(void)
static void
_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
unsigned int delta;
if (up_down_migrate_scale_factor == 1024)
goto done;
return;
delta = up_migrate - down_migrate;
delta = *up_migrate - *down_migrate;
up_migrate /= NSEC_PER_USEC;
up_migrate *= up_down_migrate_scale_factor;
up_migrate >>= 10;
up_migrate *= NSEC_PER_USEC;
*up_migrate /= NSEC_PER_USEC;
*up_migrate *= up_down_migrate_scale_factor;
*up_migrate >>= 10;
*up_migrate *= NSEC_PER_USEC;
up_migrate = min(up_migrate, sched_ravg_window);
*up_migrate = min(*up_migrate, sched_ravg_window);
down_migrate /= NSEC_PER_USEC;
down_migrate *= up_down_migrate_scale_factor;
down_migrate >>= 10;
down_migrate *= NSEC_PER_USEC;
*down_migrate /= NSEC_PER_USEC;
*down_migrate *= up_down_migrate_scale_factor;
*down_migrate >>= 10;
*down_migrate *= NSEC_PER_USEC;
down_migrate = min(down_migrate, up_migrate - delta);
done:
*down_migrate = min(*down_migrate, *up_migrate - delta);
}
static void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
_update_up_down_migrate(&up_migrate, &down_migrate);
sched_upmigrate = up_migrate;
sched_downmigrate = down_migrate;
up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct);
down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct);
_update_up_down_migrate(&up_migrate, &down_migrate);
sched_group_upmigrate = up_migrate;
sched_group_downmigrate = down_migrate;
}
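
update_up_down_migrate() now runs the same scaling over both the per-task and the new per-group thresholds. A standalone sketch of the scaling arithmetic (the tunable values below are illustrative, not the patch defaults):

#include <stdio.h>

#define NSEC_PER_USEC 1000U

/* Illustrative stand-ins for the kernel tunables (not the real defaults). */
static unsigned int sched_ravg_window = 20000000;        /* 20 ms window, in ns */
static unsigned int up_down_migrate_scale_factor = 1280; /* >1024 scales up */

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

/*
 * Mirrors _update_up_down_migrate(): scale both thresholds by factor/1024,
 * clamp "up" to the window and keep "down" at least the original delta
 * below "up".
 */
static void scale_thresholds(unsigned int *up, unsigned int *down)
{
        unsigned int delta;

        if (up_down_migrate_scale_factor == 1024)
                return;

        delta = *up - *down;

        *up /= NSEC_PER_USEC;
        *up *= up_down_migrate_scale_factor;
        *up >>= 10;
        *up *= NSEC_PER_USEC;
        *up = min_u(*up, sched_ravg_window);

        *down /= NSEC_PER_USEC;
        *down *= up_down_migrate_scale_factor;
        *down >>= 10;
        *down *= NSEC_PER_USEC;
        *down = min_u(*down, *up - delta);
}

int main(void)
{
        unsigned int up = 16000000, down = 12000000; /* e.g. 80% / 60% of the window */

        scale_thresholds(&up, &down);
        printf("scaled: up=%u ns, down=%u ns\n", up, down);
        return 0;
}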
void set_hmp_defaults(void)
@ -1134,82 +1086,6 @@ u64 cpu_load_sync(int cpu, int sync)
return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
}
static int boost_refcount;
static DEFINE_SPINLOCK(boost_lock);
static DEFINE_MUTEX(boost_mutex);
static void boost_kick_cpus(void)
{
int i;
for_each_online_cpu(i) {
if (cpu_capacity(i) != max_capacity)
boost_kick(i);
}
}
int sched_boost(void)
{
return boost_refcount > 0;
}
int sched_set_boost(int enable)
{
unsigned long flags;
int ret = 0;
int old_refcount;
if (!sched_enable_hmp)
return -EINVAL;
spin_lock_irqsave(&boost_lock, flags);
old_refcount = boost_refcount;
if (enable == 1) {
boost_refcount++;
} else if (!enable) {
if (boost_refcount >= 1)
boost_refcount--;
else
ret = -EINVAL;
} else {
ret = -EINVAL;
}
if (!old_refcount && boost_refcount)
boost_kick_cpus();
if (boost_refcount <= 1)
core_ctl_set_boost(boost_refcount == 1);
trace_sched_set_boost(boost_refcount);
spin_unlock_irqrestore(&boost_lock, flags);
return ret;
}
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&boost_mutex);
if (!write)
sysctl_sched_boost = sched_boost();
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
ret = (sysctl_sched_boost <= 1) ?
sched_set_boost(sysctl_sched_boost) : -EINVAL;
done:
mutex_unlock(&boost_mutex);
return ret;
}
/*
* Task will fit on a cpu if its bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
@ -1219,60 +1095,63 @@ done:
* tasks with load close to the upmigrate threshold
*/
int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
enum sched_boost_type boost_type)
enum sched_boost_policy boost_policy)
{
int upmigrate;
int upmigrate = sched_upmigrate;
if (cpu_capacity(cpu) == max_capacity)
return 1;
if (boost_type != SCHED_BOOST_ON_BIG) {
if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
upmigrate = sched_downmigrate;
if (boost_policy != SCHED_BOOST_ON_BIG) {
if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
upmigrate_discouraged(p))
return 1;
upmigrate = sched_upmigrate;
if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
upmigrate = sched_downmigrate;
if (task_load < upmigrate)
return 1;
} else {
if (task_sched_boost(p) || task_load >= upmigrate)
return 0;
return 1;
}
return 0;
}
enum sched_boost_type sched_boost_type(void)
{
if (sched_boost())
return sched_boost_policy;
return SCHED_BOOST_NONE;
}
int task_will_fit(struct task_struct *p, int cpu)
{
u64 tload = scale_load_to_cpu(task_load(p), cpu);
return task_load_will_fit(p, tload, cpu, sched_boost_type());
return task_load_will_fit(p, tload, cpu, sched_boost_policy());
}
int group_will_fit(struct sched_cluster *cluster,
struct related_thread_group *grp, u64 demand)
static int
group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
u64 demand, bool group_boost)
{
int cpu = cluster_first_cpu(cluster);
int prev_capacity = 0;
unsigned int threshold = sched_upmigrate;
unsigned int threshold = sched_group_upmigrate;
u64 load;
if (cluster->capacity == max_capacity)
return 1;
if (group_boost)
return 0;
if (!demand)
return 1;
if (grp->preferred_cluster)
prev_capacity = grp->preferred_cluster->capacity;
if (cluster->capacity < prev_capacity)
threshold = sched_downmigrate;
threshold = sched_group_downmigrate;
load = scale_load_to_cpu(demand, cpu);
if (load < threshold)
@ -1495,6 +1374,23 @@ void post_big_task_count_change(const struct cpumask *cpus)
DEFINE_MUTEX(policy_mutex);
unsigned int update_freq_aggregate_threshold(unsigned int threshold)
{
unsigned int old_threshold;
mutex_lock(&policy_mutex);
old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
sysctl_sched_freq_aggregate_threshold_pct = threshold;
sched_freq_aggregate_threshold =
pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
mutex_unlock(&policy_mutex);
return old_threshold;
}
static inline int invalid_value_freq_input(unsigned int *data)
{
if (data == &sysctl_sched_freq_aggregate)
@ -1578,7 +1474,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
if (write && (old_val == *data))
goto done;
if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
sysctl_sched_group_downmigrate_pct >
sysctl_sched_group_upmigrate_pct) {
*data = old_val;
ret = -EINVAL;
goto done;
@ -3110,37 +3008,9 @@ static void reset_all_task_stats(void)
{
struct task_struct *g, *p;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
raw_spin_lock_irq(&p->pi_lock);
reset_task_stats(p);
raw_spin_unlock_irq(&p->pi_lock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
}
static void disable_window_stats(void)
{
unsigned long flags;
int i;
local_irq_save(flags);
for_each_possible_cpu(i)
raw_spin_lock(&cpu_rq(i)->lock);
sched_disable_window_stats = 1;
for_each_possible_cpu(i)
raw_spin_unlock(&cpu_rq(i)->lock);
local_irq_restore(flags);
}
/* Called with all cpu's rq->lock held */
static void enable_window_stats(void)
{
sched_disable_window_stats = 0;
}
enum reset_reason_code {
@ -3166,16 +3036,21 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
unsigned int old = 0, new = 0;
struct related_thread_group *grp;
local_irq_save(flags);
read_lock(&tasklist_lock);
read_lock(&related_thread_group_lock);
disable_window_stats();
/* Taking all runqueue locks prevents race with sched_exit(). */
for_each_possible_cpu(cpu)
raw_spin_lock(&cpu_rq(cpu)->lock);
sched_disable_window_stats = 1;
reset_all_task_stats();
local_irq_save(flags);
for_each_possible_cpu(cpu)
raw_spin_lock(&cpu_rq(cpu)->lock);
read_unlock(&tasklist_lock);
list_for_each_entry(grp, &related_thread_groups, list) {
int j;
@ -3196,7 +3071,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
}
enable_window_stats();
sched_disable_window_stats = 0;
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
@ -3239,10 +3114,10 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
for_each_possible_cpu(cpu)
raw_spin_unlock(&cpu_rq(cpu)->lock);
local_irq_restore(flags);
read_unlock(&related_thread_group_lock);
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
sched_ktime_clock() - start_ts, reason, old, new);
}
@ -3824,13 +3699,13 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
}
/* Return cluster which can offer required capacity for group */
static struct sched_cluster *
best_cluster(struct related_thread_group *grp, u64 total_demand)
static struct sched_cluster *best_cluster(struct related_thread_group *grp,
u64 total_demand, bool group_boost)
{
struct sched_cluster *cluster = NULL;
for_each_sched_cluster(cluster) {
if (group_will_fit(cluster, grp, total_demand))
if (group_will_fit(cluster, grp, total_demand, group_boost))
return cluster;
}
@ -3841,6 +3716,9 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
{
struct task_struct *p;
u64 combined_demand = 0;
bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
bool group_boost = false;
u64 wallclock;
if (!sysctl_sched_enable_colocation) {
grp->last_update = sched_ktime_clock();
@ -3848,31 +3726,43 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
return;
}
if (list_empty(&grp->tasks))
return;
wallclock = sched_ktime_clock();
/*
* Wakeup of two or more related tasks could race with each other and
* result in multiple calls to _set_preferred_cluster being issued
* at the same time. Avoid the overhead of rechecking the preferred
* cluster in such cases.
*/
if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
if (wallclock - grp->last_update < sched_ravg_window / 10)
return;
list_for_each_entry(p, &grp->tasks, grp_list)
list_for_each_entry(p, &grp->tasks, grp_list) {
if (boost_on_big && task_sched_boost(p)) {
group_boost = true;
break;
}
if (p->ravg.mark_start < wallclock -
(sched_ravg_window * sched_ravg_hist_size))
continue;
combined_demand += p->ravg.demand;
grp->preferred_cluster = best_cluster(grp, combined_demand);
}
grp->preferred_cluster = best_cluster(grp,
combined_demand, group_boost);
grp->last_update = sched_ktime_clock();
trace_sched_set_preferred_cluster(grp, combined_demand);
}
void set_preferred_cluster(struct related_thread_group *grp)
{
/*
* Prevent possible deadlock with update_children(). Not updating
* the preferred cluster once is not a big deal.
*/
if (!raw_spin_trylock(&grp->lock))
return;
raw_spin_lock(&grp->lock);
_set_preferred_cluster(grp);
raw_spin_unlock(&grp->lock);
}
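
_set_preferred_cluster() sums the recent demand of the group members, and best_cluster() picks the first cluster where group_will_fit() succeeds, using sched_group_downmigrate as the threshold when the candidate cluster is smaller than the current preference. A standalone sketch of that hysteresis (threshold and capacity values are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative thresholds in scaled-load units (not the patch defaults). */
static unsigned int sched_group_upmigrate = 100;
static unsigned int sched_group_downmigrate = 95;

/*
 * Mirrors the group_will_fit() decision: when the candidate cluster has
 * less capacity than the group's current preferred cluster, the lower
 * downmigrate threshold applies, so a group that has moved up does not
 * bounce back on a small dip in demand.
 */
static bool group_fits(unsigned int load, unsigned int cluster_capacity,
                       unsigned int preferred_capacity)
{
        unsigned int threshold = sched_group_upmigrate;

        if (cluster_capacity < preferred_capacity)
                threshold = sched_group_downmigrate;

        return load < threshold;
}

int main(void)
{
        /* Demand 110 no longer fits the little cluster: up-migrate. */
        printf("%d\n", group_fits(110, 512, 512));  /* 0 */
        /* After moving up, demand drops to 97: still >= 95, stay on big. */
        printf("%d\n", group_fits(97, 512, 1024));  /* 0 */
        /* Only below the downmigrate threshold does little fit again. */
        printf("%d\n", group_fits(90, 512, 1024));  /* 1 */
        return 0;
}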
@ -3880,6 +3770,8 @@ void set_preferred_cluster(struct related_thread_group *grp)
#define ADD_TASK 0
#define REM_TASK 1
#define DEFAULT_CGROUP_COLOC_ID 1
static inline void free_group_cputime(struct related_thread_group *grp)
{
free_percpu(grp->cpu_time);
@ -4116,64 +4008,19 @@ static void free_related_thread_group(struct rcu_head *rcu)
kfree(grp);
}
/*
* The thread group for a task can change while we are here. However,
* add_new_task_to_grp() will take care of any tasks that we miss here.
* When a parent exits, and a child thread is simultaneously exiting,
* sched_set_group_id() will synchronize those operations.
*/
static void update_children(struct task_struct *leader,
struct related_thread_group *grp, int event)
{
struct task_struct *child;
struct rq *rq;
unsigned long flags;
if (!thread_group_leader(leader))
return;
if (event == ADD_TASK && !sysctl_sched_enable_thread_grouping)
return;
if (thread_group_empty(leader))
return;
child = next_thread(leader);
do {
rq = task_rq_lock(child, &flags);
if (event == REM_TASK && child->grp && grp == child->grp) {
transfer_busy_time(rq, grp, child, event);
list_del_init(&child->grp_list);
rcu_assign_pointer(child->grp, NULL);
} else if (event == ADD_TASK && !child->grp) {
transfer_busy_time(rq, grp, child, event);
list_add(&child->grp_list, &grp->tasks);
rcu_assign_pointer(child->grp, grp);
}
task_rq_unlock(rq, child, &flags);
} while_each_thread(leader, child);
}
static void remove_task_from_group(struct task_struct *p)
{
struct related_thread_group *grp = p->grp;
struct rq *rq;
int empty_group = 1;
unsigned long flags;
raw_spin_lock(&grp->lock);
rq = task_rq_lock(p, &flags);
rq = __task_rq_lock(p);
transfer_busy_time(rq, p->grp, p, REM_TASK);
list_del_init(&p->grp_list);
rcu_assign_pointer(p->grp, NULL);
task_rq_unlock(rq, p, &flags);
update_children(p, grp, REM_TASK);
__task_rq_unlock(rq);
if (!list_empty(&grp->tasks)) {
empty_group = 0;
@ -4182,7 +4029,8 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_unlock(&grp->lock);
if (empty_group) {
/* Reserved groups cannot be destroyed */
if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
list_del(&grp->list);
call_rcu(&grp->rcu, free_related_thread_group);
}
@ -4192,7 +4040,6 @@ static int
add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
{
struct rq *rq;
unsigned long flags;
raw_spin_lock(&grp->lock);
@ -4200,13 +4047,11 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
* Change p->grp under rq->lock. Will prevent races with read-side
* reference of p->grp in various hot-paths
*/
rq = task_rq_lock(p, &flags);
rq = __task_rq_lock(p);
transfer_busy_time(rq, grp, p, ADD_TASK);
list_add(&p->grp_list, &grp->tasks);
rcu_assign_pointer(p->grp, grp);
task_rq_unlock(rq, p, &flags);
update_children(p, grp, ADD_TASK);
__task_rq_unlock(rq);
_set_preferred_cluster(grp);
@ -4219,23 +4064,33 @@ void add_new_task_to_grp(struct task_struct *new)
{
unsigned long flags;
struct related_thread_group *grp;
struct task_struct *parent;
struct task_struct *leader = new->group_leader;
unsigned int leader_grp_id = sched_get_group_id(leader);
if (!sysctl_sched_enable_thread_grouping)
if (!sysctl_sched_enable_thread_grouping &&
leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
return;
if (thread_group_leader(new))
return;
parent = new->group_leader;
if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
if (!same_schedtune(new, leader))
return;
}
write_lock_irqsave(&related_thread_group_lock, flags);
rcu_read_lock();
grp = task_related_thread_group(parent);
grp = task_related_thread_group(leader);
rcu_read_unlock();
/* Its possible that update_children() already added us to the group */
/*
* It's possible that someone already added the new task to the
* group. A leader's thread group is updated prior to calling
* this function. It's also possible that the leader has exited
* the group. In either case, there is nothing else to do.
*/
if (!grp || new->grp) {
write_unlock_irqrestore(&related_thread_group_lock, flags);
return;
@ -4250,14 +4105,55 @@ void add_new_task_to_grp(struct task_struct *new)
write_unlock_irqrestore(&related_thread_group_lock, flags);
}
#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
/*
* We create a default colocation group at boot. There is no need to
* synchronize tasks between cgroups at creation time because the
* correct cgroup hierarchy is not available at boot. Therefore cgroup
* colocation is turned off by default even though the colocation group
* itself has been allocated. Furthermore, this colocation group cannot
* be destroyed once it has been created. All of this has been done as
* part of runtime optimizations.
*
* The job of synchronizing tasks to the colocation group is done when
* the colocation flag in the cgroup is turned on.
*/
static int __init create_default_coloc_group(void)
{
struct related_thread_group *grp = NULL;
unsigned long flags;
grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
if (IS_ERR(grp)) {
WARN_ON(1);
return -ENOMEM;
}
write_lock_irqsave(&related_thread_group_lock, flags);
list_add(&grp->list, &related_thread_groups);
write_unlock_irqrestore(&related_thread_group_lock, flags);
update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
return 0;
}
late_initcall(create_default_coloc_group);
int sync_cgroup_colocation(struct task_struct *p, bool insert)
{
unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
return sched_set_group_id(p, grp_id);
}
#endif
int sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
int rc = 0;
unsigned long flags;
struct related_thread_group *grp = NULL;
/* Prevents tasks from exiting while we are managing groups. */
write_lock_irqsave(&related_thread_group_lock, flags);
raw_spin_lock_irqsave(&p->pi_lock, flags);
write_lock(&related_thread_group_lock);
/* Switching from one group to another directly is not permitted */
if ((current != p && p->flags & PF_EXITING) ||
@ -4272,6 +4168,12 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
grp = lookup_related_thread_group(group_id);
if (!grp) {
/* This is a reserved id */
if (group_id == DEFAULT_CGROUP_COLOC_ID) {
rc = -EINVAL;
goto done;
}
grp = alloc_related_thread_group(group_id);
if (IS_ERR(grp)) {
rc = -ENOMEM;
@ -4281,10 +4183,10 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
list_add(&grp->list, &related_thread_groups);
}
BUG_ON(!grp);
rc = add_task_to_group(p, grp);
done:
write_unlock_irqrestore(&related_thread_group_lock, flags);
write_unlock(&related_thread_group_lock);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return rc;
}
@ -4529,7 +4431,7 @@ bool early_detection_notify(struct rq *rq, u64 wallclock)
struct task_struct *p;
int loop_max = 10;
if (!sched_boost() || !rq->cfs.h_nr_running)
if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
return 0;
rq->ed_task = NULL;


@ -1677,8 +1677,13 @@ static int find_lowest_rq_hmp(struct task_struct *task)
int prev_cpu = task_cpu(task);
u64 cpu_load, min_load = ULLONG_MAX;
int i;
int restrict_cluster = sched_boost() ? 0 :
sysctl_sched_restrict_cluster_spill;
int restrict_cluster;
int boost_on_big;
boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
sched_boost_policy() == SCHED_BOOST_ON_BIG;
restrict_cluster = sysctl_sched_restrict_cluster_spill;
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
@ -1697,6 +1702,9 @@ static int find_lowest_rq_hmp(struct task_struct *task)
*/
for_each_sched_cluster(cluster) {
if (boost_on_big && cluster->capacity != max_possible_capacity)
continue;
cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
cpumask_andnot(&candidate_mask, &candidate_mask,
cpu_isolated_mask);


@ -1061,8 +1061,6 @@ extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
extern unsigned int min_max_possible_capacity;
extern unsigned int max_power_cost;
extern unsigned int sched_upmigrate;
extern unsigned int sched_downmigrate;
extern unsigned int sched_init_task_load_windows;
extern unsigned int up_down_migrate_scale_factor;
extern unsigned int sysctl_sched_restrict_cluster_spill;
@ -1106,18 +1104,23 @@ extern void sched_account_irqstart(int cpu, struct task_struct *curr,
u64 wallclock);
extern unsigned int cpu_temp(int cpu);
extern unsigned int nr_eligible_big_tasks(int cpu);
extern void update_up_down_migrate(void);
extern int update_preferred_cluster(struct related_thread_group *grp,
struct task_struct *p, u32 old_load);
extern void set_preferred_cluster(struct related_thread_group *grp);
extern void add_new_task_to_grp(struct task_struct *new);
extern unsigned int update_freq_aggregate_threshold(unsigned int threshold);
enum sched_boost_type {
enum sched_boost_policy {
SCHED_BOOST_NONE,
SCHED_BOOST_ON_BIG,
SCHED_BOOST_ON_ALL,
};
#define NO_BOOST 0
#define FULL_THROTTLE_BOOST 1
#define CONSERVATIVE_BOOST 2
#define RESTRAINED_BOOST 3
static inline struct sched_cluster *cpu_cluster(int cpu)
{
return cpu_rq(cpu)->cluster;
@ -1387,14 +1390,11 @@ extern void set_hmp_defaults(void);
extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost);
extern unsigned int power_cost(int cpu, u64 demand);
extern void reset_all_window_stats(u64 window_start, unsigned int window_size);
extern void boost_kick(int cpu);
extern int sched_boost(void);
extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
enum sched_boost_type boost_type);
extern enum sched_boost_type sched_boost_type(void);
enum sched_boost_policy boost_policy);
extern enum sched_boost_policy sched_boost_policy(void);
extern int task_will_fit(struct task_struct *p, int cpu);
extern int group_will_fit(struct sched_cluster *cluster,
struct related_thread_group *grp, u64 demand);
extern u64 cpu_load(int cpu);
extern u64 cpu_load_sync(int cpu, int sync);
extern int preferred_cluster(struct sched_cluster *cluster,
@ -1422,10 +1422,32 @@ extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft);
extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
struct cftype *cft, u64 upmigrate_discourage);
extern void sched_hmp_parse_dt(void);
extern void init_sched_hmp_boost_policy(void);
extern void sched_boost_parse_dt(void);
extern void clear_top_tasks_bitmap(unsigned long *bitmap);
#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
extern bool task_sched_boost(struct task_struct *p);
extern int sync_cgroup_colocation(struct task_struct *p, bool insert);
extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2);
extern void update_cgroup_boost_settings(void);
extern void restore_cgroup_boost_settings(void);
#else
static inline bool
same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
{
return true;
}
static inline bool task_sched_boost(struct task_struct *p)
{
return true;
}
static inline void update_cgroup_boost_settings(void) { }
static inline void restore_cgroup_boost_settings(void) { }
#endif
#else /* CONFIG_SCHED_HMP */
struct hmp_sched_stats;
@ -1615,8 +1637,7 @@ static inline void post_big_task_count_change(void) { }
static inline void set_hmp_defaults(void) { }
static inline void clear_reserved(int cpu) { }
static inline void sched_hmp_parse_dt(void) {}
static inline void init_sched_hmp_boost_policy(void) {}
static inline void sched_boost_parse_dt(void) {}
#define trace_sched_cpu_load(...)
#define trace_sched_cpu_load_lb(...)


@ -25,6 +25,33 @@ struct schedtune {
/* Boost value for tasks on that SchedTune CGroup */
int boost;
#ifdef CONFIG_SCHED_HMP
/* Toggle ability to override sched boost enabled */
bool sched_boost_no_override;
/*
* Controls whether a cgroup is eligible for sched boost or not. This
* can temporarily be disabled by the kernel based on the no_override
* flag above.
*/
bool sched_boost_enabled;
/*
* This tracks the default value of sched_boost_enabled and is used to
* restore the value following any temporary changes to that flag.
*/
bool sched_boost_enabled_backup;
/*
* Controls whether tasks of this cgroup should be colocated with each
* other and tasks of other cgroups that have the same flag turned on.
*/
bool colocate;
/* Controls whether further updates are allowed to the colocate flag */
bool colocate_update_disabled;
#endif
};
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
@ -54,6 +81,13 @@ static inline struct schedtune *parent_st(struct schedtune *st)
static struct schedtune
root_schedtune = {
.boost = 0,
#ifdef CONFIG_SCHED_HMP
.sched_boost_no_override = false,
.sched_boost_enabled = true,
.sched_boost_enabled_backup = true,
.colocate = false,
.colocate_update_disabled = false,
#endif
};
/*
@ -97,6 +131,121 @@ struct boost_groups {
/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
#ifdef CONFIG_SCHED_HMP
static inline void init_sched_boost(struct schedtune *st)
{
st->sched_boost_no_override = false;
st->sched_boost_enabled = true;
st->sched_boost_enabled_backup = st->sched_boost_enabled;
st->colocate = false;
st->colocate_update_disabled = false;
}
bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
{
return task_schedtune(tsk1) == task_schedtune(tsk2);
}
void update_cgroup_boost_settings(void)
{
int i;
for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
if (!allocated_group[i])
break;
if (allocated_group[i]->sched_boost_no_override)
continue;
allocated_group[i]->sched_boost_enabled = false;
}
}
void restore_cgroup_boost_settings(void)
{
int i;
for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
if (!allocated_group[i])
break;
allocated_group[i]->sched_boost_enabled =
allocated_group[i]->sched_boost_enabled_backup;
}
}
bool task_sched_boost(struct task_struct *p)
{
struct schedtune *st = task_schedtune(p);
return st->sched_boost_enabled;
}
static u64
sched_boost_override_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct schedtune *st = css_st(css);
return st->sched_boost_no_override;
}
static int sched_boost_override_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 override)
{
struct schedtune *st = css_st(css);
st->sched_boost_no_override = !!override;
return 0;
}
static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct schedtune *st = css_st(css);
return st->sched_boost_enabled;
}
static int sched_boost_enabled_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 enable)
{
struct schedtune *st = css_st(css);
st->sched_boost_enabled = !!enable;
st->sched_boost_enabled_backup = st->sched_boost_enabled;
return 0;
}
static u64 sched_colocate_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct schedtune *st = css_st(css);
return st->colocate;
}
static int sched_colocate_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 colocate)
{
struct schedtune *st = css_st(css);
if (st->colocate_update_disabled)
return -EPERM;
st->colocate = !!colocate;
st->colocate_update_disabled = true;
return 0;
}
#else /* CONFIG_SCHED_HMP */
static inline void init_sched_boost(struct schedtune *st) { }
#endif /* CONFIG_SCHED_HMP */
static u64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
@ -121,12 +270,45 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
return 0;
}
static void schedtune_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
struct schedtune *st;
bool colocate;
cgroup_taskset_first(tset, &css);
st = css_st(css);
colocate = st->colocate;
cgroup_taskset_for_each(task, css, tset)
sync_cgroup_colocation(task, colocate);
}
static struct cftype files[] = {
{
.name = "boost",
.read_u64 = boost_read,
.write_u64 = boost_write,
},
#ifdef CONFIG_SCHED_HMP
{
.name = "sched_boost_no_override",
.read_u64 = sched_boost_override_read,
.write_u64 = sched_boost_override_write,
},
{
.name = "sched_boost_enabled",
.read_u64 = sched_boost_enabled_read,
.write_u64 = sched_boost_enabled_write,
},
{
.name = "colocate",
.read_u64 = sched_colocate_read,
.write_u64 = sched_colocate_write,
},
#endif
{ } /* terminate */
};
@ -189,6 +371,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
/* Initialize per CPUs boost group support */
st->idx = idx;
init_sched_boost(st);
if (schedtune_boostgroup_init(st))
goto release;
@ -222,6 +405,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
.legacy_cftypes = files,
.early_init = 1,
.allow_attach = subsys_cgroup_allow_attach,
.attach = schedtune_attach,
};
#endif /* CONFIG_CGROUP_SCHEDTUNE */
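
The schedtune changes above add three per-cgroup attributes. A user-space sketch of setting them (the /dev/stune mount point and the top-app group are assumptions; the file names follow the cftype entries with the usual controller prefix):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_attr(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return -1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
        return 0;
}

int main(void)
{
        /*
         * Colocate tasks of this group in the default colocation group.
         * The patch makes this a one-shot setting: later writes get -EPERM.
         */
        write_attr("/dev/stune/top-app/schedtune.colocate", "1");
        /* Allow tasks of this group to honour a global sched boost. */
        write_attr("/dev/stune/top-app/schedtune.sched_boost_enabled", "1");
        return 0;
}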


@ -124,6 +124,7 @@ static int __maybe_unused neg_one = -1;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
@ -376,6 +377,22 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
{
.procname = "sched_group_upmigrate",
.data = &sysctl_sched_group_upmigrate_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
.extra1 = &zero,
},
{
.procname = "sched_group_downmigrate",
.data = &sysctl_sched_group_downmigrate_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
.extra1 = &zero,
},
{
.procname = "sched_init_task_load",
.data = &sysctl_sched_init_task_load_pct,
@ -487,6 +504,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_boost_handler,
.extra1 = &zero,
.extra2 = &three,
},
#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SCHED_DEBUG
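
With the extra1/extra2 bounds added above, the boost sysctl accepts the four boost types (0-3). A user-space sketch of driving it through procfs (assuming the entry is named sched_boost under the kernel sysctl table, hence /proc/sys/kernel/sched_boost):

#include <stdio.h>

#define NO_BOOST            0
#define FULL_THROTTLE_BOOST 1
#define CONSERVATIVE_BOOST  2
#define RESTRAINED_BOOST    3

static int set_sched_boost(int type)
{
        FILE *f = fopen("/proc/sys/kernel/sched_boost", "w");

        if (!f)
                return -1;
        fprintf(f, "%d\n", type);
        return fclose(f);
}

int main(void)
{
        /*
         * Boost has to be switched off before another type can be selected;
         * direct type-to-type transitions are rejected with -EINVAL.
         */
        set_sched_boost(CONSERVATIVE_BOOST);
        set_sched_boost(NO_BOOST);
        set_sched_boost(RESTRAINED_BOOST);
        return 0;
}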