From 0dee0d1411e4ba837089a769a5bcce57a5a14df2 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Fri, 9 Sep 2016 19:38:03 +0530
Subject: [PATCH] sched: Avoid waking idle cpu for short-burst tasks

Introduce the sched_short_burst tunable to classify "short-burst"
tasks. These tasks are eligible for packing to avoid the overhead
associated with waking up an idle CPU. For such tasks,
select_best_cpu() ignores power cost and selects the CPU with the
least wakeup latency that is not loaded with IRQs and can
accommodate the task without exceeding spill limits. Ties are
broken by load and then by previous CPU.

This policy does not affect cluster selection; it only affects CPU
selection within the selected cluster. Tasks eligible for
"wake-up-idle" and "boost" are not considered for packing. The
policy applies to both "fair" and "rt" scheduling class tasks.

Change-Id: I2a05493fde93f58636725f18d0ce8dbce4418a30
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Pavankumar Kondeti
---
 Documentation/scheduler/sched-hmp.txt | 10 +++++++
 include/linux/sched/sysctl.h          |  1 +
 kernel/sched/fair.c                   | 39 +++++++++++++++++++++------
 kernel/sched/hmp.c                    | 17 +++++++++++-
 kernel/sched/rt.c                     | 18 +++++++++++++
 kernel/sched/sched.h                  | 10 +++++++
 kernel/sysctl.c                       |  7 +++++
 7 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 091d49ea80cf..766c01d321b5 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -726,6 +726,16 @@ d. /proc/sys/kernel/sched_select_prev_cpu_us
    Default value of sched_select_prev_cpu_us is 2000 (2ms). This can be
    turned off by setting it to 0.
 
+e. /proc/sys/kernel/sched_short_burst_ns
+   This threshold controls whether a task is considered "short-burst".
+   "Short-burst" tasks are eligible for packing to avoid the overhead
+   associated with waking up an idle CPU. Only non-idle CPUs that are
+   not loaded with IRQs and can accommodate the waking task without
+   exceeding spill limits are considered. Ties are broken by load and
+   then by previous CPU. This tunable does not affect cluster selection;
+   it only affects CPU selection within a given cluster. Packing is
+   skipped for tasks that are eligible for "wake-up-idle" and "boost".
+
 **** 5.2.4 Wakeup Logic for Task "p"
 
 Wakeup task placement logic is as follows:
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5d0899df64ff..e4aff5e6e17f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -68,6 +68,7 @@ extern unsigned int sysctl_sched_freq_aggregate;
 extern unsigned int sysctl_sched_enable_thread_grouping;
 extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
 extern unsigned int sysctl_sched_prefer_sync_wakee_to_waker;
+extern unsigned int sysctl_sched_short_burst;
 
 #else /* CONFIG_SCHED_HMP */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3db77aff2433..95b961dc7b14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2591,6 +2591,7 @@ static u32 __compute_runnable_contrib(u64 n)
 #define SBC_FLAG_CSTATE_LOAD		0x100
 #define SBC_FLAG_BEST_SIBLING		0x200
 #define SBC_FLAG_WAKER_CPU		0x400
+#define SBC_FLAG_PACK_TASK		0x800
 
 /* Cluster selection flag */
 #define SBC_FLAG_COLOC_CLUSTER		0x10000
@@ -2607,6 +2608,7 @@ struct cpu_select_env {
 	u8 sync:1;
 	u8 ignore_prev_cpu:1;
 	enum sched_boost_policy boost_policy;
+	u8 pack_task:1;
 	int prev_cpu;
 	DECLARE_BITMAP(candidate_list, NR_CPUS);
 	DECLARE_BITMAP(backup_list, NR_CPUS);
@@ -2958,8 +2960,17 @@ static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
 {
 	int cpu_cost;
 
-	cpu_cost = power_cost(cpu, task_load(env->p) +
+	/*
+	 * We try to find the least loaded *busy* CPU irrespective
+	 * of the power cost.
+	 */
+	if (env->pack_task)
+		cpu_cost = cpu_min_power_cost(cpu);
+
+	else
+		cpu_cost = power_cost(cpu, task_load(env->p) +
 				cpu_cravg_sync(cpu, env->sync));
+
 	if (cpu_cost <= stats->min_cost)
 		__update_cluster_stats(cpu, stats, env, cpu_cost);
 }
@@ -3034,6 +3045,15 @@ static inline int wake_to_idle(struct task_struct *p)
 		(p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
 }
 
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+	if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+	    env->reason)
+		return true;
+
+	return false;
+}
+
 static inline bool
 bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 {
@@ -3041,9 +3061,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 	struct task_struct *task = env->p;
 	struct sched_cluster *cluster;
 
-	if (env->boost_policy != SCHED_BOOST_NONE || env->reason ||
-	    !task->ravg.mark_start ||
-	    env->need_idle || !sched_short_sleep_task_threshold)
+	if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
 		return false;
 
 	prev_cpu = env->prev_cpu;
@@ -3092,8 +3110,7 @@
 static inline bool
 wake_to_waker_cluster(struct cpu_select_env *env)
 {
-	return env->boost_policy == SCHED_BOOST_NONE &&
-	       !env->need_idle && !env->reason && env->sync &&
+	return env->sync &&
 	       task_load(current) > sched_big_waker_task_load &&
 	       task_load(env->p) < sched_small_wakee_task_load;
 }
@@ -3118,7 +3135,6 @@ cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
 	return !cpumask_empty(&tmp_mask);
 }
 
-
 /* return cheapest cpu that can fit this task */
 static int select_best_cpu(struct task_struct *p, int target, int reason,
 			   int sync)
@@ -3128,6 +3144,7 @@
 	struct related_thread_group *grp;
 	unsigned int sbc_flag = 0;
 	int cpu = raw_smp_processor_id();
+	bool special;
 
 	struct cpu_select_env env = {
 		.p = p,
@@ -3140,6 +3157,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 		.rtg = NULL,
 		.sbc_best_flag = 0,
 		.sbc_best_cluster_flag = 0,
+		.pack_task = false,
 	};
 
 	env.boost_policy = task_sched_boost(p) ?
@@ -3149,6 +3167,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 
 	bitmap_zero(env.backup_list, NR_CPUS);
 	init_cluster_cpu_stats(&stats);
+	special = env_has_special_flags(&env);
 
 	rcu_read_lock();
 
@@ -3160,7 +3179,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 			clear_bit(pref_cluster->id, env.candidate_list);
 		else
 			env.rtg = grp;
-	} else {
+	} else if (!special) {
 		cluster = cpu_rq(cpu)->cluster;
 		if (wake_to_waker_cluster(&env)) {
 			if (bias_to_waker_cpu(p, cpu)) {
@@ -3181,6 +3200,10 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 		}
 	}
 
+	if (!special && is_short_burst_task(p)) {
+		env.pack_task = true;
+		sbc_flag = SBC_FLAG_PACK_TASK;
+	}
 retry:
 	cluster = select_least_power_cluster(&env);
 
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index b2f3013bfe31..95e618ee1124 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -961,6 +961,13 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
 
 unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
 
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds, it is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+
 static void
 _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
 {
@@ -1553,7 +1560,13 @@ void init_new_task_load(struct task_struct *p, bool idle_task)
 	memset(&p->ravg, 0, sizeof(struct ravg));
 	p->cpu_cycles = 0;
 	p->ravg.curr_burst = 0;
-	p->ravg.avg_burst = 0;
+	/*
+	 * Initialize the avg_burst to twice the threshold, so that
+	 * a task would not be classified as short burst right away
+	 * after fork. It takes at least 6 sleep-wakeup cycles for
+	 * the avg_burst to go below the threshold.
+	 */
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
 
 	p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
 	p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
@@ -2987,6 +3000,8 @@ void reset_task_stats(struct task_struct *p)
 	p->ravg.curr_window_cpu = curr_window_ptr;
 	p->ravg.prev_window_cpu = prev_window_ptr;
 
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
 	/* Retain EXITING_TASK marker */
 	p->ravg.sum_history[0] = sum;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 52edd6b158ed..624bededfb85 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1679,6 +1679,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 	int i;
 	int restrict_cluster;
 	int boost_on_big;
+	int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
 
 	boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
 			sched_boost_policy() == SCHED_BOOST_ON_BIG;
@@ -1695,6 +1696,8 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return best_cpu; /* No targets found */
 
+	pack_task = is_short_burst_task(task);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
@@ -1720,6 +1723,20 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 			if (!restrict_cluster)
 				cpu_load = scale_load_to_cpu(cpu_load, i);
 
+			if (pack_task) {
+				wakeup_latency = cpu_rq(i)->wakeup_latency;
+
+				if (wakeup_latency > least_wakeup_latency)
+					continue;
+
+				if (wakeup_latency < least_wakeup_latency) {
+					least_wakeup_latency = wakeup_latency;
+					min_load = cpu_load;
+					best_cpu = i;
+					continue;
+				}
+			}
+
 			if (cpu_load < min_load ||
 			    (cpu_load == min_load &&
 			     (i == prev_cpu || (best_cpu != prev_cpu &&
@@ -1728,6 +1745,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
 				best_cpu = i;
 			}
 		}
+
 		if (restrict_cluster && best_cpu != -1)
 			break;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6b9f11d9a47c..b9a109e5ef94 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1188,6 +1188,11 @@ static inline int cpu_max_power_cost(int cpu)
 	return cpu_rq(cpu)->cluster->max_power_cost;
 }
 
+static inline int cpu_min_power_cost(int cpu)
+{
+	return cpu_rq(cpu)->cluster->min_power_cost;
+}
+
 static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
 {
 	return div64_u64(cycles, period);
@@ -1385,6 +1390,11 @@ static inline u64 cpu_cravg_sync(int cpu, int sync)
 	return load;
 }
 
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+	return p->ravg.avg_burst < sysctl_sched_short_burst;
+}
+
 extern void check_for_migration(struct rq *rq, struct task_struct *p);
 extern void pre_big_task_count_change(const struct cpumask *cpus);
 extern void post_big_task_count_change(const struct cpumask *cpus);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b7cbd7940f7b..d4682d0cdeb1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -507,6 +507,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &three,
 	},
+	{
+		.procname	= "sched_short_burst_ns",
+		.data		= &sysctl_sched_short_burst,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif	/* CONFIG_SCHED_HMP */
 #ifdef CONFIG_SCHED_DEBUG
 	{
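
Reviewer note (editorial, not part of the commit): the fork-time value of
2 * sysctl_sched_short_burst set in init_new_task_load() only makes sense
if avg_burst decays toward the observed burst lengths over successive
sleep-wakeup cycles. The standalone user-space sketch below illustrates
that arithmetic. It assumes avg_burst is updated with an update_avg()
style rule (avg += (sample - avg) / 8); that rule, the 50000 ns threshold,
and the 1000 ns burst length are illustrative assumptions, since the
avg_burst update path is not part of this diff.

/*
 * Editorial sketch (plain user-space C, not kernel code): how a value
 * initialized to twice the threshold decays below the threshold.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t sysctl_sched_short_burst = 50000;	/* example: 50 us */

/* Mirrors is_short_burst_task(): average burst strictly below threshold. */
static bool is_short_burst(uint64_t avg_burst)
{
	return avg_burst < sysctl_sched_short_burst;
}

/* Assumed decay rule: move 1/8 of the way toward the new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff / 8;
}

int main(void)
{
	/* Fork-time value from init_new_task_load(): twice the threshold. */
	uint64_t avg_burst = 2 * sysctl_sched_short_burst;
	int cycles = 0;

	/* A task whose bursts are roughly 1 us long. */
	while (!is_short_burst(avg_burst)) {
		update_avg(&avg_burst, 1000);
		cycles++;
	}

	/* Prints 6 with these numbers, matching the comment in the patch. */
	printf("short-burst after %d sleep-wakeup cycles (avg %llu ns)\n",
	       cycles, (unsigned long long)avg_burst);
	return 0;
}

Also worth noting: sysctl_sched_short_burst has no initializer, so it
defaults to 0 and is_short_burst_task() can never return true; packing
stays off until an administrator raises the threshold, for example with
"echo 50000 > /proc/sys/kernel/sched_short_burst_ns" (50 us is only an
example value).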