From 078568e4259847be3cecf72052606abb93b7eed0 Mon Sep 17 00:00:00 2001
From: Pavankumar Kondeti
Date: Tue, 31 May 2016 12:34:52 +0530
Subject: [PATCH 1/2] sched: Introduce sched_freq_aggregate_threshold tunable

Do the frequency aggregation only when the total group busy time is
above sched_freq_aggregate_threshold. This filtering is especially
needed in cases where groups are created by including all threads of
an application process. The knob can be tuned so that aggregation is
applied only to heavy workloads. When the knob is set and the load is
aggregated, the reported load is not clipped to 100% at the current
frequency, which lets the frequency ramp up faster.

Change-Id: Icfd91c85938def101a989af3597d3dcaa8026d16
Signed-off-by: Pavankumar Kondeti
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/hmp.c           | 55 ++++++++++++++++++++++++++++--------
 kernel/sysctl.c              |  7 +++++
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 68a9bdde6604..1f9c2c734b20 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -63,6 +63,7 @@ extern unsigned int sysctl_sched_new_task_windows;
 extern unsigned int sysctl_sched_pred_alert_freq;
 extern unsigned int sysctl_sched_freq_aggregate;
 extern unsigned int sysctl_sched_enable_thread_grouping;
+extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
 
 #else /* CONFIG_SCHED_HMP */
 
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 8da0147b4f89..447f3880f645 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -821,6 +821,9 @@ static DEFINE_RWLOCK(related_thread_group_lock);
 static __read_mostly unsigned int sched_freq_aggregate;
 __read_mostly unsigned int sysctl_sched_freq_aggregate;
 
+unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
+static unsigned int __read_mostly sched_freq_aggregate_threshold;
+
 /* Initial task load. Newly created tasks are assigned this load. */
 unsigned int __read_mostly sched_init_task_load_windows;
 unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
@@ -959,6 +962,9 @@ void set_hmp_defaults(void)
 	sched_big_waker_task_load =
 		div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
 			  (u64)sched_ravg_window, 100);
+
+	sched_freq_aggregate_threshold =
+		pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
 }
 
 u32 sched_get_init_task_load(struct task_struct *p)
@@ -1475,7 +1481,18 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
 	if (write && (old_val == *data))
 		goto done;
 
-	if (data != &sysctl_sched_select_prev_cpu_us) {
+	/*
+	 * Special handling for sched_freq_aggregate_threshold_pct
+	 * which can be greater than 100. Use 1000 as an upper bound
+	 * value which works for all practical use cases.
+	 */
+	if (data == &sysctl_sched_freq_aggregate_threshold_pct) {
+		if (*data > 1000) {
+			*data = old_val;
+			ret = -EINVAL;
+			goto done;
+		}
+	} else if (data != &sysctl_sched_select_prev_cpu_us) {
 		/*
 		 * all tunables other than sched_select_prev_cpu_us are
 		 * in percentage.
@@ -2947,6 +2964,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	u64 max_prev_sum = 0;
 	int max_busy_cpu = cpumask_first(query_cpus);
 	struct related_thread_group *grp;
+	u64 total_group_load = 0, total_ngload = 0;
+	bool aggregate_load = false;
 
 	if (unlikely(cpus == 0))
 		return;
@@ -3006,6 +3025,11 @@ void sched_get_cpus_busy(struct sched_load *busy,
 		}
 	}
 
+	group_load_in_freq_domain(
+			&cpu_rq(max_busy_cpu)->freq_domain_cpumask,
+			&total_group_load, &total_ngload);
+	aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
+
 	i = 0;
 	for_each_cpu(cpu, query_cpus) {
 		group_load[i] = 0;
@@ -3015,11 +3039,11 @@ void sched_get_cpus_busy(struct sched_load *busy,
 			goto skip_early;
 
 		rq = cpu_rq(cpu);
-		if (!notifier_sent) {
-			if (cpu == max_busy_cpu)
-				group_load_in_freq_domain(
-					&rq->freq_domain_cpumask,
-					&group_load[i], &ngload[i]);
+		if (aggregate_load) {
+			if (cpu == max_busy_cpu) {
+				group_load[i] = total_group_load;
+				ngload[i] = total_ngload;
+			}
 		} else {
 			_group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
 		}
@@ -3056,7 +3080,19 @@ skip_early:
 			goto exit_early;
 		}
 
-		if (!notifier_sent) {
+		/*
+		 * When the load aggregation is controlled by
+		 * sched_freq_aggregate_threshold, allow reporting loads
+		 * greater than 100 @ Fcur to ramp up the frequency
+		 * faster.
+		 */
+		if (notifier_sent || (aggregate_load &&
+				      sched_freq_aggregate_threshold)) {
+			load[i] = scale_load_to_freq(load[i], max_freq[i],
+					cpu_max_possible_freq(cpu));
+			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+					cpu_max_possible_freq(cpu));
+		} else {
 			load[i] = scale_load_to_freq(load[i], max_freq[i],
 						     cur_freq[i]);
 			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
@@ -3070,11 +3106,6 @@ skip_early:
 						     cpu_max_possible_freq(cpu));
 			nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
 						     cpu_max_possible_freq(cpu));
-		} else {
-			load[i] = scale_load_to_freq(load[i], max_freq[i],
-						     cpu_max_possible_freq(cpu));
-			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
-						     cpu_max_possible_freq(cpu));
 		}
 		pload[i] = scale_load_to_freq(pload[i], max_freq[i],
 					      rq->cluster->max_possible_freq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ac34212f6881..07fef40d1274 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -434,6 +434,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_window_update_handler,
 	},
+	{
+		.procname	= "sched_freq_aggregate_threshold",
+		.data		= &sysctl_sched_freq_aggregate_threshold_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_hmp_proc_update_handler,
+	},
 	{
 		.procname	= "sched_boost",
 		.data		= &sysctl_sched_boost,

From 3506942e600a29dd135e80b15a3620221d38a2eb Mon Sep 17 00:00:00 2001
From: Syed Rameez Mustafa
Date: Thu, 18 Aug 2016 16:41:35 -0700
Subject: [PATCH 2/2] sched: Make use of sysctl_sched_wake_to_idle in
 select_best_cpu

sysctl_sched_wake_to_idle is a means to allow or disallow a global
task placement preference for idle CPUs. It has been unused thus far
because a per-task flag has been preferred to control placement for
individual tasks. Using the global flag, however, allows greater
flexibility for testing and system evaluation. Incorporate
sysctl_sched_wake_to_idle into the placement policy.
Change-Id: I7e830bc914eb9c159ae18f165bc8b0278ec9af40
Signed-off-by: Syed Rameez Mustafa
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce58e2245b4b..e893b0fcac6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2992,7 +2992,7 @@ static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
 static inline int wake_to_idle(struct task_struct *p)
 {
 	return (current->flags & PF_WAKE_UP_IDLE) ||
-	       (p->flags & PF_WAKE_UP_IDLE);
+	       (p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
 }
 
 static inline bool
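
Editor's note on patch 1: the threshold logic can be illustrated outside the
kernel. The standalone C sketch below is not part of the patch; the 20 ms
window value, the load figure, and the local pct_to_real() helper are
illustrative stand-ins that mirror what set_hmp_defaults() and
sched_get_cpus_busy() do with the tunable. It shows how a percent-based knob
that may exceed 100 becomes an absolute busy-time threshold and gates the
aggregation decision.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t sched_ravg_window = 20000000ULL;  /* assumed 20 ms window, in ns */

/* Percent of the window -> absolute busy time, as set_hmp_defaults() computes */
static uint64_t pct_to_real(unsigned int pct)
{
	return (sched_ravg_window * pct) / 100;
}

int main(void)
{
	unsigned int threshold_pct = 125;        /* tunable value; may exceed 100 */
	uint64_t threshold = pct_to_real(threshold_pct);
	uint64_t total_group_load = 28000000ULL; /* summed group busy time (example) */

	/* Aggregate only when the combined group load crosses the threshold */
	bool aggregate_load = total_group_load > threshold;

	/*
	 * When aggregation is active, the patch scales the reported load
	 * against the max frequency instead of clipping it at 100% of Fcur,
	 * so the governor can ramp up faster.
	 */
	printf("threshold=%llu ns aggregate=%d\n",
	       (unsigned long long)threshold, aggregate_load);
	return 0;
}

With threshold_pct = 125 and a 20 ms window, the threshold works out to 25 ms
of busy time, so only frequency domains whose combined group load exceeds
125% of a window trigger aggregation.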
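
Editor's note on patch 2: the effect of the global knob on the wake-to-idle
decision can be shown with a minimal userspace sketch. The struct, the
PF_WAKE_UP_IDLE bit value, and main() are stand-ins; only the ||-chain mirrors
the patched predicate.

#include <stdbool.h>
#include <stdio.h>

#define PF_WAKE_UP_IDLE 0x00000002       /* placeholder bit for this sketch */

struct task { unsigned int flags; };

static unsigned int sysctl_sched_wake_to_idle;  /* global tunable */

static bool wake_to_idle(const struct task *waker, const struct task *p)
{
	/* Per-waker flag, per-wakee flag, or the new global knob */
	return (waker->flags & PF_WAKE_UP_IDLE) ||
	       (p->flags & PF_WAKE_UP_IDLE) ||
	       sysctl_sched_wake_to_idle;
}

int main(void)
{
	struct task waker = { 0 }, wakee = { 0 };

	sysctl_sched_wake_to_idle = 1;  /* global knob set via its sysctl node */
	printf("wake_to_idle=%d\n", wake_to_idle(&waker, &wakee));
	return 0;
}

Setting the global tunable makes every wakeup prefer idle CPUs regardless of
the per-task flags, which is the testing flexibility the commit message
describes.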