From eb7300e9a89edf0692fa53dbb6cb4214f9130927 Mon Sep 17 00:00:00 2001 From: Syed Rameez Mustafa Date: Mon, 9 May 2016 16:28:07 -0700 Subject: [PATCH 1/4] sched: Add per CPU load tracking for each task Keeping a track of the load footprint of each task on every CPU that it executed on gives the scheduler much more flexibility in terms of the number of frequency guidance policies. These new fields will be used in subsequent patches as we alter the load fixup mechanism upon task migration. We still need to maintain the curr/prev_window sums as they will also be required in subsequent patches as we start to track top tasks based on cumulative load. Also, we need to call init_new_task_load() for the idle task. This is an existing harmless bug as load tracking for the idle task is irrelevant. However, in this patch we are adding pointers to the ravg structure. These pointers have to be initialized even for the idle task. Finally move init_new_task_load() to sched_fork(). This was always the more appropriate place, however, following the introduction of new pointers in the ravg struct, this is necessary to avoid races with functions such as reset_all_task_stats(). Change-Id: Ib584372eb539706da4319973314e54dae04e5934 Signed-off-by: Syed Rameez Mustafa --- include/linux/sched.h | 15 ++++-- include/trace/events/sched.h | 39 +++++++++++++-- kernel/fork.c | 2 +- kernel/sched/core.c | 24 ++++++---- kernel/sched/hmp.c | 92 +++++++++++++++++++++++++++++------- kernel/sched/sched.h | 6 ++- kernel/smpboot.c | 2 +- 7 files changed, 141 insertions(+), 39 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a395d8a9ff73..06acefeffd4c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -356,7 +356,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern void sched_init(void); extern void sched_init_smp(void); extern asmlinkage void schedule_tail(struct task_struct *prev); -extern void init_idle(struct task_struct *idle, int cpu); +extern void init_idle(struct task_struct *idle, int cpu, bool hotplug); extern void init_idle_bootup_task(struct task_struct *idle); extern cpumask_var_t cpu_isolated_map; @@ -1332,11 +1332,15 @@ struct ravg { * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency * demand for tasks. 
* - * 'curr_window' represents task's contribution to cpu busy time - * statistics (rq->curr_runnable_sum) in current window + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window * - * 'prev_window' represents task's contribution to cpu busy time - * statistics (rq->prev_runnable_sum) in previous window + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu * * 'pred_demand' represents task's current predicted cpu busy time * @@ -1346,6 +1350,7 @@ struct ravg { u64 mark_start; u32 sum, demand; u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 *curr_window_cpu, *prev_window_cpu; u32 curr_window, prev_window; u16 active_windows; u32 pred_demand; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index daf69b7df534..209355c66e02 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -260,6 +260,30 @@ TRACE_EVENT(sched_set_boost, TP_printk("ref_count=%d", __entry->ref_count) ); +#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP) +static inline void __window_data(u32 *dst, u32 *src) +{ + if (src) + memcpy(dst, src, nr_cpu_ids * sizeof(u32)); + else + memset(dst, 0, nr_cpu_ids * sizeof(u32)); +} + +struct trace_seq; +const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + seq_buf_used(&p->seq); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%u ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +#endif + TRACE_EVENT(sched_update_task_ravg, TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, @@ -288,10 +312,12 @@ TRACE_EVENT(sched_update_task_ravg, __field( u64, rq_ps ) __field( u64, grp_cs ) __field( u64, grp_ps ) - __field( u64, grp_nt_cs ) - __field( u64, grp_nt_ps ) + __field( u64, grp_nt_cs ) + __field( u64, grp_nt_ps ) __field( u32, curr_window ) __field( u32, prev_window ) + __dynamic_array(u32, curr_sum, nr_cpu_ids ) + __dynamic_array(u32, prev_sum, nr_cpu_ids ) __field( u64, nt_cs ) __field( u64, nt_ps ) __field( u32, active_windows ) @@ -321,12 +347,14 @@ TRACE_EVENT(sched_update_task_ravg, __entry->grp_nt_ps = cpu_time ? 
cpu_time->nt_prev_runnable_sum : 0; __entry->curr_window = p->ravg.curr_window; __entry->prev_window = p->ravg.prev_window; + __window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu); + __window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu); __entry->nt_cs = rq->nt_curr_runnable_sum; __entry->nt_ps = rq->nt_prev_runnable_sum; __entry->active_windows = p->ravg.active_windows; ), - TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu" + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu" , __entry->wallclock, __entry->win_start, __entry->delta, task_event_names[__entry->evt], __entry->cpu, __entry->cur_freq, __entry->cur_pid, @@ -334,7 +362,10 @@ TRACE_EVENT(sched_update_task_ravg, __entry->delta_m, __entry->demand, __entry->sum, __entry->irqtime, __entry->pred_demand, __entry->rq_cs, __entry->rq_ps, __entry->curr_window, - __entry->prev_window, __entry->nt_cs, __entry->nt_ps, + __window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids), + __entry->prev_window, + __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), + __entry->nt_cs, __entry->nt_ps, __entry->active_windows, __entry->grp_cs, __entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps) ); diff --git a/kernel/fork.c b/kernel/fork.c index e89d0bae6f20..8a5962276788 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1684,7 +1684,7 @@ struct task_struct *fork_idle(int cpu) task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); - init_idle(task, cpu); + init_idle(task, cpu, false); } return task; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 024fb1007c78..01bc9edc8b81 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2255,13 +2255,13 @@ void __dl_clear_params(struct task_struct *p) void sched_exit(struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); + struct rq *rq; u64 wallclock; sched_set_group_id(p, 0); - raw_spin_lock_irqsave(&rq->lock, flags); + rq = task_rq_lock(p, &flags); + /* rq->curr == p */ wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); @@ -2269,11 +2269,13 @@ void sched_exit(struct task_struct *p) reset_task_stats(p); p->ravg.mark_start = wallclock; p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + enqueue_task(rq, p, 0); clear_ed_task(p, rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); - - put_cpu(); + task_rq_unlock(rq, p, &flags); } #endif /* CONFIG_SCHED_HMP */ @@ -2377,6 +2379,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) int cpu = get_cpu(); __sched_fork(clone_flags, p); + init_new_task_load(p, false); /* * We mark the process as running here. 
This guarantees that * nobody will actually run it, and a signal or other external @@ -2562,7 +2565,6 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); - init_new_task_load(p); add_new_task_to_grp(p); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); @@ -5210,17 +5212,21 @@ void init_idle_bootup_task(struct task_struct *idle) * init_idle - set up an idle thread for a given CPU * @idle: task in question * @cpu: cpu the idle task belongs to + * @cpu_up: differentiate between initial boot vs hotplug * * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu, bool cpu_up) { struct rq *rq = cpu_rq(cpu); unsigned long flags; __sched_fork(0, idle); + if (!cpu_up) + init_new_task_load(idle, true); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); @@ -8051,7 +8057,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ - init_idle(current, smp_processor_id()); + init_idle(current, smp_processor_id(), false); calc_load_update = jiffies + LOAD_FREQ; diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 3d5de8ba70a2..6ede7a224430 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -1611,7 +1611,7 @@ unsigned int cpu_temp(int cpu) return 0; } -void init_new_task_load(struct task_struct *p) +void init_new_task_load(struct task_struct *p, bool idle_task) { int i; u32 init_load_windows = sched_init_task_load_windows; @@ -1623,6 +1623,15 @@ void init_new_task_load(struct task_struct *p) memset(&p->ravg, 0, sizeof(struct ravg)); p->cpu_cycles = 0; + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_ATOMIC); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_ATOMIC); + + /* Don't have much choice. CPU frequency would be bogus */ + BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu); + + if (idle_task) + return; + if (init_load_pct) init_load_windows = div64_u64((u64)init_load_pct * (u64)sched_ravg_window, 100); @@ -2161,6 +2170,32 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) p->ravg.pred_demand = new; } +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + /* * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) */ @@ -2181,6 +2216,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, int prev_sum_reset = 0; bool new_task; struct related_thread_group *grp; + int cpu = rq->cpu; new_window = mark_start < window_start; if (new_window) { @@ -2240,15 +2276,9 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * Handle per-task window rollover. We don't care about the idle * task or exiting tasks. 
*/ - if (new_window && !is_idle_task(p) && !exiting_task(p)) { - u32 curr_window = 0; + if (new_window && !is_idle_task(p) && !exiting_task(p)) + rollover_task_window(p, full_window); - if (!full_window) - curr_window = p->ravg.curr_window; - - p->ravg.prev_window = curr_window; - p->ravg.curr_window = 0; - } if (flip_counters) { u64 curr_sum = *curr_runnable_sum; @@ -2310,8 +2340,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, if (new_task) *nt_curr_runnable_sum += delta; - if (!is_idle_task(p) && !exiting_task(p)) + if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } return; } @@ -2336,8 +2368,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * contribution to previous completed window. */ delta = scale_exec_time(window_start - mark_start, rq); - if (!exiting_task(p)) + if (!exiting_task(p)) { p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } } else { /* * Since at least one full window has elapsed, @@ -2345,8 +2379,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * full window (window_size). */ delta = scale_exec_time(window_size, rq); - if (!exiting_task(p)) + if (!exiting_task(p)) { p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } } *prev_runnable_sum += delta; @@ -2359,8 +2395,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, if (new_task) *nt_curr_runnable_sum += delta; - if (!exiting_task(p)) + if (!exiting_task(p)) { p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } return; } @@ -2386,8 +2424,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * contribution to previous completed window. */ delta = scale_exec_time(window_start - mark_start, rq); - if (!is_idle_task(p) && !exiting_task(p)) + if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } } else { /* * Since at least one full window has elapsed, @@ -2395,8 +2435,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * full window (window_size). 
*/ delta = scale_exec_time(window_size, rq); - if (!is_idle_task(p) && !exiting_task(p)) + if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } } /* @@ -2413,8 +2455,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, if (new_task) *nt_curr_runnable_sum += delta; - if (!is_idle_task(p) && !exiting_task(p)) + if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } return; } @@ -2829,11 +2873,23 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) void reset_task_stats(struct task_struct *p) { u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; - if (exiting_task(p)) + if (exiting_task(p)) { sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + /* Retain EXITING_TASK marker */ p->ravg.sum_history[0] = sum; } @@ -2889,7 +2945,9 @@ static void reset_all_task_stats(void) read_lock(&tasklist_lock); do_each_thread(g, p) { + raw_spin_lock(&p->pi_lock); reset_task_stats(p); + raw_spin_unlock(&p->pi_lock); } while_each_thread(g, p); read_unlock(&tasklist_lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 27b28369440d..f786767aa353 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1057,7 +1057,7 @@ extern unsigned int __read_mostly sched_upmigrate; extern unsigned int __read_mostly sched_downmigrate; extern unsigned int __read_mostly sysctl_sched_spill_nr_run; -extern void init_new_task_load(struct task_struct *p); +extern void init_new_task_load(struct task_struct *p, bool idle_task); extern u64 sched_ktime_clock(void); extern int got_boost_kick(void); extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb); @@ -1503,7 +1503,9 @@ static inline struct sched_cluster *rq_cluster(struct rq *rq) return NULL; } -static inline void init_new_task_load(struct task_struct *p) { } +static inline void init_new_task_load(struct task_struct *p, bool idle_task) +{ +} static inline u64 scale_load_to_cpu(u64 load, int cpu) { diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 6949476a118f..3a0415803b09 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -32,7 +32,7 @@ struct task_struct *idle_thread_get(unsigned int cpu) if (!tsk) return ERR_PTR(-ENOMEM); - init_idle(tsk, cpu); + init_idle(tsk, cpu, true); return tsk; } From 7e1a4f15b2c38ea0d0207a6fc95b721c09d6f994 Mon Sep 17 00:00:00 2001 From: Syed Rameez Mustafa Date: Thu, 19 May 2016 17:06:47 -0700 Subject: [PATCH 2/4] sched: Enhance the scheduler migration load fixup feature In the current frequency guidance implementation the scheduler migrates task load from the source CPU to the destination CPU when a task migrates. The underlying assumption is that a task will stay on the destination CPU following the migration. Hence a CPU's load should reflect the sum of all tasks that last ran on that CPU prior to window expiration even if these tasks executed on some other CPU in that window prior to being migrated. 
However, given the ubiquitous nature of migrations, the above assumption is flawed. It causes the scheduler to often accumulate, on a single CPU, load that in reality ran concurrently on multiple CPUs and will continue to run concurrently in subsequent windows. This leads to load over-reporting on a single CPU, which in turn causes CPU frequency to be higher than necessary.

This is the first patch in a series that changes how load fixups are done upon migration in order to prevent load over-reporting. In this patch, we stop doing migration fixups for intra-cluster migrations. Inter-cluster migration fixups are still retained.

In order to achieve the above, we make use of the per-CPU footprint of each task introduced in the previous patch. Upon inter-cluster migration, we go through every CPU in the source cluster to subtract the migrating task's contribution to the busy time on each one of those CPUs. The sum of those contributions is then added to the destination CPU, allowing it to ramp up to the appropriate frequency for that task.

Subtracting load from each of the source CPUs is not trivial, however, as it would require all runqueue locks to be held. To get around this we introduce a deferred load subtraction mechanism whereby subtracting load from each of the source CPUs is deferred until an opportune moment. This opportune moment is when the governor comes asking the scheduler for load. At that time, all necessary runqueue locks are already held.

There are a few cases to consider when doing deferred subtraction. Since we are not holding all runqueue locks, other CPUs in the source cluster can be in a different window than the source CPU that the task is migrating from.

Case 1: Other CPU in the source cluster is in the same window.
No special consideration.

Case 2: Other CPU in the source cluster is ahead by 1 window.
In this case, we will be doing a redundant update to the subtraction load for the previous window. There is no way to avoid this redundant update, though, without holding the rq lock.

Case 3: Other CPU in the source cluster is trailing by 1 window.
In this case, we might end up overwriting old data for that CPU. But this is not a problem because, when the other CPU calls update_task_ravg(), it will move to the same window. This relies on maintaining synchronized windows between CPUs, which is true today.

Finally, we must deal with frequency aggregation. When frequency aggregation is in effect, there is little point in dealing with the per-CPU footprint since the load of all related tasks has to be reported on a single CPU. Therefore, when a task enters a related group, we clear out all per-CPU contributions and add them to the task CPU's cpu_time struct. From that point onwards we stop managing per-CPU contributions upon inter-cluster migrations since that work is redundant. Finally, when a task exits a related group, we must walk every CPU and reset all per-CPU contributions. We then set the task CPU contribution to the respective curr/prev sum values and add that sum to the task CPU rq runnable sum.
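To make the deferred subtraction bookkeeping concrete, the following is a simplified, self-contained C sketch of the idea described above. It is an illustration only: the load_subtractions layout mirrors what this patch adds, but the demo_rq type, the new-task (nt_*) accounting and all locking are deliberately elided, and the helper names are invented for the sketch rather than taken from the kernel.

#include <stdint.h>

#define NUM_SUBTRACTION_WINDOWS	2

struct load_subtractions {
	uint64_t window_start;	/* window this entry belongs to */
	uint64_t subs;		/* load to subtract from that window */
	uint64_t new_subs;	/* portion from new tasks (ignored here) */
};

struct demo_rq {
	uint64_t window_start;
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
	struct load_subtractions load_subs[NUM_SUBTRACTION_WINDOWS];
};

/* Record, without taking the remote rq lock, load that must later be
 * subtracted from the given window on this runqueue. */
static void add_subtraction(struct demo_rq *rq, uint64_t ws, uint64_t load)
{
	int i, oldest = 0;

	for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) {
		if (rq->load_subs[i].window_start == ws) {
			rq->load_subs[i].subs += load;
			return;
		}
		if (rq->load_subs[i].window_start <
		    rq->load_subs[oldest].window_start)
			oldest = i;
	}

	/* No entry for this window yet: recycle the oldest slot. */
	rq->load_subs[oldest].window_start = ws;
	rq->load_subs[oldest].subs = load;
	rq->load_subs[oldest].new_subs = 0;
}

/* Apply the deferred subtractions once the rq lock is held, matching
 * entries against the current and previous window only. */
static void apply_subtractions(struct demo_rq *rq, uint64_t window_size)
{
	uint64_t ws = rq->window_start;
	uint64_t prev_ws = ws - window_size;
	int i;

	for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) {
		if (rq->load_subs[i].window_start == ws)
			rq->curr_runnable_sum -= rq->load_subs[i].subs;
		else if (rq->load_subs[i].window_start == prev_ws)
			rq->prev_runnable_sum -= rq->load_subs[i].subs;
		/* Entries for any other window are stale and dropped. */
		rq->load_subs[i].subs = 0;
		rq->load_subs[i].new_subs = 0;
	}
}

In the patch itself, the first half corresponds to update_cluster_load_subtractions() running under the cluster's load_lock at migration time, and the second half to account_load_subtractions() running from sched_get_cpus_busy(), where the commit message notes the runqueue locks are already held.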
Change-Id: I1f8d596e6c930f3f6f00e24109ddbe8b121f8d6b Signed-off-by: Syed Rameez Mustafa --- kernel/sched/core.c | 3 + kernel/sched/hmp.c | 292 ++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 12 +- 3 files changed, 249 insertions(+), 58 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 01bc9edc8b81..90d7ba39e4c2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8015,6 +8015,9 @@ void __init sched_init(void) rq->old_estimated_time = 0; rq->old_busy_time_group = 0; rq->hmp_stats.pred_demands_sum = 0; + for (j = 0; j < NUM_SUBTRACTION_WINDOWS; j++) + memset(&rq->load_subs[j], 0, + sizeof(struct load_subtractions)); #endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 6ede7a224430..35f4ea1761e2 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -590,6 +590,7 @@ static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) cluster->dstate_wakeup_latency = 0; cluster->freq_init_done = false; + raw_spin_lock_init(&cluster->load_lock); cluster->cpus = *cpus; cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); @@ -647,6 +648,7 @@ void init_clusters(void) { bitmap_clear(all_cluster_ids, 0, NR_CPUS); init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); INIT_LIST_HEAD(&cluster_head); } @@ -1505,7 +1507,7 @@ static inline int invalid_value(unsigned int *data) /* * Handle "atomic" update of sysctl_sched_window_stats_policy, - * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. + * sysctl_sched_ravg_hist_size variables. */ int sched_window_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -2992,7 +2994,7 @@ const char *sched_window_reset_reasons[] = { /* Called with IRQs enabled */ void reset_all_window_stats(u64 window_start, unsigned int window_size) { - int cpu; + int cpu, i; unsigned long flags; u64 start_ts = sched_ktime_clock(); int reason = WINDOW_CHANGE; @@ -3037,6 +3039,9 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) rq->window_start = window_start; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) + memset(&rq->load_subs[i], 0, + sizeof(struct load_subtractions)); reset_cpu_hmp_stats(cpu, 1); } @@ -3069,6 +3074,39 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) sched_ktime_clock() - start_ts, reason, old, new); } +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. 
+ */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + static inline void sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); @@ -3091,6 +3129,7 @@ void sched_get_cpus_busy(struct sched_load *busy, struct related_thread_group *grp; u64 total_group_load = 0, total_ngload = 0; bool aggregate_load = false; + struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus)); if (unlikely(cpus == 0)) return; @@ -3108,6 +3147,13 @@ void sched_get_cpus_busy(struct sched_load *busy, window_size = sched_ravg_window; + /* + * We don't really need the cluster lock for this entire for loop + * block. However, there is no advantage in optimizing this as rq + * locks are held regardless and would prevent migration anyways + */ + raw_spin_lock(&cluster->load_lock); + for_each_cpu(cpu, query_cpus) { rq = cpu_rq(cpu); @@ -3115,6 +3161,7 @@ void sched_get_cpus_busy(struct sched_load *busy, 0); cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + account_load_subtractions(rq); load[i] = rq->old_busy_time = rq->prev_runnable_sum; nload[i] = rq->nt_prev_runnable_sum; pload[i] = rq->hmp_stats.pred_demands_sum; @@ -3141,6 +3188,8 @@ void sched_get_cpus_busy(struct sched_load *busy, i++; } + raw_spin_unlock(&cluster->load_lock); + for_each_related_thread_group(grp) { for_each_cpu(cpu, query_cpus) { /* Protected by rq_lock */ @@ -3295,6 +3344,116 @@ int sched_set_window(u64 window_start, unsigned int window_size) return 0; } +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +static void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] 
= 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + void fixup_busy_time(struct task_struct *p, int new_cpu) { struct rq *src_rq = task_rq(p); @@ -3304,8 +3463,6 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - int migrate_type; - struct migration_sum_data d; bool new_task; struct related_thread_group *grp; @@ -3339,62 +3496,54 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) new_task = is_new_task(p); /* Protected by rq_lock */ grp = p->grp; + + /* + * For frequency aggregation, we continue to do migration fixups + * even for intra cluster migrations. This is because, the aggregated + * load has to reported on a single CPU regardless. 
+ */ if (grp && sched_freq_aggregate) { struct group_cpu_time *cpu_time; - migrate_type = GROUP_TO_GROUP; - /* Protected by rq_lock */ cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); - d.src_rq = NULL; - d.src_cpu_time = cpu_time; src_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - /* Protected by rq_lock */ cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; sync_window_start(dest_rq, cpu_time); + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window; + *dst_nt_curr_runnable_sum += + p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= + p->ravg.prev_window; + *dst_nt_prev_runnable_sum += + p->ravg.prev_window; + } + } } else { - migrate_type = RQ_TO_RQ; - d.src_rq = src_rq; - d.src_cpu_time = NULL; - d.dst_rq = dest_rq; - d.dst_cpu_time = NULL; - src_curr_runnable_sum = &src_rq->curr_runnable_sum; - src_prev_runnable_sum = &src_rq->prev_runnable_sum; - src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; - - dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; - dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; - dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; - dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); } - if (p->ravg.curr_window) { - *src_curr_runnable_sum -= p->ravg.curr_window; - *dst_curr_runnable_sum += p->ravg.curr_window; - if (new_task) { - *src_nt_curr_runnable_sum -= p->ravg.curr_window; - *dst_nt_curr_runnable_sum += p->ravg.curr_window; - } - } - - if (p->ravg.prev_window) { - *src_prev_runnable_sum -= p->ravg.prev_window; - *dst_prev_runnable_sum += p->ravg.prev_window; - if (new_task) { - *src_nt_prev_runnable_sum -= p->ravg.prev_window; - *dst_nt_prev_runnable_sum += p->ravg.prev_window; - } - } if (p == src_rq->ed_task) { src_rq->ed_task = NULL; @@ -3402,12 +3551,6 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) dest_rq->ed_task = p; } - trace_sched_migration_update_sum(p, migrate_type, &d); - BUG_ON((s64)*src_prev_runnable_sum < 0); - BUG_ON((s64)*src_curr_runnable_sum < 0); - BUG_ON((s64)*src_nt_prev_runnable_sum < 0); - BUG_ON((s64)*src_nt_curr_runnable_sum < 0); - done: if (p->state == TASK_WAKING) double_rq_unlock(src_rq, dest_rq); @@ -3559,6 +3702,9 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; struct migration_sum_data d; int migrate_type; + int cpu = cpu_of(rq); + bool new_task = is_new_task(p); + int i; if (!sched_freq_aggregate) return; @@ -3569,7 +3715,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock 
*/ - cpu_time = _group_cpu_time(grp, cpu_of(rq)); + cpu_time = _group_cpu_time(grp, cpu); if (event == ADD_TASK) { sync_window_start(rq, cpu_time); migrate_type = RQ_TO_GROUP; @@ -3586,6 +3732,19 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; + *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[cpu]; + *src_nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[cpu]; + } + + update_cluster_load_subtractions(p, cpu, + rq->window_start, new_task); + } else { migrate_type = GROUP_TO_RQ; d.src_rq = NULL; @@ -3608,21 +3767,42 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window; + *src_prev_runnable_sum -= p->ravg.prev_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + } + + /* + * Need to reset curr/prev windows for all CPUs, not just the + * ones in the same cluster. Since inter cluster migrations + * did not result in the appropriate book keeping, the values + * per CPU would be inaccurate. + */ + for_each_possible_cpu(i) { + p->ravg.curr_window_cpu[i] = 0; + p->ravg.prev_window_cpu[i] = 0; + } } - *src_curr_runnable_sum -= p->ravg.curr_window; *dst_curr_runnable_sum += p->ravg.curr_window; - - *src_prev_runnable_sum -= p->ravg.prev_window; *dst_prev_runnable_sum += p->ravg.prev_window; - - if (is_new_task(p)) { - *src_nt_curr_runnable_sum -= p->ravg.curr_window; + if (new_task) { *dst_nt_curr_runnable_sum += p->ravg.curr_window; - *src_nt_prev_runnable_sum -= p->ravg.prev_window; *dst_nt_prev_runnable_sum += p->ravg.prev_window; } + /* + * When a task enter or exits a group, it's curr and prev windows are + * moved to a single CPU. This behavior might be sub-optimal in the + * exit case, however, it saves us the overhead of handling inter + * cluster migration fixups while the task is part of a related group. 
+ */ + p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; + trace_sched_migration_update_sum(p, migrate_type, &d); BUG_ON((s64)*src_curr_runnable_sum < 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f786767aa353..c107712643dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -351,13 +351,22 @@ struct cfs_bandwidth { }; #ifdef CONFIG_SCHED_HMP +#define NUM_SUBTRACTION_WINDOWS 2 + struct hmp_sched_stats { int nr_big_tasks; u64 cumulative_runnable_avg; u64 pred_demands_sum; }; +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + struct sched_cluster { + raw_spinlock_t load_lock; struct list_head list; struct cpumask cpus; int id; @@ -742,6 +751,7 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + struct load_subtractions load_subs[NUM_SUBTRACTION_WINDOWS]; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -1572,8 +1582,6 @@ static inline int update_preferred_cluster(struct related_thread_group *grp, static inline void add_new_task_to_grp(struct task_struct *new) {} #define sched_enable_hmp 0 -#define sched_freq_legacy_mode 1 -#define sched_migration_fixup 0 #define PRED_DEMAND_DELTA (0) static inline void From 7bd09f24415cb4809973ed4f536c717b91dc0e18 Mon Sep 17 00:00:00 2001 From: Syed Rameez Mustafa Date: Tue, 31 May 2016 16:40:45 -0700 Subject: [PATCH 3/4] sched: Add the mechanics of top task tracking for frequency guidance The previous patches in this rewrite of scheduler guided frequency selection reintroduces the part-picture problem that we addressed in our initial implementation. In that, when tasks migrate across CPUs within a cluster, we end up losing the complete picture of the sequential nature of the workload. This patch aims to solve that problem slightly differently. We track the top task on every CPU within a window. Top task is defined as the task that runs the most in a given window. This enhances our ability to detect the sequential nature of workloads. A single migrating task executing for an entire window will cause 100% load to be reported for frequency guidance instead of the maximum footprint left on any individual CPU in the task's trail. There are cases, that this new approach does not address. Namely, cases where the sum of two or more tasks accurately reflects the true sequential nature of the workload. Future optimizations might aim to tackle that problem. To track top tasks, we first realize that there is no strict need to maintain the task struct itself as long as we know the load exerted by the top task. We also realize that to maintain top tasks on every CPU we have to track the execution of every single task that runs during the window. The load associated with a task needs to be migrated when the task migrates from one CPU to another. When the top task migrates away, we need to locate the second top task and so on. Given the above realizations, we use hashmaps to track top task load both for the current and the previous window. This hashmap is implemented as an array of fixed size. The key of the hashmap is given by task_execution_time_in_a_window / array_size. The size of the array (number of buckets in the hashmap) dictate the load granularity of each bucket. The value stored in each bucket is a refcount of all the tasks that executed long enough to be in that bucket. This approach has a few benefits. Firstly, any top task stats update now take O(1) time. 
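As a rough, userspace-only illustration of the bucketing scheme just described: the constants below mirror the defaults introduced by this patch (1000 buckets over a 10 ms window), while the helper names and the single flat refcount array are invented for the sketch and skip the dual curr/prev tables, the top-index tracking and the window-rollover handling done by the real code.

#define NUM_LOAD_INDICES	1000
#define SCHED_RAVG_WINDOW	10000000	/* 10 ms window, in ns */
#define LOAD_GRANULE		(SCHED_RAVG_WINDOW / NUM_LOAD_INDICES)

static unsigned char top_tasks[NUM_LOAD_INDICES];	/* refcount per bucket */

/* Map a task's execution time within the window to a bucket index. */
static int load_to_index(unsigned int load)
{
	if (load >= SCHED_RAVG_WINDOW)
		return NUM_LOAD_INDICES - 1;
	return load / LOAD_GRANULE;
}

/* O(1) bookkeeping when a task's in-window execution time changes. */
static void account_load(unsigned int old_load, unsigned int new_load)
{
	int old_index = load_to_index(old_load);
	int new_index = load_to_index(new_load);

	/* Nothing to do if the bucket and its occupancy are unchanged. */
	if (old_index == new_index && !(!old_load && new_load))
		return;

	if (old_load)
		top_tasks[old_index]--;	/* leave the old bucket */
	if (new_load)
		top_tasks[new_index]++;	/* enter the new bucket */
}

The top task's load is then reported as roughly (highest occupied bucket index + 1) * LOAD_GRANULE, which is how the patch's top_task_load() computes it, with special cases for the first and last bucket.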
While task migration is also O(1), it does still involve going through up to the size of the array to find the second top task. Further patches will aim to optimize this behavior. Secondly, and more importantly, not having to store the task struct itself saves a lot of memory usage in that 1) there is no need to retrieve task structs later causing cache misses and 2) we don't have to unnecessarily hold up task memory for up to 2 full windows by calling get_task_struct() after a task exits. Change-Id: I004dba474f41590db7d3f40d9deafe86e71359ac Signed-off-by: Syed Rameez Mustafa --- include/trace/events/sched.h | 11 +- kernel/sched/core.c | 13 +- kernel/sched/debug.c | 1 + kernel/sched/hmp.c | 261 +++++++++++++++++++++++++++++------ kernel/sched/sched.h | 10 +- 5 files changed, 250 insertions(+), 46 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 209355c66e02..cd15ae7b8b0c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -321,6 +321,8 @@ TRACE_EVENT(sched_update_task_ravg, __field( u64, nt_cs ) __field( u64, nt_ps ) __field( u32, active_windows ) + __field( u8, curr_top ) + __field( u8, prev_top ) ), TP_fast_assign( @@ -352,10 +354,12 @@ TRACE_EVENT(sched_update_task_ravg, __entry->nt_cs = rq->nt_curr_runnable_sum; __entry->nt_ps = rq->nt_prev_runnable_sum; __entry->active_windows = p->ravg.active_windows; + __entry->curr_top = rq->curr_top; + __entry->prev_top = rq->prev_top; ), - TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu" - , __entry->wallclock, __entry->win_start, __entry->delta, + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u", + __entry->wallclock, __entry->win_start, __entry->delta, task_event_names[__entry->evt], __entry->cpu, __entry->cur_freq, __entry->cur_pid, __entry->pid, __entry->comm, __entry->mark_start, @@ -367,7 +371,8 @@ TRACE_EVENT(sched_update_task_ravg, __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), __entry->nt_cs, __entry->nt_ps, __entry->active_windows, __entry->grp_cs, - __entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps) + __entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps, + __entry->curr_top, __entry->prev_top) ); TRACE_EVENT(sched_get_task_cpu_cycles, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90d7ba39e4c2..5c616517d4d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8015,9 +8015,20 @@ void __init sched_init(void) rq->old_estimated_time = 0; rq->old_busy_time_group = 0; rq->hmp_stats.pred_demands_sum = 0; - for (j = 0; j < NUM_SUBTRACTION_WINDOWS; j++) + rq->curr_table = 0; + rq->prev_top = 0; + rq->curr_top = 0; + + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions)); + + rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES, + sizeof(u8), GFP_NOWAIT); + + /* No other choice */ + BUG_ON(!rq->top_tasks[j]); + } #endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 
b6dc131f36a6..c8c4272c61d8 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -418,6 +418,7 @@ static void sched_debug_header(struct seq_file *m) P(min_capacity); P(max_capacity); P(sched_ravg_window); + P(sched_load_granule); #endif #undef PN #undef P diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 35f4ea1761e2..8675ebeebf6a 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -825,15 +825,15 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ -/* Window size (in ns) */ -__read_mostly unsigned int sched_ravg_window = 10000000; - /* Min window size (in ns) = 10ms */ #define MIN_SCHED_RAVG_WINDOW 10000000 /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW 1000000000 +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; + /* Temporarily disable window-stats activity on all cpus */ unsigned int __read_mostly sched_disable_window_stats; @@ -852,6 +852,17 @@ static DEFINE_RWLOCK(related_thread_group_lock); #define for_each_related_thread_group(grp) \ list_for_each_entry(grp, &related_thread_groups, list) +/* + * Task load is categorized into buckets for the purpose of top task tracking. + * The entire range of load from 0 to sched_ravg_window needs to be covered + * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket + * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value + * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute + * sched_load_granule. + */ +__read_mostly unsigned int sched_load_granule = + MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES; + /* * Demand aggregation for frequency purpose: * @@ -2172,6 +2183,113 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) p->ravg.pred_demand = new; } +/* + * Special case the last index and provide a fast path for index = 0. + * Note that sched_load_granule can change underneath us if we are not + * holding any runqueue locks while calling the two functions below. 
+ */ +static u32 __maybe_unused top_task_load(struct rq *rq) +{ + int index = rq->prev_top; + + if (!index) { + if (!rq->prev_runnable_sum) + return 0; + else + return sched_load_granule; + } else if (index == NUM_LOAD_INDICES - 1) { + return sched_ravg_window; + } else { + return (index + 1) * sched_load_granule; + } +} + +static int load_to_index(u32 load) +{ + if (load < sched_load_granule) + return 0; + else if (load >= sched_ravg_window) + return NUM_LOAD_INDICES - 1; + else + return load / sched_load_granule; +} + +static void update_top_tasks(struct task_struct *p, struct rq *rq, + u32 old_curr_window, int new_window, bool full_window) +{ + u8 *curr_table = rq->top_tasks[rq->curr_table]; + u8 *prev_table = rq->top_tasks[1 - rq->curr_table]; + int old_index, new_index, update_index; + u32 curr_window = p->ravg.curr_window; + u32 prev_window = p->ravg.prev_window; + bool zero_index_update; + + if (old_curr_window == curr_window && !new_window) + return; + + old_index = load_to_index(old_curr_window); + new_index = load_to_index(curr_window); + + if (!new_window) { + zero_index_update = !old_curr_window && curr_window; + if (old_index != new_index || zero_index_update) { + if (old_curr_window) + curr_table[old_index] -= 1; + if (curr_window) + curr_table[new_index] += 1; + if (new_index > rq->curr_top) + rq->curr_top = new_index; + } + + return; + } + + /* + * The window has rolled over for this task. By the time we get + * here, curr/prev swaps would has already occurred. So we need + * to use prev_window for the new index. + */ + update_index = load_to_index(prev_window); + + if (full_window) { + /* + * Two cases here. Either 'p' ran for the entire window or + * it didn't run at all. In either case there is no entry + * in the prev table. If 'p' ran the entire window, we just + * need to create a new entry in the prev table. In this case + * update_index will be correspond to sched_ravg_window + * so we can unconditionally update the top index. + */ + if (prev_window) { + prev_table[update_index] += 1; + rq->prev_top = update_index; + } + } else { + zero_index_update = !old_curr_window && prev_window; + if (old_index != update_index || zero_index_update) { + if (old_curr_window) + prev_table[old_index] -= 1; + + prev_table[update_index] += 1; + + if (update_index > rq->prev_top) + rq->prev_top = update_index; + } + } + + if (curr_window) { + curr_table[new_index] += 1; + + if (new_index > rq->curr_top) + rq->curr_top = new_index; + } +} + +static inline void clear_top_tasks_table(u8 *table) +{ + memset(table, 0, NUM_LOAD_INDICES * sizeof(u8)); +} + static u32 empty_windows[NR_CPUS]; static void rollover_task_window(struct task_struct *p, bool full_window) @@ -2219,6 +2337,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, bool new_task; struct related_thread_group *grp; int cpu = rq->cpu; + u32 old_curr_window; new_window = mark_start < window_start; if (new_window) { @@ -2278,51 +2397,40 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * Handle per-task window rollover. We don't care about the idle * task or exiting tasks. 
*/ - if (new_window && !is_idle_task(p) && !exiting_task(p)) - rollover_task_window(p, full_window); + if (!is_idle_task(p) && !exiting_task(p)) { + old_curr_window = p->ravg.curr_window; + if (new_window) + rollover_task_window(p, full_window); + } if (flip_counters) { u64 curr_sum = *curr_runnable_sum; u64 nt_curr_sum = *nt_curr_runnable_sum; + u8 curr_table = rq->curr_table; + u8 prev_table = 1 - curr_table; + int curr_top = rq->curr_top; - if (prev_sum_reset) + clear_top_tasks_table(rq->top_tasks[prev_table]); + + if (prev_sum_reset) { curr_sum = nt_curr_sum = 0; + curr_top = 0; + clear_top_tasks_table(rq->top_tasks[curr_table]); + } *prev_runnable_sum = curr_sum; *nt_prev_runnable_sum = nt_curr_sum; *curr_runnable_sum = 0; *nt_curr_runnable_sum = 0; + rq->curr_table = prev_table; + rq->prev_top = curr_top; + rq->curr_top = 0; } - if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { - /* - * account_busy_for_cpu_time() = 0, so no update to the - * task's current window needs to be made. This could be - * for example - * - * - a wakeup event on a task within the current - * window (!new_window below, no action required), - * - switching to a new task from idle (PICK_NEXT_TASK) - * in a new window where irqtime is 0 and we aren't - * waiting on IO - */ - - if (!new_window) - return; - - /* - * A new window has started. The RQ demand must be rolled - * over if p is the current task. - */ - if (p_is_curr_task) { - /* p is idle task */ - BUG_ON(p != rq->idle); - } - - return; - } + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; if (!new_window) { /* @@ -2347,7 +2455,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, p->ravg.curr_window_cpu[cpu] += delta; } - return; + goto done; } if (!p_is_curr_task) { @@ -2402,7 +2510,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, p->ravg.curr_window_cpu[cpu] = delta; } - return; + goto done; } if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { @@ -2462,7 +2570,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, p->ravg.curr_window_cpu[cpu] = delta; } - return; + goto done; } if (irqtime) { @@ -2507,7 +2615,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, return; } - BUG(); +done: + if (!is_idle_task(p) && !exiting_task(p)) + update_top_tasks(p, rq, old_curr_window, + new_window, full_window); } static inline u32 predict_and_update_buckets(struct rq *rq, @@ -3028,6 +3139,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) if (window_size) { sched_ravg_window = window_size * TICK_NSEC; set_hmp_defaults(); + sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES; } enable_window_stats(); @@ -3039,9 +3151,15 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) rq->window_start = window_start; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; - for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { memset(&rq->load_subs[i], 0, sizeof(struct load_subtractions)); + clear_top_tasks_table(rq->top_tasks[i]); + } + + rq->curr_table = 0; + rq->curr_top = 0; + rq->prev_top = 0; reset_cpu_hmp_stats(cpu, 1); } @@ -3088,7 +3206,7 @@ static inline void account_load_subtractions(struct rq *rq) struct load_subtractions *ls = rq->load_subs; int i; - for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) { + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { if (ls[i].window_start == ws) { 
rq->curr_runnable_sum -= ls[i].subs; rq->nt_curr_runnable_sum -= ls[i].new_subs; @@ -3357,7 +3475,7 @@ static bool get_subtraction_index(struct rq *rq, u64 ws) u64 oldest = ULLONG_MAX; int oldest_index = 0; - for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) { + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { u64 entry_ws = rq->load_subs[i].window_start; if (ws == entry_ws) @@ -3454,6 +3572,68 @@ static inline void inter_cluster_migration_fixup BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); } +static int find_next_top_index(u8 *tasks, int end) +{ + int i; + + if (end <= 1) + return 0; + + for (i = end - 1; i >= 0; i--) { + if (tasks[i]) + return i; + } + + return 0; +} + +static void +migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) +{ + int index; + int top_index; + u32 curr_window = p->ravg.curr_window; + u32 prev_window = p->ravg.prev_window; + u8 src = src_rq->curr_table; + u8 dst = dst_rq->curr_table; + u8 *src_table; + u8 *dst_table; + + if (curr_window) { + src_table = src_rq->top_tasks[src]; + dst_table = dst_rq->top_tasks[dst]; + index = load_to_index(curr_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (index > dst_rq->curr_top) + dst_rq->curr_top = index; + + top_index = src_rq->curr_top; + if (index == top_index && !src_table[index]) + src_rq->curr_top = + find_next_top_index(src_table, top_index); + } + + if (prev_window) { + src = 1 - src; + dst = 1 - dst; + src_table = src_rq->top_tasks[src]; + dst_table = dst_rq->top_tasks[dst]; + index = load_to_index(prev_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (index > dst_rq->prev_top) + dst_rq->prev_top = index; + + top_index = src_rq->prev_top; + if (index == top_index && !src_table[index]) + src_rq->prev_top = + find_next_top_index(src_table, top_index); + } +} + void fixup_busy_time(struct task_struct *p, int new_cpu) { struct rq *src_rq = task_rq(p); @@ -3544,6 +3724,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) task_cpu(p), new_task); } + migrate_top_tasks(p, src_rq, dest_rq); if (p == src_rq->ed_task) { src_rq->ed_task = NULL; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c107712643dc..5cbf374696ee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -351,7 +351,8 @@ struct cfs_bandwidth { }; #ifdef CONFIG_SCHED_HMP -#define NUM_SUBTRACTION_WINDOWS 2 +#define NUM_TRACKED_WINDOWS 2 +#define NUM_LOAD_INDICES 1000 struct hmp_sched_stats { int nr_big_tasks; @@ -751,7 +752,11 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; - struct load_subtractions load_subs[NUM_SUBTRACTION_WINDOWS]; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; + u8 *top_tasks[NUM_TRACKED_WINDOWS]; + u8 curr_table; + int prev_top; + int curr_top; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -1066,6 +1071,7 @@ extern unsigned int __read_mostly sched_spill_load; extern unsigned int __read_mostly sched_upmigrate; extern unsigned int __read_mostly sched_downmigrate; extern unsigned int __read_mostly sysctl_sched_spill_nr_run; +extern unsigned int __read_mostly sched_load_granule; extern void init_new_task_load(struct task_struct *p, bool idle_task); extern u64 sched_ktime_clock(void); From dc09dd60a03e083c8e9ed2a971e1fda6f1a36309 Mon Sep 17 00:00:00 2001 From: Syed Rameez Mustafa Date: Tue, 7 Jun 2016 15:18:37 -0700 Subject: [PATCH 4/4] sched: Optimize the next top task search logic upon task migration find_next_top_index() is responsible for finding the second top task on a CPU when the top task migrates 
away from that CPU. This operation is expensive as we need to iterate the entire array of top tasks to find the second top task. Optimize this by introducing bitmaps for tracking top task indices. There are two bitmaps; one for the previous window and one for the current window. Each bit in a bitmap tracks whether the corresponding bucket in the top task hashmap has a non zero refcount. The bit is set when the refcount becomes non zero and is cleared when it becomes zero. Finding the second top task upon migration is then simply a matter of finding the highest set bit in the bitmap. Change-Id: Ibafaf66eed756b0328704dfaa89c17ab0d84e359 Signed-off-by: Syed Rameez Mustafa --- include/linux/types.h | 3 ++ kernel/sched/core.c | 2 + kernel/sched/hmp.c | 86 +++++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 3 ++ 4 files changed, 78 insertions(+), 16 deletions(-) diff --git a/include/linux/types.h b/include/linux/types.h index 70dd3dfde631..9f2d2f46b459 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -9,6 +9,9 @@ #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] +#define DECLARE_BITMAP_ARRAY(name,nr,bits) \ + unsigned long name[nr][BITS_TO_LONGS(bits)] + typedef __u32 __kernel_dev_t; typedef __kernel_fd_set fd_set; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c616517d4d3..7e7e19ed53c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8028,6 +8028,8 @@ void __init sched_init(void) /* No other choice */ BUG_ON(!rq->top_tasks[j]); + + clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]); } #endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 8675ebeebf6a..dffe18ebab74 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -863,6 +863,10 @@ static DEFINE_RWLOCK(related_thread_group_lock); __read_mostly unsigned int sched_load_granule = MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES; +/* Size of bitmaps maintained to track top tasks */ +static const unsigned int top_tasks_bitmap_size = + BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long); + /* * Demand aggregation for frequency purpose: * @@ -2183,6 +2187,12 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) p->ravg.pred_demand = new; } +void clear_top_tasks_bitmap(unsigned long *bitmap) +{ + memset(bitmap, 0, top_tasks_bitmap_size); + __set_bit(NUM_LOAD_INDICES, bitmap); +} + /* * Special case the last index and provide a fast path for index = 0. 
* Note that sched_load_granule can change underneath us if we are not @@ -2191,9 +2201,12 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) static u32 __maybe_unused top_task_load(struct rq *rq) { int index = rq->prev_top; + u8 prev = 1 - rq->curr_table; if (!index) { - if (!rq->prev_runnable_sum) + int msb = NUM_LOAD_INDICES - 1; + + if (!test_bit(msb, rq->top_tasks_bitmap[prev])) return 0; else return sched_load_granule; @@ -2217,8 +2230,10 @@ static int load_to_index(u32 load) static void update_top_tasks(struct task_struct *p, struct rq *rq, u32 old_curr_window, int new_window, bool full_window) { - u8 *curr_table = rq->top_tasks[rq->curr_table]; - u8 *prev_table = rq->top_tasks[1 - rq->curr_table]; + u8 curr = rq->curr_table; + u8 prev = 1 - curr; + u8 *curr_table = rq->top_tasks[curr]; + u8 *prev_table = rq->top_tasks[prev]; int old_index, new_index, update_index; u32 curr_window = p->ravg.curr_window; u32 prev_window = p->ravg.prev_window; @@ -2241,6 +2256,14 @@ static void update_top_tasks(struct task_struct *p, struct rq *rq, rq->curr_top = new_index; } + if (!curr_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + rq->top_tasks_bitmap[curr]); + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + rq->top_tasks_bitmap[curr]); + return; } @@ -2264,6 +2287,10 @@ static void update_top_tasks(struct task_struct *p, struct rq *rq, prev_table[update_index] += 1; rq->prev_top = update_index; } + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + rq->top_tasks_bitmap[prev]); } else { zero_index_update = !old_curr_window && prev_window; if (old_index != update_index || zero_index_update) { @@ -2274,6 +2301,14 @@ static void update_top_tasks(struct task_struct *p, struct rq *rq, if (update_index > rq->prev_top) rq->prev_top = update_index; + + if (!prev_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + rq->top_tasks_bitmap[prev]); + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + rq->top_tasks_bitmap[prev]); } } @@ -2282,6 +2317,10 @@ static void update_top_tasks(struct task_struct *p, struct rq *rq, if (new_index > rq->curr_top) rq->curr_top = new_index; + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + rq->top_tasks_bitmap[curr]); } } @@ -2412,11 +2451,14 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, int curr_top = rq->curr_top; clear_top_tasks_table(rq->top_tasks[prev_table]); + clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]); if (prev_sum_reset) { curr_sum = nt_curr_sum = 0; curr_top = 0; clear_top_tasks_table(rq->top_tasks[curr_table]); + clear_top_tasks_bitmap( + rq->top_tasks_bitmap[curr_table]); } *prev_runnable_sum = curr_sum; @@ -3155,6 +3197,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) memset(&rq->load_subs[i], 0, sizeof(struct load_subtractions)); clear_top_tasks_table(rq->top_tasks[i]); + clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]); } rq->curr_table = 0; @@ -3572,19 +3615,14 @@ static inline void inter_cluster_migration_fixup BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); } -static int find_next_top_index(u8 *tasks, int end) +static int get_top_index(unsigned long *bitmap, unsigned long old_top) { - int i; + int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top); - if (end <= 1) + if (index == NUM_LOAD_INDICES) return 0; - for (i = end - 1; i >= 0; i--) { - if (tasks[i]) - return i; - } - - return 
0; + return NUM_LOAD_INDICES - 1 - index; } static void @@ -3606,13 +3644,21 @@ migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) src_table[index] -= 1; dst_table[index] += 1; + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_rq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_rq->top_tasks_bitmap[dst]); + if (index > dst_rq->curr_top) dst_rq->curr_top = index; top_index = src_rq->curr_top; if (index == top_index && !src_table[index]) - src_rq->curr_top = - find_next_top_index(src_table, top_index); + src_rq->curr_top = get_top_index( + src_rq->top_tasks_bitmap[src], top_index); } if (prev_window) { @@ -3624,13 +3670,21 @@ migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) src_table[index] -= 1; dst_table[index] += 1; + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_rq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_rq->top_tasks_bitmap[dst]); + if (index > dst_rq->prev_top) dst_rq->prev_top = index; top_index = src_rq->prev_top; if (index == top_index && !src_table[index]) - src_rq->prev_top = - find_next_top_index(src_table, top_index); + src_rq->prev_top = get_top_index( + src_rq->top_tasks_bitmap[src], top_index); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5cbf374696ee..4fd56b04c336 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -753,6 +753,8 @@ struct rq { u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; + DECLARE_BITMAP_ARRAY(top_tasks_bitmap, + NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); u8 *top_tasks[NUM_TRACKED_WINDOWS]; u8 curr_table; int prev_top; @@ -1417,6 +1419,7 @@ extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 upmigrate_discourage); extern void sched_hmp_parse_dt(void); extern void init_sched_hmp_boost_policy(void); +extern void clear_top_tasks_bitmap(unsigned long *bitmap); #else /* CONFIG_SCHED_HMP */