Merge "sched: Optimize the next top task search logic upon task migration"

Authored by Linux Build Service Account on 2016-10-19 11:18:06 -07:00; committed by Gerrit - the friendly Code Review server
commit 268d4e5d68
9 changed files with 692 additions and 133 deletions

View file

@ -356,7 +356,7 @@ extern int lockdep_tasklist_lock_is_held(void);
extern void sched_init(void);
extern void sched_init_smp(void);
extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern void init_idle(struct task_struct *idle, int cpu, bool hotplug);
extern void init_idle_bootup_task(struct task_struct *idle);
extern cpumask_var_t cpu_isolated_map;
@ -1332,11 +1332,15 @@ struct ravg {
* sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
* demand for tasks.
*
* 'curr_window' represents task's contribution to cpu busy time
* statistics (rq->curr_runnable_sum) in current window
* 'curr_window_cpu' represents task's contribution to cpu busy time on
* various CPUs in the current window
*
* 'prev_window' represents task's contribution to cpu busy time
* statistics (rq->prev_runnable_sum) in previous window
* 'prev_window_cpu' represents task's contribution to cpu busy time on
* various CPUs in the previous window
*
* 'curr_window' represents the sum of all entries in curr_window_cpu
*
* 'prev_window' represents the sum of all entries in prev_window_cpu
*
* 'pred_demand' represents task's current predicted cpu busy time
*
@ -1346,6 +1350,7 @@ struct ravg {
u64 mark_start;
u32 sum, demand;
u32 sum_history[RAVG_HIST_SIZE_MAX];
u32 *curr_window_cpu, *prev_window_cpu;
u32 curr_window, prev_window;
u16 active_windows;
u32 pred_demand;
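
The two new pointer arrays are sized nr_cpu_ids and are meant to stay consistent with the scalar totals documented above: curr_window should always equal the sum of curr_window_cpu[] and prev_window the sum of prev_window_cpu[]. Purely as an illustration of that invariant (not part of the patch; the helper name is invented and the caller is assumed to hold the task's rq lock), a debug check could look like:

	/* Illustrative sketch only: verify the documented sum invariant */
	static void check_ravg_window_sums(struct task_struct *p)
	{
		u32 curr_sum = 0, prev_sum = 0;
		int i;

		for (i = 0; i < nr_cpu_ids; i++) {
			curr_sum += p->ravg.curr_window_cpu[i];
			prev_sum += p->ravg.prev_window_cpu[i];
		}

		WARN_ON(curr_sum != p->ravg.curr_window);
		WARN_ON(prev_sum != p->ravg.prev_window);
	}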

View file

@ -9,6 +9,9 @@
#define DECLARE_BITMAP(name,bits) \
unsigned long name[BITS_TO_LONGS(bits)]
#define DECLARE_BITMAP_ARRAY(name,nr,bits) \
unsigned long name[nr][BITS_TO_LONGS(bits)]
typedef __u32 __kernel_dev_t;
typedef __kernel_fd_set fd_set;
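
For reference, DECLARE_BITMAP_ARRAY() above is what the scheduler changes later in this commit use for the per-runqueue top-task bitmaps. As an illustrative expansion only (the constants come from kernel/sched/sched.h in this same commit):

	/* DECLARE_BITMAP_ARRAY(top_tasks_bitmap, NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES), */
	/* with NUM_TRACKED_WINDOWS == 2 and NUM_LOAD_INDICES == 1000, expands to:        */
	unsigned long top_tasks_bitmap[2][BITS_TO_LONGS(1000)];
	/* i.e. two rows of 16 longs (1024 bits each) on a 64-bit kernel, which leaves    */
	/* room for the sentinel bit that clear_top_tasks_bitmap() sets at bit 1000.      */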

View file

@ -260,6 +260,30 @@ TRACE_EVENT(sched_set_boost,
TP_printk("ref_count=%d", __entry->ref_count)
);
#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP)
static inline void __window_data(u32 *dst, u32 *src)
{
if (src)
memcpy(dst, src, nr_cpu_ids * sizeof(u32));
else
memset(dst, 0, nr_cpu_ids * sizeof(u32));
}
struct trace_seq;
const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len)
{
int i;
const char *ret = p->buffer + seq_buf_used(&p->seq);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%u ", buf[i]);
trace_seq_putc(p, 0);
return ret;
}
#endif
TRACE_EVENT(sched_update_task_ravg,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
@ -288,13 +312,17 @@ TRACE_EVENT(sched_update_task_ravg,
__field( u64, rq_ps )
__field( u64, grp_cs )
__field( u64, grp_ps )
__field( u64, grp_nt_cs )
__field( u64, grp_nt_ps )
__field( u64, grp_nt_cs )
__field( u64, grp_nt_ps )
__field( u32, curr_window )
__field( u32, prev_window )
__dynamic_array(u32, curr_sum, nr_cpu_ids )
__dynamic_array(u32, prev_sum, nr_cpu_ids )
__field( u64, nt_cs )
__field( u64, nt_ps )
__field( u32, active_windows )
__field( u8, curr_top )
__field( u8, prev_top )
),
TP_fast_assign(
@ -321,22 +349,30 @@ TRACE_EVENT(sched_update_task_ravg,
__entry->grp_nt_ps = cpu_time ? cpu_time->nt_prev_runnable_sum : 0;
__entry->curr_window = p->ravg.curr_window;
__entry->prev_window = p->ravg.prev_window;
__window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu);
__window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu);
__entry->nt_cs = rq->nt_curr_runnable_sum;
__entry->nt_ps = rq->nt_prev_runnable_sum;
__entry->active_windows = p->ravg.active_windows;
__entry->curr_top = rq->curr_top;
__entry->prev_top = rq->prev_top;
),
TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu"
, __entry->wallclock, __entry->win_start, __entry->delta,
TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u",
__entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
__entry->cur_freq, __entry->cur_pid,
__entry->pid, __entry->comm, __entry->mark_start,
__entry->delta_m, __entry->demand,
__entry->sum, __entry->irqtime, __entry->pred_demand,
__entry->rq_cs, __entry->rq_ps, __entry->curr_window,
__entry->prev_window, __entry->nt_cs, __entry->nt_ps,
__window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids),
__entry->prev_window,
__window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids),
__entry->nt_cs, __entry->nt_ps,
__entry->active_windows, __entry->grp_cs,
__entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps)
__entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps,
__entry->curr_top, __entry->prev_top)
);
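
With the helpers above, each per-CPU breakdown is emitted right after its window total as a space-separated list. As a purely illustrative example of the resulting trace line fragment on a 4-CPU system (values invented):

	... cur_window 3000000 (2000000 1000000 0 0 ) prev_window 5000000 (0 5000000 0 0 ) ...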
TRACE_EVENT(sched_get_task_cpu_cycles,

View file

@ -1684,7 +1684,7 @@ struct task_struct *fork_idle(int cpu)
task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
init_idle(task, cpu, false);
}
return task;

View file

@ -2255,13 +2255,13 @@ void __dl_clear_params(struct task_struct *p)
void sched_exit(struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
struct rq *rq = cpu_rq(cpu);
struct rq *rq;
u64 wallclock;
sched_set_group_id(p, 0);
raw_spin_lock_irqsave(&rq->lock, flags);
rq = task_rq_lock(p, &flags);
/* rq->curr == p */
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
@ -2269,11 +2269,13 @@ void sched_exit(struct task_struct *p)
reset_task_stats(p);
p->ravg.mark_start = wallclock;
p->ravg.sum_history[0] = EXITING_TASK_MARKER;
kfree(p->ravg.curr_window_cpu);
kfree(p->ravg.prev_window_cpu);
enqueue_task(rq, p, 0);
clear_ed_task(p, rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
put_cpu();
task_rq_unlock(rq, p, &flags);
}
#endif /* CONFIG_SCHED_HMP */
@ -2377,6 +2379,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
int cpu = get_cpu();
__sched_fork(clone_flags, p);
init_new_task_load(p, false);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@ -2562,7 +2565,6 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
init_new_task_load(p);
add_new_task_to_grp(p);
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
@ -5210,17 +5212,21 @@ void init_idle_bootup_task(struct task_struct *idle)
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
* @cpu: cpu the idle task belongs to
* @cpu_up: differentiate between initial boot vs hotplug
*
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
void init_idle(struct task_struct *idle, int cpu)
void init_idle(struct task_struct *idle, int cpu, bool cpu_up)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
if (!cpu_up)
init_new_task_load(idle, true);
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
@ -8009,6 +8015,22 @@ void __init sched_init(void)
rq->old_estimated_time = 0;
rq->old_busy_time_group = 0;
rq->hmp_stats.pred_demands_sum = 0;
rq->curr_table = 0;
rq->prev_top = 0;
rq->curr_top = 0;
for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
memset(&rq->load_subs[j], 0,
sizeof(struct load_subtractions));
rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
sizeof(u8), GFP_NOWAIT);
/* No other choice */
BUG_ON(!rq->top_tasks[j]);
clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]);
}
#endif
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
@ -8051,7 +8073,7 @@ void __init sched_init(void)
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
init_idle(current, smp_processor_id(), false);
calc_load_update = jiffies + LOAD_FREQ;

View file

@ -418,6 +418,7 @@ static void sched_debug_header(struct seq_file *m)
P(min_capacity);
P(max_capacity);
P(sched_ravg_window);
P(sched_load_granule);
#endif
#undef PN
#undef P

View file

@ -590,6 +590,7 @@ static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
cluster->dstate_wakeup_latency = 0;
cluster->freq_init_done = false;
raw_spin_lock_init(&cluster->load_lock);
cluster->cpus = *cpus;
cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
@ -647,6 +648,7 @@ void init_clusters(void)
{
bitmap_clear(all_cluster_ids, 0, NR_CPUS);
init_cluster.cpus = *cpu_possible_mask;
raw_spin_lock_init(&init_cluster.load_lock);
INIT_LIST_HEAD(&cluster_head);
}
@ -823,15 +825,15 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
/* Min window size (in ns) = 10ms */
#define MIN_SCHED_RAVG_WINDOW 10000000
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
@ -850,6 +852,21 @@ static DEFINE_RWLOCK(related_thread_group_lock);
#define for_each_related_thread_group(grp) \
list_for_each_entry(grp, &related_thread_groups, list)
/*
* Task load is categorized into buckets for the purpose of top task tracking.
* The entire range of load from 0 to sched_ravg_window needs to be covered
* in NUM_LOAD_INDICES buckets. Therefore the size of each bucket
* is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
* of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
* sched_load_granule.
*/
__read_mostly unsigned int sched_load_granule =
MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
/* Size of bitmaps maintained to track top tasks */
static const unsigned int top_tasks_bitmap_size =
BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
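
For concreteness, the defaults above work out as follows on a 64-bit kernel (simple arithmetic shown only as a worked example):

	sched_load_granule    = 10,000,000 ns / 1000 = 10,000 ns  (10 us per bucket)
	top_tasks_bitmap_size = BITS_TO_LONGS(1000 + 1) * sizeof(unsigned long)
	                      = 16 * 8 = 128 bytes per tracked window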
/*
* Demand aggregation for frequency purpose:
*
@ -1505,7 +1522,7 @@ static inline int invalid_value(unsigned int *data)
/*
* Handle "atomic" update of sysctl_sched_window_stats_policy,
* sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables.
* sysctl_sched_ravg_hist_size variables.
*/
int sched_window_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@ -1611,7 +1628,7 @@ unsigned int cpu_temp(int cpu)
return 0;
}
void init_new_task_load(struct task_struct *p)
void init_new_task_load(struct task_struct *p, bool idle_task)
{
int i;
u32 init_load_windows = sched_init_task_load_windows;
@ -1623,6 +1640,15 @@ void init_new_task_load(struct task_struct *p)
memset(&p->ravg, 0, sizeof(struct ravg));
p->cpu_cycles = 0;
p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_ATOMIC);
p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_ATOMIC);
/* Don't have much choice. CPU frequency would be bogus */
BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
if (idle_task)
return;
if (init_load_pct)
init_load_windows = div64_u64((u64)init_load_pct *
(u64)sched_ravg_window, 100);
@ -2161,6 +2187,174 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
p->ravg.pred_demand = new;
}
void clear_top_tasks_bitmap(unsigned long *bitmap)
{
memset(bitmap, 0, top_tasks_bitmap_size);
__set_bit(NUM_LOAD_INDICES, bitmap);
}
/*
* Special case the last index and provide a fast path for index = 0.
* Note that sched_load_granule can change underneath us if we are not
* holding any runqueue locks while calling the two functions below.
*/
static u32 __maybe_unused top_task_load(struct rq *rq)
{
int index = rq->prev_top;
u8 prev = 1 - rq->curr_table;
if (!index) {
int msb = NUM_LOAD_INDICES - 1;
if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
return 0;
else
return sched_load_granule;
} else if (index == NUM_LOAD_INDICES - 1) {
return sched_ravg_window;
} else {
return (index + 1) * sched_load_granule;
}
}
static int load_to_index(u32 load)
{
if (load < sched_load_granule)
return 0;
else if (load >= sched_ravg_window)
return NUM_LOAD_INDICES - 1;
else
return load / sched_load_granule;
}
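
A quick worked example with the default 10 ms window (granule = 10,000 ns), purely for illustration:

	load_to_index(9999)     == 0      /* below one granule               */
	load_to_index(4570000)  == 457    /* 4.57 ms of window busy time     */
	load_to_index(10000000) == 999    /* saturates at NUM_LOAD_INDICES-1 */

If index 457 ends up as rq->prev_top, top_task_load() reports the upper edge of that bucket, (457 + 1) * 10,000 = 4,580,000 ns; the last index is special-cased to return the full sched_ravg_window, and index 0 returns either 0 or one granule depending on the bitmap.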
static void update_top_tasks(struct task_struct *p, struct rq *rq,
u32 old_curr_window, int new_window, bool full_window)
{
u8 curr = rq->curr_table;
u8 prev = 1 - curr;
u8 *curr_table = rq->top_tasks[curr];
u8 *prev_table = rq->top_tasks[prev];
int old_index, new_index, update_index;
u32 curr_window = p->ravg.curr_window;
u32 prev_window = p->ravg.prev_window;
bool zero_index_update;
if (old_curr_window == curr_window && !new_window)
return;
old_index = load_to_index(old_curr_window);
new_index = load_to_index(curr_window);
if (!new_window) {
zero_index_update = !old_curr_window && curr_window;
if (old_index != new_index || zero_index_update) {
if (old_curr_window)
curr_table[old_index] -= 1;
if (curr_window)
curr_table[new_index] += 1;
if (new_index > rq->curr_top)
rq->curr_top = new_index;
}
if (!curr_table[old_index])
__clear_bit(NUM_LOAD_INDICES - old_index - 1,
rq->top_tasks_bitmap[curr]);
if (curr_table[new_index] == 1)
__set_bit(NUM_LOAD_INDICES - new_index - 1,
rq->top_tasks_bitmap[curr]);
return;
}
/*
* The window has rolled over for this task. By the time we get
* here, curr/prev swaps would have already occurred. So we need
* to use prev_window for the new index.
*/
update_index = load_to_index(prev_window);
if (full_window) {
/*
* Two cases here. Either 'p' ran for the entire window or
* it didn't run at all. In either case there is no entry
* in the prev table. If 'p' ran the entire window, we just
* need to create a new entry in the prev table. In this case
* update_index will correspond to sched_ravg_window
* so we can unconditionally update the top index.
*/
if (prev_window) {
prev_table[update_index] += 1;
rq->prev_top = update_index;
}
if (prev_table[update_index] == 1)
__set_bit(NUM_LOAD_INDICES - update_index - 1,
rq->top_tasks_bitmap[prev]);
} else {
zero_index_update = !old_curr_window && prev_window;
if (old_index != update_index || zero_index_update) {
if (old_curr_window)
prev_table[old_index] -= 1;
prev_table[update_index] += 1;
if (update_index > rq->prev_top)
rq->prev_top = update_index;
if (!prev_table[old_index])
__clear_bit(NUM_LOAD_INDICES - old_index - 1,
rq->top_tasks_bitmap[prev]);
if (prev_table[update_index] == 1)
__set_bit(NUM_LOAD_INDICES - update_index - 1,
rq->top_tasks_bitmap[prev]);
}
}
if (curr_window) {
curr_table[new_index] += 1;
if (new_index > rq->curr_top)
rq->curr_top = new_index;
if (curr_table[new_index] == 1)
__set_bit(NUM_LOAD_INDICES - new_index - 1,
rq->top_tasks_bitmap[curr]);
}
}
static inline void clear_top_tasks_table(u8 *table)
{
memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
}
static u32 empty_windows[NR_CPUS];
static void rollover_task_window(struct task_struct *p, bool full_window)
{
u32 *curr_cpu_windows = empty_windows;
u32 curr_window;
int i;
/* Rollover the sum */
curr_window = 0;
if (!full_window) {
curr_window = p->ravg.curr_window;
curr_cpu_windows = p->ravg.curr_window_cpu;
}
p->ravg.prev_window = curr_window;
p->ravg.curr_window = 0;
/* Roll over individual CPU contributions */
for (i = 0; i < nr_cpu_ids; i++) {
p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
p->ravg.curr_window_cpu[i] = 0;
}
}
/*
* Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
*/
@ -2181,6 +2375,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
int prev_sum_reset = 0;
bool new_task;
struct related_thread_group *grp;
int cpu = rq->cpu;
u32 old_curr_window;
new_window = mark_start < window_start;
if (new_window) {
@ -2240,57 +2436,43 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks.
*/
if (new_window && !is_idle_task(p) && !exiting_task(p)) {
u32 curr_window = 0;
if (!is_idle_task(p) && !exiting_task(p)) {
old_curr_window = p->ravg.curr_window;
if (!full_window)
curr_window = p->ravg.curr_window;
p->ravg.prev_window = curr_window;
p->ravg.curr_window = 0;
if (new_window)
rollover_task_window(p, full_window);
}
if (flip_counters) {
u64 curr_sum = *curr_runnable_sum;
u64 nt_curr_sum = *nt_curr_runnable_sum;
u8 curr_table = rq->curr_table;
u8 prev_table = 1 - curr_table;
int curr_top = rq->curr_top;
if (prev_sum_reset)
clear_top_tasks_table(rq->top_tasks[prev_table]);
clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
if (prev_sum_reset) {
curr_sum = nt_curr_sum = 0;
curr_top = 0;
clear_top_tasks_table(rq->top_tasks[curr_table]);
clear_top_tasks_bitmap(
rq->top_tasks_bitmap[curr_table]);
}
*prev_runnable_sum = curr_sum;
*nt_prev_runnable_sum = nt_curr_sum;
*curr_runnable_sum = 0;
*nt_curr_runnable_sum = 0;
rq->curr_table = prev_table;
rq->prev_top = curr_top;
rq->curr_top = 0;
}
if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
/*
* account_busy_for_cpu_time() = 0, so no update to the
* task's current window needs to be made. This could be
* for example
*
* - a wakeup event on a task within the current
* window (!new_window below, no action required),
* - switching to a new task from idle (PICK_NEXT_TASK)
* in a new window where irqtime is 0 and we aren't
* waiting on IO
*/
if (!new_window)
return;
/*
* A new window has started. The RQ demand must be rolled
* over if p is the current task.
*/
if (p_is_curr_task) {
/* p is idle task */
BUG_ON(p != rq->idle);
}
return;
}
if (!account_busy_for_cpu_time(rq, p, irqtime, event))
goto done;
if (!new_window) {
/*
@ -2310,10 +2492,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (new_task)
*nt_curr_runnable_sum += delta;
if (!is_idle_task(p) && !exiting_task(p))
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.curr_window += delta;
p->ravg.curr_window_cpu[cpu] += delta;
}
return;
goto done;
}
if (!p_is_curr_task) {
@ -2336,8 +2520,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* contribution to previous completed window.
*/
delta = scale_exec_time(window_start - mark_start, rq);
if (!exiting_task(p))
if (!exiting_task(p)) {
p->ravg.prev_window += delta;
p->ravg.prev_window_cpu[cpu] += delta;
}
} else {
/*
* Since at least one full window has elapsed,
@ -2345,8 +2531,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* full window (window_size).
*/
delta = scale_exec_time(window_size, rq);
if (!exiting_task(p))
if (!exiting_task(p)) {
p->ravg.prev_window = delta;
p->ravg.prev_window_cpu[cpu] = delta;
}
}
*prev_runnable_sum += delta;
@ -2359,10 +2547,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (new_task)
*nt_curr_runnable_sum += delta;
if (!exiting_task(p))
if (!exiting_task(p)) {
p->ravg.curr_window = delta;
p->ravg.curr_window_cpu[cpu] = delta;
}
return;
goto done;
}
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
@ -2386,8 +2576,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* contribution to previous completed window.
*/
delta = scale_exec_time(window_start - mark_start, rq);
if (!is_idle_task(p) && !exiting_task(p))
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.prev_window += delta;
p->ravg.prev_window_cpu[cpu] += delta;
}
} else {
/*
* Since at least one full window has elapsed,
@ -2395,8 +2587,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* full window (window_size).
*/
delta = scale_exec_time(window_size, rq);
if (!is_idle_task(p) && !exiting_task(p))
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.prev_window = delta;
p->ravg.prev_window_cpu[cpu] = delta;
}
}
/*
@ -2413,10 +2607,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (new_task)
*nt_curr_runnable_sum += delta;
if (!is_idle_task(p) && !exiting_task(p))
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.curr_window = delta;
p->ravg.curr_window_cpu[cpu] = delta;
}
return;
goto done;
}
if (irqtime) {
@ -2461,7 +2657,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
return;
}
BUG();
done:
if (!is_idle_task(p) && !exiting_task(p))
update_top_tasks(p, rq, old_curr_window,
new_window, full_window);
}
static inline u32 predict_and_update_buckets(struct rq *rq,
@ -2829,11 +3028,23 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
void reset_task_stats(struct task_struct *p)
{
u32 sum = 0;
u32 *curr_window_ptr = NULL;
u32 *prev_window_ptr = NULL;
if (exiting_task(p))
if (exiting_task(p)) {
sum = EXITING_TASK_MARKER;
} else {
curr_window_ptr = p->ravg.curr_window_cpu;
prev_window_ptr = p->ravg.prev_window_cpu;
memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
}
memset(&p->ravg, 0, sizeof(struct ravg));
p->ravg.curr_window_cpu = curr_window_ptr;
p->ravg.prev_window_cpu = prev_window_ptr;
/* Retain EXITING_TASK marker */
p->ravg.sum_history[0] = sum;
}
@ -2889,7 +3100,9 @@ static void reset_all_task_stats(void)
read_lock(&tasklist_lock);
do_each_thread(g, p) {
raw_spin_lock(&p->pi_lock);
reset_task_stats(p);
raw_spin_unlock(&p->pi_lock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
}
@ -2934,7 +3147,7 @@ const char *sched_window_reset_reasons[] = {
/* Called with IRQs enabled */
void reset_all_window_stats(u64 window_start, unsigned int window_size)
{
int cpu;
int cpu, i;
unsigned long flags;
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
@ -2968,6 +3181,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
}
enable_window_stats();
@ -2979,6 +3193,16 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
rq->window_start = window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
memset(&rq->load_subs[i], 0,
sizeof(struct load_subtractions));
clear_top_tasks_table(rq->top_tasks[i]);
clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
}
rq->curr_table = 0;
rq->curr_top = 0;
rq->prev_top = 0;
reset_cpu_hmp_stats(cpu, 1);
}
@ -3011,6 +3235,39 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
sched_ktime_clock() - start_ts, reason, old, new);
}
/*
* In this function we match the accumulated subtractions with the current
* and previous windows we are operating with. Ignore any entries where
* the window start in the load_subtraction struct does not match either
* the current or the previous window. This could happen whenever CPUs
* become idle or busy with interrupts disabled for an extended period.
*/
static inline void account_load_subtractions(struct rq *rq)
{
u64 ws = rq->window_start;
u64 prev_ws = ws - sched_ravg_window;
struct load_subtractions *ls = rq->load_subs;
int i;
for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
if (ls[i].window_start == ws) {
rq->curr_runnable_sum -= ls[i].subs;
rq->nt_curr_runnable_sum -= ls[i].new_subs;
} else if (ls[i].window_start == prev_ws) {
rq->prev_runnable_sum -= ls[i].subs;
rq->nt_prev_runnable_sum -= ls[i].new_subs;
}
ls[i].subs = 0;
ls[i].new_subs = 0;
}
BUG_ON((s64)rq->prev_runnable_sum < 0);
BUG_ON((s64)rq->curr_runnable_sum < 0);
BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
}
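
As an illustrative walk-through of the matching rule described above (numbers invented): if rq->window_start is W and load_subs[0] holds {window_start = W, subs = 2 ms} while load_subs[1] was stamped two or more windows ago, then curr_runnable_sum drops by 2 ms (and nt_curr_runnable_sum by the new_subs share), the stale entry contributes nothing, and both slots have their subs/new_subs cleared so they can be reused.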
static inline void
sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
@ -3033,6 +3290,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
struct related_thread_group *grp;
u64 total_group_load = 0, total_ngload = 0;
bool aggregate_load = false;
struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus));
if (unlikely(cpus == 0))
return;
@ -3050,6 +3308,13 @@ void sched_get_cpus_busy(struct sched_load *busy,
window_size = sched_ravg_window;
/*
* We don't really need the cluster lock for this entire for loop
* block. However, there is no advantage in optimizing this as rq
* locks are held regardless and would prevent migration anyway
*/
raw_spin_lock(&cluster->load_lock);
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
@ -3057,6 +3322,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
0);
cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
account_load_subtractions(rq);
load[i] = rq->old_busy_time = rq->prev_runnable_sum;
nload[i] = rq->nt_prev_runnable_sum;
pload[i] = rq->hmp_stats.pred_demands_sum;
@ -3083,6 +3349,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
i++;
}
raw_spin_unlock(&cluster->load_lock);
for_each_related_thread_group(grp) {
for_each_cpu(cpu, query_cpus) {
/* Protected by rq_lock */
@ -3237,6 +3505,189 @@ int sched_set_window(u64 window_start, unsigned int window_size)
return 0;
}
static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
{
rq->load_subs[index].window_start = ws;
rq->load_subs[index].subs = 0;
rq->load_subs[index].new_subs = 0;
}
static int get_subtraction_index(struct rq *rq, u64 ws)
{
int i;
u64 oldest = ULLONG_MAX;
int oldest_index = 0;
for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
u64 entry_ws = rq->load_subs[i].window_start;
if (ws == entry_ws)
return i;
if (entry_ws < oldest) {
oldest = entry_ws;
oldest_index = i;
}
}
create_subtraction_entry(rq, ws, oldest_index);
return oldest_index;
}
static void update_rq_load_subtractions(int index, struct rq *rq,
u32 sub_load, bool new_task)
{
rq->load_subs[index].subs += sub_load;
if (new_task)
rq->load_subs[index].new_subs += sub_load;
}
static void update_cluster_load_subtractions(struct task_struct *p,
int cpu, u64 ws, bool new_task)
{
struct sched_cluster *cluster = cpu_cluster(cpu);
struct cpumask cluster_cpus = cluster->cpus;
u64 prev_ws = ws - sched_ravg_window;
int i;
cpumask_clear_cpu(cpu, &cluster_cpus);
raw_spin_lock(&cluster->load_lock);
for_each_cpu(i, &cluster_cpus) {
struct rq *rq = cpu_rq(i);
int index;
if (p->ravg.curr_window_cpu[i]) {
index = get_subtraction_index(rq, ws);
update_rq_load_subtractions(index, rq,
p->ravg.curr_window_cpu[i], new_task);
p->ravg.curr_window_cpu[i] = 0;
}
if (p->ravg.prev_window_cpu[i]) {
index = get_subtraction_index(rq, prev_ws);
update_rq_load_subtractions(index, rq,
p->ravg.prev_window_cpu[i], new_task);
p->ravg.prev_window_cpu[i] = 0;
}
}
raw_spin_unlock(&cluster->load_lock);
}
static inline void inter_cluster_migration_fixup
(struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
{
struct rq *dest_rq = cpu_rq(new_cpu);
struct rq *src_rq = cpu_rq(task_cpu);
if (same_freq_domain(new_cpu, task_cpu))
return;
p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
dest_rq->curr_runnable_sum += p->ravg.curr_window;
dest_rq->prev_runnable_sum += p->ravg.prev_window;
src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
if (new_task) {
dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
src_rq->nt_curr_runnable_sum -=
p->ravg.curr_window_cpu[task_cpu];
src_rq->nt_prev_runnable_sum -=
p->ravg.prev_window_cpu[task_cpu];
}
p->ravg.curr_window_cpu[task_cpu] = 0;
p->ravg.prev_window_cpu[task_cpu] = 0;
update_cluster_load_subtractions(p, task_cpu,
src_rq->window_start, new_task);
BUG_ON((s64)src_rq->prev_runnable_sum < 0);
BUG_ON((s64)src_rq->curr_runnable_sum < 0);
BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
}
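
To make the fixup above concrete, here is an invented example: a task accumulated curr_window_cpu[0] = 2 ms and curr_window_cpu[1] = 1 ms within one cluster (so curr_window = 3 ms) and now migrates from CPU1 to CPU4 in a different frequency domain. The code above then credits the destination (curr_window_cpu[4] = 3 ms, dest_rq->curr_runnable_sum += 3 ms), debits the source runqueue only for its own share (src_rq->curr_runnable_sum -= 1 ms, curr_window_cpu[1] cleared), and hands CPU0's remaining 2 ms to update_cluster_load_subtractions(), which records it in CPU0's load_subs[] under the cluster's load_lock so that account_load_subtractions() can remove it from CPU0's sums later, without this path taking CPU0's rq lock.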
static int get_top_index(unsigned long *bitmap, unsigned long old_top)
{
int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
if (index == NUM_LOAD_INDICES)
return 0;
return NUM_LOAD_INDICES - 1 - index;
}
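
The reversed bit layout is what keeps this search cheap: load index i is stored at bit NUM_LOAD_INDICES - 1 - i, so larger loads sit at lower bit positions and the top task always corresponds to the first set bit. When the old top bucket empties during migration, get_top_index() needs only a single find_next_bit() to locate the replacement, which is the search this commit's subject refers to. A worked mapping example (numbers illustrative): index 457 lives at bit 999 - 457 = 542; if find_next_bit() returns bit 600, the new top index is 999 - 600 = 399; and if no set bit is found below NUM_LOAD_INDICES, the function falls back to 0.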
static void
migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
{
int index;
int top_index;
u32 curr_window = p->ravg.curr_window;
u32 prev_window = p->ravg.prev_window;
u8 src = src_rq->curr_table;
u8 dst = dst_rq->curr_table;
u8 *src_table;
u8 *dst_table;
if (curr_window) {
src_table = src_rq->top_tasks[src];
dst_table = dst_rq->top_tasks[dst];
index = load_to_index(curr_window);
src_table[index] -= 1;
dst_table[index] += 1;
if (!src_table[index])
__clear_bit(NUM_LOAD_INDICES - index - 1,
src_rq->top_tasks_bitmap[src]);
if (dst_table[index] == 1)
__set_bit(NUM_LOAD_INDICES - index - 1,
dst_rq->top_tasks_bitmap[dst]);
if (index > dst_rq->curr_top)
dst_rq->curr_top = index;
top_index = src_rq->curr_top;
if (index == top_index && !src_table[index])
src_rq->curr_top = get_top_index(
src_rq->top_tasks_bitmap[src], top_index);
}
if (prev_window) {
src = 1 - src;
dst = 1 - dst;
src_table = src_rq->top_tasks[src];
dst_table = dst_rq->top_tasks[dst];
index = load_to_index(prev_window);
src_table[index] -= 1;
dst_table[index] += 1;
if (!src_table[index])
__clear_bit(NUM_LOAD_INDICES - index - 1,
src_rq->top_tasks_bitmap[src]);
if (dst_table[index] == 1)
__set_bit(NUM_LOAD_INDICES - index - 1,
dst_rq->top_tasks_bitmap[dst]);
if (index > dst_rq->prev_top)
dst_rq->prev_top = index;
top_index = src_rq->prev_top;
if (index == top_index && !src_table[index])
src_rq->prev_top = get_top_index(
src_rq->top_tasks_bitmap[src], top_index);
}
}
void fixup_busy_time(struct task_struct *p, int new_cpu)
{
struct rq *src_rq = task_rq(p);
@ -3246,8 +3697,6 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
int migrate_type;
struct migration_sum_data d;
bool new_task;
struct related_thread_group *grp;
@ -3281,62 +3730,55 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
new_task = is_new_task(p);
/* Protected by rq_lock */
grp = p->grp;
/*
* For frequency aggregation, we continue to do migration fixups
* even for intra cluster migrations. This is because the aggregated
* load has to be reported on a single CPU regardless.
*/
if (grp && sched_freq_aggregate) {
struct group_cpu_time *cpu_time;
migrate_type = GROUP_TO_GROUP;
/* Protected by rq_lock */
cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
d.src_rq = NULL;
d.src_cpu_time = cpu_time;
src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
/* Protected by rq_lock */
cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
d.dst_rq = NULL;
d.dst_cpu_time = cpu_time;
dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
sync_window_start(dest_rq, cpu_time);
if (p->ravg.curr_window) {
*src_curr_runnable_sum -= p->ravg.curr_window;
*dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
*src_nt_curr_runnable_sum -=
p->ravg.curr_window;
*dst_nt_curr_runnable_sum +=
p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
*src_prev_runnable_sum -= p->ravg.prev_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
*src_nt_prev_runnable_sum -=
p->ravg.prev_window;
*dst_nt_prev_runnable_sum +=
p->ravg.prev_window;
}
}
} else {
migrate_type = RQ_TO_RQ;
d.src_rq = src_rq;
d.src_cpu_time = NULL;
d.dst_rq = dest_rq;
d.dst_cpu_time = NULL;
src_curr_runnable_sum = &src_rq->curr_runnable_sum;
src_prev_runnable_sum = &src_rq->prev_runnable_sum;
src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
inter_cluster_migration_fixup(p, new_cpu,
task_cpu(p), new_task);
}
if (p->ravg.curr_window) {
*src_curr_runnable_sum -= p->ravg.curr_window;
*dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
*src_nt_curr_runnable_sum -= p->ravg.curr_window;
*dst_nt_curr_runnable_sum += p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
*src_prev_runnable_sum -= p->ravg.prev_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
*src_nt_prev_runnable_sum -= p->ravg.prev_window;
*dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
}
migrate_top_tasks(p, src_rq, dest_rq);
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
@ -3344,12 +3786,6 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
dest_rq->ed_task = p;
}
trace_sched_migration_update_sum(p, migrate_type, &d);
BUG_ON((s64)*src_prev_runnable_sum < 0);
BUG_ON((s64)*src_curr_runnable_sum < 0);
BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
done:
if (p->state == TASK_WAKING)
double_rq_unlock(src_rq, dest_rq);
@ -3501,6 +3937,9 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
struct migration_sum_data d;
int migrate_type;
int cpu = cpu_of(rq);
bool new_task = is_new_task(p);
int i;
if (!sched_freq_aggregate)
return;
@ -3511,7 +3950,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
/* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */
cpu_time = _group_cpu_time(grp, cpu_of(rq));
cpu_time = _group_cpu_time(grp, cpu);
if (event == ADD_TASK) {
sync_window_start(rq, cpu_time);
migrate_type = RQ_TO_GROUP;
@ -3528,6 +3967,19 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
*src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
*src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
if (new_task) {
*src_nt_curr_runnable_sum -=
p->ravg.curr_window_cpu[cpu];
*src_nt_prev_runnable_sum -=
p->ravg.prev_window_cpu[cpu];
}
update_cluster_load_subtractions(p, cpu,
rq->window_start, new_task);
} else {
migrate_type = GROUP_TO_RQ;
d.src_rq = NULL;
@ -3550,21 +4002,42 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
*src_curr_runnable_sum -= p->ravg.curr_window;
*src_prev_runnable_sum -= p->ravg.prev_window;
if (new_task) {
*src_nt_curr_runnable_sum -= p->ravg.curr_window;
*src_nt_prev_runnable_sum -= p->ravg.prev_window;
}
/*
* Need to reset curr/prev windows for all CPUs, not just the
* ones in the same cluster. Since inter cluster migrations
* did not result in the appropriate book keeping, the values
* per CPU would be inaccurate.
*/
for_each_possible_cpu(i) {
p->ravg.curr_window_cpu[i] = 0;
p->ravg.prev_window_cpu[i] = 0;
}
}
*src_curr_runnable_sum -= p->ravg.curr_window;
*dst_curr_runnable_sum += p->ravg.curr_window;
*src_prev_runnable_sum -= p->ravg.prev_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (is_new_task(p)) {
*src_nt_curr_runnable_sum -= p->ravg.curr_window;
if (new_task) {
*dst_nt_curr_runnable_sum += p->ravg.curr_window;
*src_nt_prev_runnable_sum -= p->ravg.prev_window;
*dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
/*
* When a task enters or exits a group, its curr and prev windows are
* moved to a single CPU. This behavior might be sub-optimal in the
* exit case; however, it saves us the overhead of handling inter
* cluster migration fixups while the task is part of a related group.
*/
p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
trace_sched_migration_update_sum(p, migrate_type, &d);
BUG_ON((s64)*src_curr_runnable_sum < 0);

View file

@ -351,13 +351,23 @@ struct cfs_bandwidth { };
#ifdef CONFIG_SCHED_HMP
#define NUM_TRACKED_WINDOWS 2
#define NUM_LOAD_INDICES 1000
struct hmp_sched_stats {
int nr_big_tasks;
u64 cumulative_runnable_avg;
u64 pred_demands_sum;
};
struct load_subtractions {
u64 window_start;
u64 subs;
u64 new_subs;
};
struct sched_cluster {
raw_spinlock_t load_lock;
struct list_head list;
struct cpumask cpus;
int id;
@ -742,6 +752,13 @@ struct rq {
u64 prev_runnable_sum;
u64 nt_curr_runnable_sum;
u64 nt_prev_runnable_sum;
struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
u8 *top_tasks[NUM_TRACKED_WINDOWS];
u8 curr_table;
int prev_top;
int curr_top;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@ -1056,8 +1073,9 @@ extern unsigned int __read_mostly sched_spill_load;
extern unsigned int __read_mostly sched_upmigrate;
extern unsigned int __read_mostly sched_downmigrate;
extern unsigned int __read_mostly sysctl_sched_spill_nr_run;
extern unsigned int __read_mostly sched_load_granule;
extern void init_new_task_load(struct task_struct *p);
extern void init_new_task_load(struct task_struct *p, bool idle_task);
extern u64 sched_ktime_clock(void);
extern int got_boost_kick(void);
extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
@ -1401,6 +1419,7 @@ extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
struct cftype *cft, u64 upmigrate_discourage);
extern void sched_hmp_parse_dt(void);
extern void init_sched_hmp_boost_policy(void);
extern void clear_top_tasks_bitmap(unsigned long *bitmap);
#else /* CONFIG_SCHED_HMP */
@ -1503,7 +1522,9 @@ static inline struct sched_cluster *rq_cluster(struct rq *rq)
return NULL;
}
static inline void init_new_task_load(struct task_struct *p) { }
static inline void init_new_task_load(struct task_struct *p, bool idle_task)
{
}
static inline u64 scale_load_to_cpu(u64 load, int cpu)
{
@ -1570,8 +1591,6 @@ static inline int update_preferred_cluster(struct related_thread_group *grp,
static inline void add_new_task_to_grp(struct task_struct *new) {}
#define sched_enable_hmp 0
#define sched_freq_legacy_mode 1
#define sched_migration_fixup 0
#define PRED_DEMAND_DELTA (0)
static inline void

View file

@ -32,7 +32,7 @@ struct task_struct *idle_thread_get(unsigned int cpu)
if (!tsk)
return ERR_PTR(-ENOMEM);
init_idle(tsk, cpu);
init_idle(tsk, cpu, true);
return tsk;
}