diff --git a/include/linux/sched.h b/include/linux/sched.h
index b82530481871..aa64a29d03be 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1241,8 +1241,39 @@ struct sched_statistics {
 };
 #endif
 
+#define RAVG_HIST_SIZE 5
+
+/* ravg represents the frequency-scaled cpu demand of a task */
+struct ravg {
+	/*
+	 * 'window_start' marks the beginning of the task's current window.
+	 *
+	 * 'mark_start' marks the beginning of an event (task waking up, task
+	 * starting to execute, task being preempted) within a window.
+	 *
+	 * 'sum' represents how runnable a task has been within the current
+	 * window. It incorporates both running time and wait time and is
+	 * frequency scaled.
+	 *
+	 * 'sum_history' keeps track of the history of 'sum' seen over the
+	 * previous RAVG_HIST_SIZE windows. Windows where the task was
+	 * entirely sleeping are ignored.
+	 *
+	 * 'demand' represents the maximum sum seen over the previous
+	 * RAVG_HIST_SIZE windows. 'demand' can drive frequency demand for tasks.
+	 */
+	u64 window_start, mark_start;
+	u32 sum, demand;
+	u32 sum_history[RAVG_HIST_SIZE];
+};
+
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
+	/*
+	 * TODO: Move ravg to 'struct task_struct', as this is common to both
+	 * real-time and non-real-time tasks.
+	 */
+	struct ravg		ravg;
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 336290e9724e..6ae7504665ad 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -40,6 +40,7 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_wake_to_idle;
+extern unsigned int sysctl_sched_ravg_window;
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
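For illustration only (not part of the patch): a standalone userspace sketch of
the bookkeeping that struct ravg enables. Each completed window contributes one
'sum' sample, samples are pushed through the sum_history ring, and 'demand' is
the maximum over the retained samples. The harness and the names ravg_sim and
push_sample are invented for this example; the push logic mirrors
update_history() in kernel/sched/core.c below for the common samples == 1 case.

#include <stdint.h>
#include <stdio.h>

#define RAVG_HIST_SIZE 5

struct ravg_sim {
	uint32_t sum_history[RAVG_HIST_SIZE];
	uint32_t demand;
};

/* Push one window's 'sum' and recompute demand as the max over history */
static void push_sample(struct ravg_sim *r, uint32_t runtime)
{
	uint32_t max = 0;
	int i;

	if (!runtime)	/* windows with no activity are ignored */
		return;

	/* shift history down by one slot, tracking the running maximum */
	for (i = RAVG_HIST_SIZE - 1; i > 0; i--) {
		r->sum_history[i] = r->sum_history[i - 1];
		if (r->sum_history[i] > max)
			max = r->sum_history[i];
	}
	r->sum_history[0] = runtime;
	if (runtime > max)
		max = runtime;

	r->demand = max;
}

int main(void)
{
	struct ravg_sim r = { { 0 }, 0 };
	uint32_t sums[] = { 10, 40, 25, 0, 30 };
	int i;

	for (i = 0; i < 5; i++) {
		push_sample(&r, sums[i]);
		printf("window %d: sum=%u demand=%u\n", i, sums[i], r.demand);
	}
	return 0;	/* demand settles at 40, the max non-zero sample */
}

Taking the maximum rather than an average lets 'demand' react immediately when
a task's busiest recent window grows, while a past spike takes RAVG_HIST_SIZE
subsequent windows of activity to age out (fully idle windows are not recorded
at all).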
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a53673cd810..3059a938045f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -860,6 +860,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 1);
+	rq->cumulative_runnable_avg += p->se.ravg.demand;
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -869,6 +870,8 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 0);
+	rq->cumulative_runnable_avg -= p->se.ravg.demand;
+	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1744,6 +1747,110 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
 	wq_worker_waking_up(p, cpu_of(rq));
 }
 
+/*
+ * Called when a new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at
+ * a stretch.
+ */
+static inline void
+update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
+{
+	u32 *hist = &p->se.ravg.sum_history[0];
+	int ridx, widx;
+	u32 max = 0;
+
+	/* Ignore windows where the task had no activity */
+	if (!runtime)
+		return;
+
+	/* Push the new 'runtime' value onto the history stack */
+	widx = RAVG_HIST_SIZE - 1;
+	ridx = widx - samples;
+	for (; ridx >= 0; --widx, --ridx) {
+		hist[widx] = hist[ridx];
+		if (hist[widx] > max)
+			max = hist[widx];
+	}
+
+	for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
+		hist[widx] = runtime;
+		if (hist[widx] > max)
+			max = hist[widx];
+	}
+
+	p->se.ravg.sum = 0;
+	if (p->on_rq) {
+		rq->cumulative_runnable_avg -= p->se.ravg.demand;
+		BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+	}
+	/*
+	 * Maximum demand seen over previous RAVG_HIST_SIZE windows drives
+	 * frequency demand for a task. Record the maximum in 'demand'.
+	 */
+	p->se.ravg.demand = max;
+	if (p->on_rq)
+		rq->cumulative_runnable_avg += p->se.ravg.demand;
+}
+
+/* Window size (in ns) */
+__read_mostly unsigned int sysctl_sched_ravg_window = 50000000;
+
+void update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum)
+{
+	u32 window_size = sysctl_sched_ravg_window;
+	int new_window;
+	u64 wallclock = sched_clock();
+
+	do {
+		s64 delta = 0;
+		int n;
+		u64 now = wallclock;
+
+		new_window = 0;
+		delta = now - p->se.ravg.window_start;
+		BUG_ON(delta < 0);
+		if (delta > window_size) {
+			p->se.ravg.window_start += window_size;
+			now = p->se.ravg.window_start;
+			new_window = 1;
+		}
+
+		if (update_sum) {
+			delta = now - p->se.ravg.mark_start;
+			BUG_ON(delta < 0);
+
+			if (likely(rq->cur_freq &&
+				   rq->cur_freq <= max_possible_freq))
+				delta = div64_u64(delta * rq->cur_freq,
+						  max_possible_freq);
+			p->se.ravg.sum += delta;
+			WARN_ON(p->se.ravg.sum > window_size);
+		}
+
+		if (!new_window)
+			break;
+
+		update_history(rq, p, p->se.ravg.sum, 1);
+
+		delta = wallclock - p->se.ravg.window_start;
+		BUG_ON(delta < 0);
+		n = div64_u64(delta, window_size);
+		if (n) {
+			if (!update_sum)
+				p->se.ravg.window_start = wallclock;
+			else
+				p->se.ravg.window_start += n * window_size;
+			BUG_ON(p->se.ravg.window_start > wallclock);
+			if (update_sum)
+				update_history(rq, p, window_size, n);
+		}
+		p->se.ravg.mark_start = p->se.ravg.window_start;
+	} while (new_window);
+
+	p->se.ravg.mark_start = wallclock;
+}
+
 /*
  * Mark the task runnable and perform wakeup-preemption.
  */
@@ -1751,6 +1858,8 @@ static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	check_preempt_curr(rq, p, wake_flags);
+
+	update_task_ravg(p, rq, 0);
 	p->state = TASK_RUNNING;
 	trace_sched_wakeup(p);
@@ -2142,6 +2251,8 @@ void __dl_clear_params(struct task_struct *p)
  */
 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
+	int i;
+
 	p->on_rq			= 0;
 
 	p->se.on_rq			= 0;
@@ -2150,6 +2261,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.ravg.sum			= 0;
+	p->se.ravg.demand		= 0;
+	p->se.ravg.window_start		= 0;
+	p->se.ravg.mark_start		= 0;
+	for (i = 0; i < RAVG_HIST_SIZE; ++i)
+		p->se.ravg.sum_history[i] = 0;
+
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2416,6 +2534,7 @@ void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
+	u64 wallclock = sched_clock();
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	/* Initialize new task's runnable average */
@@ -2431,6 +2550,8 @@ void wake_up_new_task(struct task_struct *p)
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
+	p->se.ravg.window_start = wallclock;
+	p->se.ravg.mark_start = wallclock;
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -3088,6 +3209,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 		if (unlikely(!p))
 			p = idle_sched_class.pick_next_task(rq, prev);
 
+		update_task_ravg(p, rq, 1);
 		return p;
 	}
 
@@ -3097,6 +3219,7 @@ again:
 		if (p) {
 			if (unlikely(p == RETRY_TASK))
 				goto again;
+			update_task_ravg(p, rq, 1);
 			return p;
 		}
 	}
@@ -7593,6 +7716,7 @@ void __init sched_init(void)
 		rq->cur_freq = 0;
 		rq->max_freq = 0;
 		rq->min_freq = 0;
+		rq->cumulative_runnable_avg = 0;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 		rq->cstate = 0;
 		rq->wakeup_latency = 0;
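To make the frequency-scaling step in update_task_ravg() concrete, here is a
userspace sketch (again not part of the patch; scale_exec_time is a
hypothetical name and the numbers are arbitrary). Time that elapses while the
cpu runs below max_possible_freq contributes proportionally less to 'sum', so
'sum' approximates demand relative to the fastest possible cpu. div64_u64() is
modeled here with plain 64-bit division.

#include <stdint.h>
#include <stdio.h>

/* Mirrors: delta = div64_u64(delta * rq->cur_freq, max_possible_freq) */
static uint64_t scale_exec_time(uint64_t delta_ns, uint64_t cur_freq,
				uint64_t max_possible_freq)
{
	if (cur_freq && cur_freq <= max_possible_freq)
		delta_ns = (delta_ns * cur_freq) / max_possible_freq;
	return delta_ns;
}

int main(void)
{
	/*
	 * 10ms of runnable time observed at 600000 kHz (600MHz, cpufreq
	 * reports frequencies in kHz) on a system whose fastest cpu can
	 * do 1.2GHz: credited as only 5ms of demand.
	 */
	uint64_t scaled = scale_exec_time(10000000ULL, 600000ULL, 1200000ULL);

	printf("scaled delta = %llu ns\n", (unsigned long long)scaled);
	return 0;
}

This scaling is what keeps 'sum' (and hence 'demand' and
rq->cumulative_runnable_avg) comparable across windows executed at different
cpufreq operating points.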
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4061d3f9d93d..0dbe55192ef2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -644,6 +644,7 @@ struct rq {
 #endif
 
 	int cur_freq, max_freq, min_freq;
+	u64 cumulative_runnable_avg;
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
@@ -1243,8 +1244,12 @@ struct sched_class {
 #endif
 };
 
+extern void
+update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum);
+
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
+	update_task_ravg(prev, rq, 1);
 	prev->sched_class->put_prev_task(rq, prev);
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a633cbf026f9..ff8df5e6614e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -292,6 +292,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sched_ravg_window",
+		.data		= &sysctl_sched_ravg_window,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.procname	= "sched_min_granularity_ns",
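The sysctl entry above makes the window size tunable at runtime via
/proc/sys/kernel/sched_ravg_window (in nanoseconds; the default of 50000000
set in kernel/sched/core.c is 50ms). As a usage illustration only, a minimal
userspace reader might look like:

#include <stdio.h>

int main(void)
{
	unsigned int window_ns;
	FILE *f = fopen("/proc/sys/kernel/sched_ravg_window", "r");

	/* read the current window size; fails on kernels without the patch */
	if (!f || fscanf(f, "%u", &window_ns) != 1) {
		perror("sched_ravg_window");
		return 1;
	}
	fclose(f);

	printf("ravg window: %u ns (%u ms)\n", window_ns,
	       window_ns / 1000000);
	return 0;
}

Since the entry is mode 0644 and handled by proc_dointvec, root can also
shrink or grow the window on the fly by writing a new value to the same file.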