From 29a412dffa5cbd6d7d913909cd57d04d9d5cb172 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Tue, 18 Nov 2014 13:19:39 +0530
Subject: [PATCH] sched: Avoid frequent migration of running task

Power values for cpus can drop quite considerably when they go idle.
As a result, the best choice for running a single task in a cluster
can vary quite rapidly. As the task keeps hopping cpus, other cpus go
idle and start being seen as more favorable targets for running the
task, leading to the task migrating on almost every scheduler tick!

Prevent this by keeping track of when a task started running on a cpu
and allowing task migration in the tick path (migration_needed()) on
account of energy efficiency only if the task has run sufficiently
long (as determined by the sysctl_sched_min_runtime variable).

Note that currently the sysctl_sched_min_runtime setting is considered
only in the scheduler_tick()->migration_needed() path and not in the
idle_balance() path. In other words, a task could still be migrated to
another cpu which did an idle_balance(). This limitation should not
affect the high-frequency migrations typically seen (when a single
high-demand task runs on a high-performance cpu).

CRs-Fixed: 756570
Change-Id: I96413b7a81b623193c3bbcec6f3fa9dfec367d99
Signed-off-by: Srivatsa Vaddagiri
[joonwoop@codeaurora.org: fixed conflict in set_task_cpu() and
 __schedule().]
Signed-off-by: Joonwoo Park
---
 Documentation/scheduler/sched-hmp.txt | 18 ++++++++++++++++++
 include/linux/sched.h                 |  1 +
 include/linux/sched/sysctl.h          |  1 +
 kernel/sched/core.c                   | 16 ++++++++++++++++
 kernel/sched/fair.c                   |  6 ++++++
 kernel/sysctl.c                       |  7 +++++++
 6 files changed, 49 insertions(+)

diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 8a813a3ebef4..947eae43e94b 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -1250,6 +1250,24 @@ Non-small tasks will prefer to wake up on idle CPUs if this tunable is set to
 1. If the tunable is set to 0, non-small tasks will prefer to wake up on mostly
 idle CPUs which are not completely idle, increasing task packing behavior.
 
+** 7.24 sched_min_runtime
+
+Appears at: /proc/sys/kernel/sched_min_runtime
+
+Default value: 200000000 (200ms)
+
+This tunable helps avoid frequent migration of a task on account of
+energy awareness. During the scheduler tick, a check is made (in
+migration_needed()) whether the running task needs to be migrated to a
+"better" cpu, which could offer either better performance or power. When
+deciding to migrate a task on account of power, we want to avoid
+"frequent" migration (say every tick), which could add more overhead for
+comparatively little gain. A task's 'run_start' attribute is set when it
+starts running on a cpu. migration_needed() uses this to avoid "frequent"
+migrations: once a task has been associated with a cpu (in either running
+or runnable state) for more than 'sched_min_runtime' ns, it is considered
+eligible for migration in the tick path on account of energy awareness.
+
 =========================
 8. HMP SCHEDULER TRACE POINTS
 =========================
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5398a8aea026..0876b298c76e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1453,6 +1453,7 @@ struct task_struct {
 	 * of this task
 	 */
 	u32 init_load_pct;
+	u64 run_start;
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 25bdacde2d83..0ec9fc8cd361 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,7 @@ extern unsigned int sysctl_sched_cpu_high_irqload;
 extern unsigned int sysctl_sched_freq_account_wait_time;
 extern unsigned int sysctl_sched_migration_fixup;
 extern unsigned int sysctl_sched_heavy_task_pct;
+extern unsigned int sysctl_sched_min_runtime;
 
 #if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
 extern unsigned int sysctl_sched_init_task_load_pct;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b900b2de3990..f3d385c2dac6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2538,6 +2538,16 @@ static void restore_orig_mark_start(struct task_struct *p, u64 mark_start)
 	p->ravg.mark_start = mark_start;
 }
 
+/*
+ * Note down when task started running on a cpu. This information will be handy
+ * to avoid "too" frequent task migrations for a running task on account of
+ * power.
+ */
+static inline void note_run_start(struct task_struct *p, u64 wallclock)
+{
+	p->run_start = wallclock;
+}
+
 #else /* CONFIG_SCHED_HMP */
 
 static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
@@ -2569,6 +2579,8 @@ restore_orig_mark_start(struct task_struct *p, u64 mark_start)
 {
 }
 
+static inline void note_run_start(struct task_struct *p, u64 wallclock) { }
+
 #endif /* CONFIG_SCHED_HMP */
 
 #ifdef CONFIG_SMP
@@ -2834,6 +2846,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
 
+	note_run_start(p, -1);
+
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p);
@@ -4772,6 +4786,7 @@ static void __sched notrace __schedule(bool preempt)
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+			note_run_start(prev, -1);
 			prev->on_rq = 0;
 
 			/*
@@ -4800,6 +4815,7 @@ static void __sched notrace __schedule(bool preempt)
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->clock_skip_update = 0;
+	note_run_start(next, wallclock);
 
 	BUG_ON(task_cpu(next) != cpu_of(rq));
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9ab83b5af025..8259d1af3efb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2543,6 +2543,8 @@ unsigned int __read_mostly sched_init_task_load_pelt;
 unsigned int __read_mostly sched_init_task_load_windows;
 unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
 
+unsigned int __read_mostly sysctl_sched_min_runtime = 200000000; /* 200 ms */
+
 static inline unsigned int task_load(struct task_struct *p)
 {
 	if (sched_use_pelt)
@@ -3602,6 +3604,10 @@ static int lower_power_cpu_available(struct task_struct *p, int cpu)
 	int i;
 	int lowest_power_cpu = task_cpu(p);
 	int lowest_power = power_cost(p, task_cpu(p));
+	u64 delta = sched_clock() - p->run_start;
+
+	if (delta < sysctl_sched_min_runtime)
+		return 0;
 
 	/* Is a lower-powered idle CPU available which will fit this task? */
 	for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1465fb869657..9c2719cc9cc9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -372,6 +372,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_hmp_proc_update_handler,
 	},
+	{
+		.procname	= "sched_min_runtime",
+		.data		= &sysctl_sched_min_runtime,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "sched_spill_load",
 		.data		= &sysctl_sched_spill_load_pct,
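
A note for reviewers, not part of the patch: below is a minimal standalone
userspace sketch of the min-runtime gate, in case it helps to see the
throttle in isolation. The names task_sim, now_ns() and MIN_RUNTIME_NS are
hypothetical stand-ins for p->run_start, sched_clock() and
sysctl_sched_min_runtime; the kernel hunks above are the authoritative
version.

/*
 * Sketch of the gate at the top of lower_power_cpu_available(): suppress
 * power-driven migration until the task has run for MIN_RUNTIME_NS.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define MIN_RUNTIME_NS 200000000ULL		/* mirrors the 200 ms default */

struct task_sim {
	uint64_t run_start;			/* set when task starts on a cpu */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* Mirrors: delta = sched_clock() - p->run_start; delta < threshold -> 0 */
static int min_runtime_elapsed(const struct task_sim *p)
{
	return now_ns() - p->run_start >= MIN_RUNTIME_NS;
}

int main(void)
{
	struct task_sim p = { .run_start = now_ns() };

	/* Freshly started: power-driven migration is suppressed. */
	printf("eligible immediately: %d\n", min_runtime_elapsed(&p));

	/* Simulate a task that has already run for 250 ms. */
	p.run_start = now_ns() - 250000000ULL;
	printf("eligible after 250 ms: %d\n", min_runtime_elapsed(&p));
	return 0;
}

Two observations, as I read the patch: set_task_cpu() and the sleep path in
__schedule() record -1 in run_start, and since the delta in
lower_power_cpu_available() uses unsigned arithmetic, that wraps to roughly
the system uptime in ns, which clears the threshold in practice; so the gate
only throttles a task actually accruing runtime on its current cpu. The
threshold itself is tunable at runtime, e.g.
'echo 100000000 > /proc/sys/kernel/sched_min_runtime' permits power-driven
migration after 100 ms instead of 200 ms.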