From 38ddcff85af052ad99e4f3f0b6e9659b0ca10dcf Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 7 Aug 2017 15:46:13 +0100 Subject: [PATCH] FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide (from https://patchwork.kernel.org/patch/9895261/) This patch adds a parameter to select_task_rq, sibling_count_hint allowing the caller, where it has this information, to inform the sched_class the number of tasks that are being woken up as part of the same event. The wake_q mechanism is one case where this information is available. select_task_rq_fair can then use the information to detect that it needs to widen the search space for task placement in order to avoid overloading the last-level cache domain's CPUs. * * * The reason I am investigating this change is the following use case on ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep until the last task finishes its X and hits the barrier). On big.LITTLE, the tasks which get a "big" CPU finish faster, and then those CPUs pull over the tasks that are still running: v CPU v ->time-> ------------- 0 (big) 11111 /333 ------------- 1 (big) 22222 /444| ------------- 2 (LITTLE) 333333/ ------------- 3 (LITTLE) 444444/ ------------- Now when task 4 hits the barrier (at |) and wakes the others up, there are 4 tasks with prev_cpu= and 0 tasks with prev_cpu=. want_affine therefore means that we'll only look in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs until the next load balance, something like this: v CPU v ->time-> ------------------------ 0 (big) 11111 /333 31313\33333 ------------------------ 1 (big) 22222 /444|424\4444444 ------------------------ 2 (LITTLE) 333333/ \222222 ------------------------ 3 (LITTLE) 444444/ \1111 ------------------------ ^^^ underutilization So, I'm trying to get want_affine = 0 for these tasks. I don't _think_ any incarnation of the wakee_flips mechanism can help us here because which task is waker and which tasks are wakees generally changes with each iteration. However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the nice property that we know exactly how many tasks are being woken, so we can cheat. It might be a disadvantage that we "widen" _every_ task that's woken in an event, while select_idle_sibling would work fine for the first sd_llc_size - 1 tasks. IIUC, if wake_affine() behaves correctly this trick wouldn't be necessary on SMP systems, so it might be best guarded by the presence of SD_ASYM_CPUCAPACITY? * * * Final note.. In order to observe "perfect" behaviour for this use case, I also had to disable the TTWU_QUEUE sched feature. Suppose during the wakeup above we are working through the work queue and have placed tasks 3 and 2, and are about to place task 1: v CPU v ->time-> -------------- 0 (big) 11111 /333 3 -------------- 1 (big) 22222 /444|4 -------------- 2 (LITTLE) 333333/ 2 -------------- 3 (LITTLE) 444444/ <- Task 1 should go here -------------- If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having instead sent a reschedule IPI) or attached its load to CPU 2. So we are likely to also place task 1 on cpu 2. Disabling TTWU_QUEUE means that we enqueue task 2 before placing task 1, solving this issue. TTWU_QUEUE is there to minimise rq lock contention, and I guess that this contention is less of an issue on big.LITTLE systems since they have relatively few CPUs, which suggests the trade-off makes sense here. Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Josef Bacik Cc: Joel Fernandes Cc: Mike Galbraith Cc: Matt Fleming --- include/linux/sched.h | 3 ++- kernel/sched/core.c | 35 +++++++++++++++++++++++------------ kernel/sched/deadline.c | 3 ++- kernel/sched/fair.c | 21 ++++++++++++++------- kernel/sched/idle_task.c | 3 ++- kernel/sched/rt.c | 3 ++- kernel/sched/sched.h | 3 ++- kernel/sched/stop_task.c | 3 ++- 8 files changed, 49 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 62e179f84aef..60af9d2fa922 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -993,12 +993,13 @@ struct wake_q_node { struct wake_q_head { struct wake_q_node *first; struct wake_q_node **lastp; + int count; }; #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) #define WAKE_Q(name) \ - struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + struct wake_q_head name = { WAKE_Q_TAIL, &name.first, 0 } extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 563f316f5330..18d607f9a417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -546,6 +546,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; + head->count++; + get_task_struct(task); /* @@ -555,6 +557,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint); + void wake_up_q(struct wake_q_head *head) { struct wake_q_node *node = head->first; @@ -569,10 +575,10 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing + * try_to_wake_up() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + try_to_wake_up(task, TASK_NORMAL, 0, head->count); put_task_struct(task); } } @@ -1642,12 +1648,14 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, + int sibling_count_hint) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1932,6 +1940,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) + * @sibling_count_hint: A hint at the number of threads that are being woken up + * in this event. * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -1943,7 +1953,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * or @state didn't match @p's state. */ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint) { unsigned long flags; int cpu, success = 0; @@ -2044,8 +2055,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, + sibling_count_hint); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -2127,13 +2138,13 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0); + return try_to_wake_up(p, TASK_NORMAL, 0, 1); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 1); } /* @@ -2467,7 +2478,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p); update_rq_clock(rq); @@ -2905,7 +2916,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3560,7 +3571,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 1); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ab1a9a99660d..cdf2a0b97611 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1070,7 +1070,8 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9bc717ef81f5..b904a023be95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5773,15 +5773,18 @@ energy_diff(struct energy_env *eenv) * being client/server, worker/dispatcher, interrupt source or whatever is * irrelevant, spread criteria is apparent partner count exceeds socket size. */ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -6754,7 +6757,8 @@ unlock: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -6762,9 +6766,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + want_affine = !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); + } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..33d7003fa1b8 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3715473fd8f8..069f8982867f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1372,7 +1372,8 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index deba080af845..a2d5de8415e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1261,7 +1261,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags, + int subling_count_hint); void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 61f852d46858..a5567ccd8803 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -12,7 +12,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* stop tasks as never migrate */ }