timer: make deferrable cpu unbound timers really not bound to a cpu

When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work() it's probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on, which is effectively the same as calling
queue_delayed_work_on(smp_processor_id()).

As a result WORK_CPU_UNBOUND work items aren't really cpu unbound now.
In fact this is perfectly fine on a UP kernel, and it also has little effect
on an SMP system without dynticks, since every CPU runs its timers
periodically.  But on SMP systems with dynticks the current implementation
makes deferrable timers poorly scalable: the timer base that queued the
deferrable timer won't wake up until the next non-deferrable timer
expires, even though other non-idle CPUs may be running and able to
handle the expired deferrable timers.

The deferrable work is a good example of the current implementation's
victim like below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0                                 CPU 1
queue_delayed_work(wq, &dwork, HZ);
    queue_delayed_work_on(WORK_CPU_UNBOUND);
        ...
	__mod_timer() -> queues timer to the
			 current cpu's timer
			 base.
	...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle.                 cpu 1 exits idle or wasn't in idle so
                                      now it's in active but won't
cpu 0 won't wake up till next         handle cpu unbound deferrable timer
non-deferrable timer expires.         as it's in cpu 0's timer base.

To make all cpu unbound deferrable timers scalable, introduce a common
timer base used only for cpu unbound deferrable timers, making them truly
cpu unbound so they can be serviced by tick_do_timer_cpu.  This common
timer base fixes the scalability issue for delayed work and for every
other user of cpu unbound deferrable timers.

Change-Id: I8b6c57d8b6445a76fa02a8cb598a8ef22aef7200
CC: Thomas Gleixner <tglx@linutronix.de>
CC: John Stultz <john.stultz@linaro.org>
CC: Tejun Heo <tj@kernel.org>
[joonwoop@codeaurora.org: timer->base replaced with CPU index so get
 the deferrable timer wheel from lock_timer_base() instead of
 do_init_timer().]
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
This commit is contained in:
Joonwoo Park 2015-05-26 12:44:42 -07:00 committed by David Keitel
parent 8b72bf241c
commit 646bf5125d
2 changed files with 63 additions and 4 deletions

View file

@ -63,6 +63,7 @@ struct timer_list {
#define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING)
#define TIMER_DEFERRABLE 0x00100000
#define TIMER_IRQSAFE 0x00200000
#define TIMER_PINNED_ON_CPU 0x00400000
#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
.entry = { .next = TIMER_ENTRY_STATIC }, \
@ -241,6 +242,8 @@ extern enum hrtimer_restart it_real_fn(struct hrtimer *);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
#include <linux/sysctl.h>
extern struct tvec_base tvec_base_deferrable;
extern unsigned int sysctl_timer_migration;
int timer_migration_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,

View file

@ -94,12 +94,15 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
static inline void __run_timers(struct tvec_base *base);
static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
unsigned int sysctl_timer_migration = 1;
struct tvec_base tvec_base_deferrable;
void timers_update_migration(bool update_nohz)
{
bool on = sysctl_timer_migration && tick_nohz_active;
@ -135,18 +138,62 @@ int timer_migration_handler(struct ctl_table *table, int write,
}
static inline struct tvec_base *get_target_base(struct tvec_base *base,
int pinned)
int pinned, u32 timer_flags)
{
if (!pinned && !(timer_flags & TIMER_PINNED_ON_CPU) &&
(timer_flags & TIMER_DEFERRABLE))
return &tvec_base_deferrable;
if (pinned || !base->migration_enabled)
return this_cpu_ptr(&tvec_bases);
return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
}
static inline void __run_deferrable_timers(void)
{
if (smp_processor_id() == tick_do_timer_cpu &&
time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies))
__run_timers(&tvec_base_deferrable);
}
/*
 * One-time setup of the global deferrable timer base.  The cpu field is
 * set to nr_cpu_ids — one past the last valid CPU index — to mark this
 * base as not belonging to any real CPU.
 */
static inline void init_timer_deferrable_global(void)
{
tvec_base_deferrable.cpu = nr_cpu_ids;
spin_lock_init(&tvec_base_deferrable.lock);
tvec_base_deferrable.timer_jiffies = jiffies;
/* No timer queued yet: next_timer starts equal to timer_jiffies. */
tvec_base_deferrable.next_timer = tvec_base_deferrable.timer_jiffies;
}
/*
 * Map a timer's flags word to the tvec_base it lives on: an unpinned
 * deferrable timer lives on the global tvec_base_deferrable; any other
 * timer lives on the per-cpu base whose index is encoded in the
 * TIMER_CPUMASK bits of its flags.
 */
static inline struct tvec_base *get_timer_base(u32 timer_flags)
{
if (!(timer_flags & TIMER_PINNED_ON_CPU) &&
timer_flags & TIMER_DEFERRABLE)
return &tvec_base_deferrable;
else
return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK);
}
#else
static inline struct tvec_base *get_target_base(struct tvec_base *base,
int pinned)
int pinned, u32 timer_flags)
{
return this_cpu_ptr(&tvec_bases);
}
/*
 * No global deferrable base on !SMP or !NO_HZ_COMMON kernels; deferrable
 * timers run from the ordinary per-cpu base, so there is nothing to do.
 */
static inline void __run_deferrable_timers(void)
{
}
static inline void init_timer_deferrable_global(void)
{
/*
 * The cpu-unbound deferrable timer base exists only with CONFIG_SMP
 * (and NO_HZ_COMMON).  A UP kernel keeps all timers, deferrable ones
 * included, on the cpu 0 timer base, so there is nothing to set up.
 */
}
/*
 * Without a global deferrable base, every timer — deferrable or not —
 * lives on the per-cpu base encoded in its TIMER_CPUMASK flag bits.
 */
static inline struct tvec_base *get_timer_base(u32 timer_flags)
{
return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK);
}
#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@ -768,7 +815,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
struct tvec_base *base;
if (!(tf & TIMER_MIGRATING)) {
base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
base = get_timer_base(tf);
spin_lock_irqsave(&base->lock, *flags);
if (timer->flags == tf)
return base;
@ -797,7 +844,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
new_base = get_target_base(base, pinned);
new_base = get_target_base(base, pinned, timer->flags);
if (base != new_base) {
/*
@ -819,6 +866,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
}
if (pinned == TIMER_PINNED)
timer->flags |= TIMER_PINNED_ON_CPU;
else
timer->flags &= ~TIMER_PINNED_ON_CPU;
timer->expires = expires;
internal_add_timer(base, timer);
@ -1000,6 +1051,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
timer->flags |= TIMER_PINNED_ON_CPU;
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
@ -1433,6 +1485,8 @@ static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = this_cpu_ptr(&tvec_bases);
__run_deferrable_timers();
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
}
@ -1656,6 +1710,8 @@ static void __init init_timer_cpus(void)
for_each_possible_cpu(cpu)
init_timer_cpu(cpu);
init_timer_deferrable_global();
}
void __init init_timers(void)