diff --git a/kernel/exit.c b/kernel/exit.c
index 07110c6020a0..92ff63200287 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,8 @@
 #include
 #include

+#include "sched/tune.h"
+
 #include
 #include
 #include
@@ -699,6 +701,9 @@ void do_exit(long code)
         }
         exit_signals(tsk);  /* sets PF_EXITING */
+
+        schedtune_exit_task(tsk);
+
         /*
          * tsk->flags are checked in the futex code to protect against
          * an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 42c45ccc1f62..f56a528c8ae7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000;
 /* cpus with isolated domains */
 cpumask_var_t cpu_isolated_map;

+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+        return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+        task_rq_unlock(rq, p, flags);
+}
+
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6390b4cb916c..a26f40cb7fd0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4234,8 +4234,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                     cpu_overutilized(rq->cpu))
                         rq->rd->overutilized = true;

-                schedtune_enqueue_task(p, cpu_of(rq));
-
                 /*
                  * We want to potentially trigger a freq switch
                  * request only for tasks that are waking up; this is
@@ -4246,6 +4244,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (task_new || task_wakeup)
                         update_capacity_of(cpu_of(rq));
         }
+
+        /* Update SchedTune accounting */
+        schedtune_enqueue_task(p, cpu_of(rq));
+
 #endif /* CONFIG_SMP */

         hrtick_update(rq);
@@ -4311,7 +4313,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

 #ifdef CONFIG_SMP
         if (!se) {
-                schedtune_dequeue_task(p, cpu_of(rq));

                 /*
                  * We want to potentially trigger a freq switch
@@ -4329,6 +4330,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 }
         }

+        /* Update SchedTune accounting */
+        schedtune_dequeue_task(p, cpu_of(rq));
+
 #endif /* CONFIG_SMP */

         hrtick_update(rq);
@@ -5604,7 +5608,6 @@ static inline int find_best_target(struct task_struct *p)
                 * The target CPU can be already at a capacity level higher
                 * than the one required to boost the task.
                 */
-
                if (new_util > capacity_orig_of(i))
                        continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a06c19c48057..144d4782a455 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
         raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }

+extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
 #ifdef CONFIG_SMP

 #ifdef CONFIG_PREEMPT
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index a691b8db2888..4c77cc23e65b 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -11,6 +11,10 @@
 #include "sched.h"
 #include "tune.h"

+#ifdef CONFIG_CGROUP_SCHEDTUNE
+static bool schedtune_initialized = false;
+#endif
+
 unsigned int sysctl_sched_cfs_boost __read_mostly;

 extern struct target_nrg schedtune_target_nrg;
@@ -222,6 +226,8 @@ struct boost_groups {
                 /* Count of RUNNABLE tasks on that boost group */
                 unsigned tasks;
         } group[BOOSTGROUPS_COUNT];
+        /* CPU's boost group locking */
+        raw_spinlock_t lock;
 };

 /* Boost groups affecting each CPU in the system */
@@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost)
         return 0;
 }

+#define ENQUEUE_TASK  1
+#define DEQUEUE_TASK -1
+
 static inline void
 schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
 {
-        struct boost_groups *bg;
-        int tasks;
-
-        bg = &per_cpu(cpu_boost_groups, cpu);
+        struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+        int tasks = bg->group[idx].tasks + task_count;

         /* Update boosted tasks count while avoiding to make it negative */
-        if (task_count < 0 && bg->group[idx].tasks <= -task_count)
-                bg->group[idx].tasks = 0;
-        else
-                bg->group[idx].tasks += task_count;
-
-        /* Boost group activation or deactivation on that RQ */
-        tasks = bg->group[idx].tasks;
-        if (tasks == 1 || tasks == 0)
-                schedtune_cpu_update(cpu);
+        bg->group[idx].tasks = max(0, tasks);

         trace_sched_tune_tasks_update(p, cpu, tasks, idx,
                         bg->group[idx].boost, bg->boost_max);

+        /* Boost group activation or deactivation on that RQ */
+        if (tasks == 1 || tasks == 0)
+                schedtune_cpu_update(cpu);
 }

 /*
@@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
  */
 void schedtune_enqueue_task(struct task_struct *p, int cpu)
 {
+        struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+        unsigned long irq_flags;
         struct schedtune *st;
         int idx;

+        if (unlikely(!schedtune_initialized))
+                return;
+
         /*
          * When a task is marked PF_EXITING by do_exit() it's going to be
          * dequeued and enqueued multiple times in the exit path.
@@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
         if (p->flags & PF_EXITING)
                 return;

-        /* Get task boost group */
+        /*
+         * Boost group accounting is protected by a per-cpu lock and requires
+         * interrupts to be disabled to avoid race conditions, for example on
+         * do_exit()::cgroup_exit() and task migration.
+         */
+        raw_spin_lock_irqsave(&bg->lock, irq_flags);
         rcu_read_lock();
+
         st = task_schedtune(p);
         idx = st->idx;
-        rcu_read_unlock();

-        schedtune_tasks_update(p, cpu, idx, 1);
+        schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+        rcu_read_unlock();
+        raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_allow_attach(struct cgroup_taskset *tset)
+{
+        /* We always allow tasks to be moved between existing CGroups */
+        return 0;
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+        struct task_struct *task;
+        struct cgroup_subsys_state *css;
+        struct boost_groups *bg;
+        unsigned long irq_flags;
+        unsigned int cpu;
+        struct rq *rq;
+        int src_bg; /* Source boost group index */
+        int dst_bg; /* Destination boost group index */
+        int tasks;
+
+        if (unlikely(!schedtune_initialized))
+                return 0;
+
+
+        cgroup_taskset_for_each(task, css, tset) {
+
+                /*
+                 * Lock the CPU's RQ the task is enqueued on to avoid race
+                 * conditions with the migration code while the task is being
+                 * accounted.
+                 */
+                rq = lock_rq_of(task, &irq_flags);
+
+                if (!task->on_rq) {
+                        unlock_rq_of(rq, task, &irq_flags);
+                        continue;
+                }
+
+                /*
+                 * Boost group accounting is protected by a per-cpu lock and
+                 * requires interrupts to be disabled to avoid race conditions on...
+                 */
+                cpu = cpu_of(rq);
+                bg = &per_cpu(cpu_boost_groups, cpu);
+                raw_spin_lock(&bg->lock);
+
+                dst_bg = css_st(css)->idx;
+                src_bg = task_schedtune(task)->idx;
+
+                /*
+                 * Current task is not changing boostgroup, which can
+                 * happen when the new hierarchy is in use.
+                 */
+                if (unlikely(dst_bg == src_bg)) {
+                        raw_spin_unlock(&bg->lock);
+                        unlock_rq_of(rq, task, &irq_flags);
+                        continue;
+                }
+
+                /*
+                 * This is the case of a RUNNABLE task which is switching its
+                 * current boost group.
+                 */
+
+                /* Move task from src to dst boost group */
+                tasks = bg->group[src_bg].tasks - 1;
+                bg->group[src_bg].tasks = max(0, tasks);
+                bg->group[dst_bg].tasks += 1;
+
+                raw_spin_unlock(&bg->lock);
+                unlock_rq_of(rq, task, &irq_flags);
+
+                /* Update CPU boost group */
+                if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+                        schedtune_cpu_update(task_cpu(task));
+
+        }
+
+        return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+        /* This can happen only if the SchedTune controller is mounted with
+         * other hierarchies and one of them fails. Since SchedTune is usually
+         * mounted on its own hierarchy, for the time being we do not implement
+         * a proper rollback mechanism. */
+        WARN(1, "SchedTune cancel attach not implemented");
 }

 /*
@@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
  */
 void schedtune_dequeue_task(struct task_struct *p, int cpu)
 {
+        struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+        unsigned long irq_flags;
         struct schedtune *st;
         int idx;

+        if (unlikely(!schedtune_initialized))
+                return;
+
         /*
          * When a task is marked PF_EXITING by do_exit() it's going to be
          * dequeued and enqueued multiple times in the exit path.
          * Thus we avoid any further update, since we do not want to change
          * CPU boosting while the task is exiting.
-         * The last dequeue will be done by cgroup exit() callback.
+         * The last dequeue is already enforced by the do_exit() code path
+         * via schedtune_exit_task().
         */
        if (p->flags & PF_EXITING)
                return;

-        /* Get task boost group */
+        /*
+         * Boost group accounting is protected by a per-cpu lock and requires
+         * interrupts to be disabled to avoid race conditions on...
+         */
+        raw_spin_lock_irqsave(&bg->lock, irq_flags);
         rcu_read_lock();
+
         st = task_schedtune(p);
         idx = st->idx;
-        rcu_read_unlock();

-        schedtune_tasks_update(p, cpu, idx, -1);
+        schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+        rcu_read_unlock();
+        raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+        struct schedtune *st;
+        unsigned long irq_flags;
+        unsigned int cpu;
+        struct rq *rq;
+        int idx;
+
+        if (unlikely(!schedtune_initialized))
+                return;
+
+        rq = lock_rq_of(tsk, &irq_flags);
+        rcu_read_lock();
+
+        cpu = cpu_of(rq);
+        st = task_schedtune(tsk);
+        idx = st->idx;
+        schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+        rcu_read_unlock();
+        unlock_rq_of(rq, tsk, &irq_flags);
 }

 int schedtune_cpu_boost(int cpu)
@@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css)
 struct cgroup_subsys schedtune_cgrp_subsys = {
         .css_alloc      = schedtune_css_alloc,
         .css_free       = schedtune_css_free,
+//      .allow_attach   = schedtune_allow_attach,
+        .can_attach     = schedtune_can_attach,
+        .cancel_attach  = schedtune_cancel_attach,
         .legacy_cftypes = files,
         .early_init     = 1,
 };
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
index 99637758a8af..be1785eb1c5b 100644
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -17,6 +17,8 @@ struct target_nrg {
 int schedtune_cpu_boost(int cpu);
 int schedtune_task_boost(struct task_struct *tsk);

+void schedtune_exit_task(struct task_struct *tsk);
+
 void schedtune_enqueue_task(struct task_struct *p, int cpu);
 void schedtune_dequeue_task(struct task_struct *p, int cpu);

@@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu);
 #define schedtune_cpu_boost(cpu)  get_sysctl_sched_cfs_boost()
 #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()

+#define schedtune_exit_task(task) do { } while (0)
+
 #define schedtune_enqueue_task(task, cpu) do { } while (0)
 #define schedtune_dequeue_task(task, cpu) do { } while (0)

@@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta,
 #define schedtune_cpu_boost(cpu)  0
 #define schedtune_task_boost(tsk) 0

+#define schedtune_exit_task(task) do { } while (0)
+
 #define schedtune_enqueue_task(task, cpu) do { } while (0)
 #define schedtune_dequeue_task(task, cpu) do { } while (0)
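
Note on the pieces not shown in these hunks: the new per-CPU bg->lock and the schedtune_initialized flag are only used above; where they are initialized is outside this diff. A minimal sketch of the kind of one-time setup the locking scheme assumes (the function name and call site are illustrative, not taken from the patch):

        /*
         * Illustrative sketch only -- not part of the diff above. Assumed to
         * live in kernel/sched/tune.c, where cpu_boost_groups and
         * schedtune_initialized are defined, and to be called once from a
         * late init hook after the SchedTune cgroup hierarchy is ready.
         */
        static void schedtune_init_boost_group_locks(void)
        {
                struct boost_groups *bg;
                int cpu;

                /* Init the per-CPU lock that serializes boost group accounting */
                for_each_possible_cpu(cpu) {
                        bg = &per_cpu(cpu_boost_groups, cpu);
                        raw_spin_lock_init(&bg->lock);
                }

                /*
                 * The enqueue/dequeue/exit accounting paths bail out early
                 * until this flag is set, so publishing it must be the last
                 * initialization step.
                 */
                schedtune_initialized = true;
        }

The lock ordering the patch relies on is then consistent: the task's rq lock is either already held (enqueue/dequeue paths) or taken via lock_rq_of() (can_attach(), schedtune_exit_task()) before the per-CPU bg->lock is touched.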