diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77d3b4c106cd..fc0f5db45791 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1289,6 +1289,7 @@ struct ravg {
 	u32 sum_history[RAVG_HIST_SIZE_MAX];
 #ifdef CONFIG_SCHED_FREQ_INPUT
 	u32 curr_window, prev_window;
+	u16 active_windows;
 #endif
 };
@@ -2125,10 +2126,15 @@ static inline cputime_t task_gtime(struct task_struct *t)
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
+struct sched_load {
+	unsigned long prev_load;
+	unsigned long new_task_load;
+};
+
 #if defined(CONFIG_SCHED_FREQ_INPUT)
 extern int sched_set_window(u64 window_start, unsigned int window_size);
 extern unsigned long sched_get_busy(int cpu);
-extern void sched_get_cpus_busy(unsigned long *busy,
+extern void sched_get_cpus_busy(struct sched_load *busy,
 				const struct cpumask *query_cpus);
 extern void sched_set_io_is_busy(int val);
 int sched_update_freq_max_load(const cpumask_t *cpumask);
@@ -2141,6 +2147,8 @@ static inline unsigned long sched_get_busy(int cpu)
 {
 	return 0;
 }
+static inline void sched_get_cpus_busy(struct sched_load *busy,
+				const struct cpumask *query_cpus) {};
 static inline void sched_set_io_is_busy(int val) {};
 static inline int sched_update_freq_max_load(const cpumask_t *cpumask)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5db0256590c6..0fb660306a9f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -69,6 +69,9 @@ extern unsigned int sysctl_sched_powerband_limit_pct;
 extern unsigned int sysctl_sched_lowspill_freq;
 extern unsigned int sysctl_sched_pack_freq;
 extern unsigned int sysctl_sched_boost;
+#if defined(CONFIG_SCHED_FREQ_INPUT)
+extern unsigned int sysctl_sched_new_task_windows;
+#endif
 #else /* CONFIG_SCHED_HMP */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 69aede209948..ae04e2095389 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -244,6 +244,9 @@ TRACE_EVENT(sched_update_task_ravg,
 		__field(	u64,	ps		)
 		__field(	u32,	curr_window	)
 		__field(	u32,	prev_window	)
+		__field(	u64,	nt_cs		)
+		__field(	u64,	nt_ps		)
+		__field(	u32,	active_windows	)
 #endif
 	),
@@ -267,12 +270,15 @@ TRACE_EVENT(sched_update_task_ravg,
 		__entry->ps		= rq->prev_runnable_sum;
 		__entry->curr_window	= p->ravg.curr_window;
 		__entry->prev_window	= p->ravg.prev_window;
+		__entry->nt_cs		= rq->nt_curr_runnable_sum;
+		__entry->nt_ps		= rq->nt_prev_runnable_sum;
+		__entry->active_windows	= p->ravg.active_windows;
 #endif
 	),
 	TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
 #ifdef CONFIG_SCHED_FREQ_INPUT
-		" cs %llu ps %llu cur_window %u prev_window %u"
+		" cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
 #endif
 		, __entry->wallclock, __entry->win_start, __entry->delta,
 		task_event_names[__entry->evt], __entry->cpu,
@@ -282,7 +288,9 @@ TRACE_EVENT(sched_update_task_ravg,
 		__entry->sum, __entry->irqtime
 #ifdef CONFIG_SCHED_FREQ_INPUT
 		, __entry->cs, __entry->ps, __entry->curr_window,
-		__entry->prev_window
+		__entry->prev_window,
+		__entry->nt_cs, __entry->nt_ps,
+		__entry->active_windows
 #endif
 		)
 );
@@ -374,37 +382,44 @@ TRACE_EVENT(sched_migration_update_sum,
 		__field(int,		pid		)
 		__field(	u64,	cs		)
 		__field(	u64,	ps		)
+		__field(	s64,	nt_cs		)
+		__field(	s64,	nt_ps		)
 	),
 	TP_fast_assign(
 		__entry->cpu		= cpu_of(rq);
 		__entry->cs		= rq->curr_runnable_sum;
 		__entry->ps		= rq->prev_runnable_sum;
+		__entry->nt_cs		= (s64)rq->nt_curr_runnable_sum;
+		__entry->nt_ps		= (s64)rq->nt_prev_runnable_sum;
 		__entry->pid		= p->pid;
 	),
-	TP_printk("cpu %d: cs %llu ps %llu pid %d", __entry->cpu,
-		__entry->cs, __entry->ps, __entry->pid)
+	TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
+		__entry->cpu, __entry->cs, __entry->ps,
+		__entry->nt_cs, __entry->nt_ps, __entry->pid)
 );
 TRACE_EVENT(sched_get_busy,
-	TP_PROTO(int cpu, u64 load),
+	TP_PROTO(int cpu, u64 load, u64 nload),
-	TP_ARGS(cpu, load),
+	TP_ARGS(cpu, load, nload),
 	TP_STRUCT__entry(
 		__field(	int,	cpu	)
 		__field(	u64,	load	)
+		__field(	u64,	nload	)
 	),
 	TP_fast_assign(
 		__entry->cpu		= cpu;
 		__entry->load		= load;
+		__entry->nload		= nload;
 	),
-	TP_printk("cpu %d load %lld",
-		__entry->cpu, __entry->load)
+	TP_printk("cpu %d load %lld new_task_load %lld",
+		__entry->cpu, __entry->load, __entry->nload)
 );
 TRACE_EVENT(sched_freq_alert,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 933c0e5baa99..4074dd46bc29 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1226,6 +1226,8 @@ static __read_mostly unsigned int sched_window_stats_policy =
 __read_mostly unsigned int sysctl_sched_window_stats_policy =
 	WINDOW_STATS_MAX_RECENT_AVG;
+__read_mostly unsigned int sysctl_sched_new_task_windows = 5;
+
 static __read_mostly unsigned int sched_account_wait_time = 1;
 __read_mostly unsigned int sysctl_sched_account_wait_time = 1;
@@ -1469,6 +1471,11 @@ heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
 	return (rq->window_start - p->ravg.mark_start > sched_ravg_window);
 }
+static inline bool is_new_task(struct task_struct *p)
+{
+	return p->ravg.active_windows < sysctl_sched_new_task_windows;
+}
+
 /*
  * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
  */
@@ -1481,11 +1488,17 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 	u64 window_start = rq->window_start;
 	u32 window_size = sched_ravg_window;
 	u64 delta;
+	bool new_task;
 	new_window = mark_start < window_start;
-	if (new_window)
+	if (new_window) {
 		nr_full_windows = div64_u64((window_start - mark_start),
						window_size);
+		if (p->ravg.active_windows < USHRT_MAX)
+			p->ravg.active_windows++;
+	}
+
+	new_task = is_new_task(p);
 	/* Handle per-task window rollover. We don't care about the idle
 	 * task or exiting tasks. */
@@ -1516,14 +1529,18 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		/* A new window has started. The RQ demand must be rolled
 		 * over if p is the current task. */
 		if (p_is_curr_task) {
-			u64 prev_sum = 0;
+			u64 prev_sum = 0, nt_prev_sum = 0;
 			/* p is either idle task or an exiting task */
-			if (!nr_full_windows)
+			if (!nr_full_windows) {
 				prev_sum = rq->curr_runnable_sum;
+				nt_prev_sum = rq->nt_curr_runnable_sum;
+			}
 			rq->prev_runnable_sum = prev_sum;
 			rq->curr_runnable_sum = 0;
+			rq->nt_prev_runnable_sum = nt_prev_sum;
+			rq->nt_curr_runnable_sum = 0;
 		} else if (heavy_task_wakeup(p, rq, event)) {
 			/* A new window has started. If p is a waking
@@ -1535,6 +1552,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			 * tunable.
 			 */
 			p->ravg.prev_window = p->ravg.demand;
 			rq->prev_runnable_sum += p->ravg.demand;
+			if (new_task)
+				rq->nt_prev_runnable_sum += p->ravg.demand;
 		}
 		return;
@@ -1553,6 +1572,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			delta = irqtime;
 		delta = scale_exec_time(delta, rq);
 		rq->curr_runnable_sum += delta;
+		if (new_task)
+			rq->nt_curr_runnable_sum += delta;
 		if (!is_idle_task(p) && !exiting_task(p))
 			p->ravg.curr_window += delta;
@@ -1586,10 +1607,14 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			p->ravg.prev_window = delta;
 		}
 		rq->prev_runnable_sum += delta;
+		if (new_task)
+			rq->nt_prev_runnable_sum += delta;
 		/* Account piece of busy time in the current window. */
 		delta = scale_exec_time(wallclock - window_start, rq);
 		rq->curr_runnable_sum += delta;
+		if (new_task)
+			rq->nt_curr_runnable_sum += delta;
 		if (!exiting_task(p))
 			p->ravg.curr_window = delta;
@@ -1615,6 +1640,11 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			delta = scale_exec_time(window_start - mark_start, rq);
 			if (!is_idle_task(p) && !exiting_task(p))
 				p->ravg.prev_window += delta;
+
+			rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
+			if (new_task)
+				rq->nt_prev_runnable_sum += delta;
+
 			delta += rq->curr_runnable_sum;
 		} else {
 			/* Since at least one full window has elapsed,
@@ -1623,14 +1653,27 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			delta = scale_exec_time(window_size, rq);
 			if (!is_idle_task(p) && !exiting_task(p))
 				p->ravg.prev_window = delta;
+
+			if (new_task)
+				rq->nt_prev_runnable_sum = delta;
+			else
+				rq->nt_prev_runnable_sum = 0;
 		}
-		/* Rollover is done here by overwriting the values in
-		 * prev_runnable_sum and curr_runnable_sum. */
+		/*
+		 * Rollover of the normal runnable sums is done here by
+		 * overwriting the values in prev_runnable_sum and
+		 * curr_runnable_sum. Rollover of the new-task runnable sums
+		 * was already handled by the if-else statement above.
+		 */
 		rq->prev_runnable_sum = delta;
 		/* Account piece of busy time in the current window. */
 		delta = scale_exec_time(wallclock - window_start, rq);
 		rq->curr_runnable_sum = delta;
+		if (new_task)
+			rq->nt_curr_runnable_sum = delta;
+		else
+			rq->nt_curr_runnable_sum = 0;
 		if (!is_idle_task(p) && !exiting_task(p))
 			p->ravg.curr_window = delta;
@@ -1654,6 +1697,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		/* Roll window over. If IRQ busy time was just in the current
 		 * window then that is all that need be accounted.
 		 */
 		rq->prev_runnable_sum = rq->curr_runnable_sum;
+		rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
+		rq->nt_curr_runnable_sum = 0;
 		if (mark_start > window_start) {
 			rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
 			return;
 		}
@@ -2080,6 +2125,7 @@ static inline void set_window_start(struct rq *rq)
 		rq->window_start = cpu_rq(sync_cpu)->window_start;
 #ifdef CONFIG_SCHED_FREQ_INPUT
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
 #endif
 		raw_spin_unlock(&sync_rq->lock);
 	}
@@ -2212,6 +2258,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 		rq->window_start = window_start;
 #ifdef CONFIG_SCHED_FREQ_INPUT
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
 #endif
 		reset_cpu_hmp_stats(cpu, 1);
@@ -2269,12 +2316,13 @@ scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
 	return div64_u64(load * (u64)src_freq, (u64)dst_freq);
 }
-void sched_get_cpus_busy(unsigned long *busy, const struct cpumask *query_cpus)
+void sched_get_cpus_busy(struct sched_load *busy,
+			 const struct cpumask *query_cpus)
 {
 	unsigned long flags;
 	struct rq *rq;
 	const int cpus = cpumask_weight(query_cpus);
-	u64 load[cpus];
+	u64 load[cpus], nload[cpus];
 	unsigned int cur_freq[cpus], max_freq[cpus];
 	int notifier_sent[cpus];
 	int cpu, i = 0;
@@ -2299,6 +2347,7 @@ void sched_get_cpus_busy(unsigned long *busy, const struct cpumask *query_cpus)
 		update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
 		load[i] = rq->old_busy_time = rq->prev_runnable_sum;
+		nload[i] = rq->nt_prev_runnable_sum;
 		/*
 		 * Scale load in reference to rq->max_possible_freq.
 		 *
@@ -2306,6 +2355,7 @@ void sched_get_cpus_busy(unsigned long *busy, const struct cpumask *query_cpus)
 		 * rq->max_freq.
 		 */
 		load[i] = scale_load_to_cpu(load[i], cpu);
+		nload[i] = scale_load_to_cpu(nload[i], cpu);
 		notifier_sent[i] = rq->notifier_sent;
 		rq->notifier_sent = 0;
@@ -2325,18 +2375,29 @@ void sched_get_cpus_busy(unsigned long *busy, const struct cpumask *query_cpus)
 		if (!notifier_sent[i]) {
 			load[i] = scale_load_to_freq(load[i], max_freq[i],
						     cur_freq[i]);
+			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+						      cur_freq[i]);
 			if (load[i] > window_size)
 				load[i] = window_size;
+			if (nload[i] > window_size)
+				nload[i] = window_size;
+
 			load[i] = scale_load_to_freq(load[i], cur_freq[i],
						     rq->max_possible_freq);
+			nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
+						      rq->max_possible_freq);
 		} else {
 			load[i] = scale_load_to_freq(load[i], max_freq[i],
						     rq->max_possible_freq);
+			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+						      rq->max_possible_freq);
 		}
-		busy[i] = div64_u64(load[i], NSEC_PER_USEC);
+		busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
+		busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
-		trace_sched_get_busy(cpu, busy[i]);
+		trace_sched_get_busy(cpu, busy[i].prev_load,
+				     busy[i].new_task_load);
 		i++;
 	}
 }
@@ -2344,12 +2405,12 @@ unsigned long sched_get_busy(int cpu)
 {
 	struct cpumask query_cpu = CPU_MASK_NONE;
-	unsigned long busy;
+	struct sched_load busy;
 	cpumask_set_cpu(cpu, &query_cpu);
 	sched_get_cpus_busy(&busy, &query_cpu);
-	return busy;
+	return busy.prev_load;
 }
 void sched_set_io_is_busy(int val)
 {
@@ -2399,6 +2460,7 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
 	struct rq *src_rq = task_rq(p);
 	struct rq *dest_rq = cpu_rq(new_cpu);
 	u64 wallclock;
+	bool new_task;
 	if (!sched_enable_hmp || !sched_migration_fixup ||
		exiting_task(p) || (!p->on_rq && p->state != TASK_WAKING))
@@ -2421,18 +2483,30 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
 	update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+	new_task = is_new_task(p);
+
 	if (p->ravg.curr_window) {
 		src_rq->curr_runnable_sum -= p->ravg.curr_window;
 		dest_rq->curr_runnable_sum += p->ravg.curr_window;
+		if (new_task) {
+			src_rq->nt_curr_runnable_sum -= p->ravg.curr_window;
+			dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+		}
 	}
 	if (p->ravg.prev_window) {
 		src_rq->prev_runnable_sum -= p->ravg.prev_window;
 		dest_rq->prev_runnable_sum += p->ravg.prev_window;
+		if (new_task) {
+			src_rq->nt_prev_runnable_sum -= p->ravg.prev_window;
+			dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+		}
 	}
 	BUG_ON((s64)src_rq->prev_runnable_sum < 0);
 	BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+	BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
+	BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
 	trace_sched_migration_update_sum(src_rq, p);
 	trace_sched_migration_update_sum(dest_rq, p);
@@ -9429,6 +9503,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SCHED_FREQ_INPUT
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
 		rq->old_busy_time = 0;
 		rq->notifier_sent = 0;
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b38041e3df9b..fa15ca43e312 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -702,6 +702,8 @@ struct rq {
 #ifdef CONFIG_SCHED_FREQ_INPUT
 	u64 curr_runnable_sum;
 	u64 prev_runnable_sum;
+	u64 nt_curr_runnable_sum;
+	u64 nt_prev_runnable_sum;
 #endif
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dfe5a6e2d22a..369120e75efe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -431,6 +431,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sched_new_task_windows",
+		.data		= &sysctl_sched_new_task_windows,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_window_update_handler,
+	},
 	{
 		.procname	= "sched_boost",
 		.data		= &sysctl_sched_boost,
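
Reviewer note: the heart of this change is that a task counts as "new" while p->ravg.active_windows is still below sysctl_sched_new_task_windows (default 5), and the per-rq counters nt_curr_runnable_sum/nt_prev_runnable_sum accumulate only such tasks' busy time, rolling over window by window alongside the existing sums. The sketch below is a minimal userspace model of that bookkeeping, for illustration only; apart from the counter and field names taken from this patch, every type and helper is invented, and it deliberately ignores IRQ time, migration and the heavy-task wakeup path.

/*
 * Illustration only: a standalone userspace model of the new-task busy-time
 * bookkeeping this patch adds to update_cpu_busy_time(). All names except
 * active_windows and the nt_*_runnable_sum counters are invented.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NEW_TASK_WINDOWS 5		/* models sysctl_sched_new_task_windows */

struct mock_task {
	uint16_t active_windows;	/* models p->ravg.active_windows */
};

struct mock_rq {
	uint64_t curr_runnable_sum, prev_runnable_sum;
	uint64_t nt_curr_runnable_sum, nt_prev_runnable_sum;
};

/* models is_new_task() from the patch */
static bool is_new_task(const struct mock_task *p)
{
	return p->active_windows < NEW_TASK_WINDOWS;
}

/* Account delta ns of busy time in the current window; new tasks also
 * contribute to the nt_ counter. */
static void account(struct mock_rq *rq, struct mock_task *p, uint64_t delta)
{
	rq->curr_runnable_sum += delta;
	if (is_new_task(p))
		rq->nt_curr_runnable_sum += delta;
}

/* Loosely models the window rollover: bump the task's window count
 * (saturating at USHRT_MAX, as the patch does) and roll both pairs of
 * sums from "curr" to "prev". */
static void rollover(struct mock_rq *rq, struct mock_task *p)
{
	if (p->active_windows < USHRT_MAX)
		p->active_windows++;

	rq->prev_runnable_sum = rq->curr_runnable_sum;
	rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
	rq->curr_runnable_sum = 0;
	rq->nt_curr_runnable_sum = 0;
}

int main(void)
{
	struct mock_rq rq = { 0 };
	struct mock_task p = { 0 };
	int w;

	/* After 5 completed windows the task stops counting as "new" and
	 * nt_curr_runnable_sum stops following curr_runnable_sum. */
	for (w = 0; w < 8; w++) {
		account(&rq, &p, 1000000);	/* 1 ms of busy time */
		printf("window %d: cs=%llu nt_cs=%llu new=%d\n", w,
		       (unsigned long long)rq.curr_runnable_sum,
		       (unsigned long long)rq.nt_curr_runnable_sum,
		       is_new_task(&p));
		rollover(&rq, &p);
	}
	return 0;
}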
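
Reviewer note: on the consumer side, sched_get_cpus_busy() now fills one struct sched_load per queried CPU (the previous-window load plus the share contributed by new tasks, both in microseconds) instead of a bare unsigned long, while sched_get_busy() keeps its old behaviour by returning only prev_load. Below is a hedged sketch of a possible caller; only struct sched_load and sched_get_cpus_busy() come from this patch, the surrounding function and its pr_debug policy are hypothetical.

/*
 * Illustration only: a hypothetical CONFIG_SCHED_FREQ_INPUT consumer that
 * reads both figures for a set of CPUs. Only struct sched_load and
 * sched_get_cpus_busy() are introduced by this patch.
 */
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>

static void example_report_busy(const struct cpumask *cpus)
{
	struct sched_load *busy;
	int cpu, i = 0;

	busy = kcalloc(cpumask_weight(cpus), sizeof(*busy), GFP_KERNEL);
	if (!busy)
		return;

	/* One call fills prev_load and new_task_load for every queried CPU,
	 * already scaled by the scheduler and expressed in microseconds. */
	sched_get_cpus_busy(busy, cpus);

	for_each_cpu(cpu, cpus) {
		pr_debug("cpu%d: prev_load=%lu us new_task_load=%lu us\n",
			 cpu, busy[i].prev_load, busy[i].new_task_load);
		i++;
	}

	kfree(busy);
}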