From b40bf941f61756bcca03a818b4c8fa857612f8cd Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Mon, 2 Mar 2015 11:25:17 -0800
Subject: [PATCH] sched: add scheduling latency tracking procfs node

Add a new procfs node /proc/sys/kernel/sched_max_latency_us to track
the worst scheduling latency.  It provides an easier way to identify
the maximum scheduling latency seen across the CPUs.

Change-Id: I6e435bbf825c0a4dff2eded4a1256fb93f108d0e
[joonwoop@codeaurora.org: fixed conflict in update_stats_wait_end().]
Signed-off-by: Joonwoo Park
---
 Documentation/scheduler/sched-stats.txt |  7 +++
 include/linux/sched/sysctl.h            |  4 ++
 kernel/sched/fair.c                     | 63 +++++++++++++++++++++++++
 kernel/sysctl.c                         |  5 ++
 4 files changed, 79 insertions(+)

diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.txt
index 30635c5ac2f5..158e01697459 100644
--- a/Documentation/scheduler/sched-stats.txt
+++ b/Documentation/scheduler/sched-stats.txt
@@ -156,6 +156,13 @@ Set sched_latency_warn_threshold_us or sched_latency_panic_threshold_us with
 non-zero threshold to warn or panic system when scheduling latency higher
 than configured threshold is detected. Default is 0 (disabled) for both.
 
+/proc/sys/kernel/sched_max_latency_us
+----------------
+/proc/sys/kernel/sched_max_latency_us shows the maximum scheduling latency seen
+across the CPUs. The file reports the maximum latency in microseconds along
+with the CPU number and the task that incurred the maximum latency on that
+CPU. The maximum latency can be reset by writing any value to the file.
+
 A program could be easily written to make use of these extra fields to
 report on how well a particular process or set of processes is faring
 under the scheduler's policies. A simple version of such a program is
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index fdb75cab5243..c614b17c6069 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -132,6 +132,10 @@ extern unsigned int sysctl_sched_autogroup_enabled;
 #ifdef CONFIG_SCHEDSTATS
 extern unsigned int sysctl_sched_latency_panic_threshold;
 extern unsigned int sysctl_sched_latency_warn_threshold;
+
+extern int sched_max_latency_sysctl(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos);
 #endif
 
 extern int sched_rr_timeslice;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1da6091b54f3..ec3ee54800a8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -126,6 +126,14 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #ifdef CONFIG_SCHEDSTATS
 unsigned int sysctl_sched_latency_panic_threshold;
 unsigned int sysctl_sched_latency_warn_threshold;
+
+struct sched_max_latency {
+	unsigned int latency_us;
+	char comm[TASK_COMM_LEN];
+	pid_t pid;
+};
+
+static DEFINE_PER_CPU(struct sched_max_latency, sched_max_latency);
 #endif /* CONFIG_SCHEDSTATS */
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -756,6 +764,54 @@ static void update_curr_fair(struct rq *rq)
 }
 
 #ifdef CONFIG_SCHEDSTATS
+int sched_max_latency_sysctl(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	int i, cpu = nr_cpu_ids;
+	char msg[256];
+	unsigned long flags;
+	struct rq *rq;
+	struct sched_max_latency max, *lat;
+
+	if (!write) {
+		max.latency_us = 0;
+		for_each_possible_cpu(i) {
+			rq = cpu_rq(i);
+			raw_spin_lock_irqsave(&rq->lock, flags);
+
+			lat = &per_cpu(sched_max_latency, i);
+			if (max.latency_us < lat->latency_us) {
+				max = *lat;
+				cpu = i;
+			}
+
+			raw_spin_unlock_irqrestore(&rq->lock, flags);
+		}
+
+		if (cpu != nr_cpu_ids) {
+			table->maxlen =
+				snprintf(msg, sizeof(msg),
+					 "cpu%d comm=%s pid=%d latency=%u(us)",
+					 cpu, max.comm, max.pid, max.latency_us);
+			table->data = msg;
+			ret = proc_dostring(table, write, buffer, lenp, ppos);
+		}
+	} else {
+		for_each_possible_cpu(i) {
+			rq = cpu_rq(i);
+			raw_spin_lock_irqsave(&rq->lock, flags);
+
+			memset(&per_cpu(sched_max_latency, i), 0,
+			       sizeof(struct sched_max_latency));
+
+			raw_spin_unlock_irqrestore(&rq->lock, flags);
+		}
+	}
+
+	return ret;
+}
+
 static inline void check_for_high_latency(struct task_struct *p, u64 latency_us)
 {
 	int do_warn, do_panic;
@@ -792,6 +848,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct task_struct *p;
 	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+	struct sched_max_latency *max;
 
 	if (entity_is_task(se)) {
 		p = task_of(se);
@@ -805,6 +862,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			return;
 		}
 		trace_sched_stat_wait(p, delta);
+		max = this_cpu_ptr(&sched_max_latency);
+		if (max->latency_us < (delta >> 10)) {
+			max->latency_us = delta >> 10;
+			max->pid = task_of(se)->pid;
+			memcpy(max->comm, task_of(se)->comm, TASK_COMM_LEN);
+		}
 		check_for_high_latency(p, delta >> 10);
 	}
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 188c7fabeec0..591cacea6a8d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -608,6 +608,11 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 	},
+	{
+		.procname	= "sched_max_latency_us",
+		.mode		= 0644,
+		.proc_handler	= sched_max_latency_sysctl,
+	},
 #endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
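
Usage note (illustration only, not part of the patch): the sketch below shows how
a userspace tool might poll and then reset the new node, assuming a kernel built
with CONFIG_SCHEDSTATS and this patch applied.  The path and the record format
("cpu%d comm=%s pid=%d latency=%u(us)") are the ones produced by
sched_max_latency_sysctl(); writing any value clears the per-CPU maxima, and the
write normally requires root.  Note that the tracked value is derived from
delta >> 10 (nanoseconds divided by 1024), so the reported microseconds are a
slight underestimate.

	/* Sketch: read the worst-case scheduling latency, then reset it. */
	#include <stdio.h>
	#include <stdlib.h>

	#define NODE "/proc/sys/kernel/sched_max_latency_us"

	int main(void)
	{
		char buf[256];
		FILE *fp;

		/* Read one record, e.g. "cpu2 comm=foo pid=1234 latency=873(us)". */
		fp = fopen(NODE, "r");
		if (!fp) {
			perror("fopen " NODE);
			return EXIT_FAILURE;
		}
		if (fgets(buf, sizeof(buf), fp))
			printf("worst latency so far: %s", buf);
		fclose(fp);

		/* Writing any value resets the per-CPU tracking. */
		fp = fopen(NODE, "w");
		if (!fp) {
			perror("fopen " NODE);
			return EXIT_FAILURE;
		}
		fputs("0\n", fp);
		fclose(fp);

		return EXIT_SUCCESS;
	}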