From 59f16ae0345c902c1d09da75e0f89d7e7ddbc54f Mon Sep 17 00:00:00 2001
From: Olav Haugan
Date: Thu, 18 Aug 2016 17:22:44 -0700
Subject: [PATCH] core_ctrl: Move core control into kernel

Move core control from an out-of-tree module into the kernel proper.

Core control monitors load on CPUs and controls how many CPUs are
available for the system to use at any point in time. This can help
save power. Core control can be configured through a sysfs interface.

Change-Id: Ia78e701468ea3828195c2a15c9cf9fafd099804a
Signed-off-by: Olav Haugan
---
 init/Kconfig            |   10 +
 kernel/sched/Makefile   |    1 +
 kernel/sched/core_ctl.c | 1014 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 1025 insertions(+)
 create mode 100644 kernel/sched/core_ctl.c

diff --git a/init/Kconfig b/init/Kconfig
index 9ad1ae9d9da8..6020a351c57b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1170,6 +1170,16 @@ config SCHED_HMP_CSTATE_AWARE
 	  with CPUs C-state. If this is enabled, scheduler places tasks
 	  onto the shallowest C-state CPU among the most power efficient CPUs.
 
+config SCHED_CORE_CTL
+	bool "QTI Core Control"
+	depends on SMP
+	help
+	  This option enables the core control functionality in
+	  the scheduler. Core control automatically offlines and
+	  onlines cores based on CPU load and utilization.
+
+	  If unsure, say N here.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support" if EXPERT
 	select PROC_CHILDREN
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 1f159743ebfc..508b65690288 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
new file mode 100644
index 000000000000..8f071757d516
--- /dev/null
+++ b/kernel/sched/core_ctl.c
@@ -0,0 +1,1014 @@
+/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include <trace/events/sched.h>
+
+#define MAX_CPUS_PER_GROUP 4
+
+struct cpu_data {
+	/* Per CPU data. 
*/ + bool inited; + bool online; + bool rejected; + bool is_busy; + bool not_preferred; + unsigned int busy; + unsigned int cpu; + struct list_head sib; + unsigned int first_cpu; + + /* Per cluster data set only on first CPU */ + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_GROUP]; + unsigned int busy_down_thres[MAX_CPUS_PER_GROUP]; + unsigned int online_cpus; + unsigned int avail_cpus; + unsigned int num_cpus; + unsigned int need_cpus; + unsigned int task_thres; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + int nrrun; + bool nrrun_changed; + struct timer_list timer; + struct task_struct *hotplug_thread; + struct kobject kobj; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cpu_data *f); +static void wake_up_hotplug_thread(struct cpu_data *state); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_min_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_hotplug_thread(state); + + return count; +} + +static ssize_t show_max_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + count += snprintf(buf + count, 
PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(struct cpu_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cpu_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t show_cpus(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry(c, &state->lru, sib) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u (%s)\n", c->cpu, + c->online ? "Online" : "Offline"); + } + spin_unlock_irqrestore(&state_lock, flags); + return count; +} + +static ssize_t show_need_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_online_cpus(struct cpu_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->online_cpus); +} + +static ssize_t show_global_state(struct cpu_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + c = &per_cpu(cpu_state, cpu); + if (!c->inited) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", c->online); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tRejected: %u\n", c->rejected); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", c->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + if (c->cpu != c->first_cpu) + continue; + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", c->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tAvail CPUs: %u\n", c->avail_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", c->need_cpus); + } + + return count; +} + +static ssize_t store_not_preferred(struct cpu_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i, first_cpu; + unsigned int val[MAX_CPUS_PER_GROUP]; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + first_cpu = state->first_cpu; + + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, first_cpu); + c->not_preferred = 
val[i];
+		first_cpu++;
+	}
+
+	return count;
+}
+
+static ssize_t show_not_preferred(struct cpu_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned int i, first_cpu;
+
+	first_cpu = state->first_cpu;
+
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, first_cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				"\tCPU:%d %u\n", first_cpu, c->not_preferred);
+		first_cpu++;
+	}
+
+	return count;
+}
+
+struct core_ctl_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct cpu_data *, char *);
+	ssize_t (*store)(struct cpu_data *, const char *, size_t count);
+};
+
+#define core_ctl_attr_ro(_name)		\
+static struct core_ctl_attr _name =	\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name)			\
+static struct core_ctl_attr _name =		\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(cpus);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(online_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+
+static struct attribute *default_attrs[] = {
+	&min_cpus.attr,
+	&max_cpus.attr,
+	&offline_delay_ms.attr,
+	&busy_up_thres.attr,
+	&busy_down_thres.attr,
+	&task_thres.attr,
+	&is_big_cluster.attr,
+	&cpus.attr,
+	&need_cpus.attr,
+	&online_cpus.attr,
+	&global_state.attr,
+	&not_preferred.attr,
+	NULL
+};
+
+#define to_cpu_data(k) container_of(k, struct cpu_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct cpu_data *data = to_cpu_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->show)
+		ret = cattr->show(data, buf);
+
+	return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct cpu_data *data = to_cpu_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->store)
+		ret = cattr->store(data, buf, count);
+
+	return ret;
+}
+
+static const struct sysfs_ops sysfs_ops = {
+	.show = show,
+	.store = store,
+};
+
+static struct kobj_type ktype_core_ctl = {
+	.sysfs_ops = &sysfs_ops,
+	.default_attrs = default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+#define NR_RUNNING_TOLERANCE 5
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+static s64 rq_avg_timestamp_ms;
+static struct timer_list rq_avg_timer;
+
+static void update_running_avg(bool trigger_update)
+{
+	int cpu;
+	struct cpu_data *pcpu;
+	int avg, iowait_avg, big_avg, old_nrrun;
+	s64 now;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	now = ktime_to_ms(ktime_get());
+	if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return;
+	}
+	rq_avg_timestamp_ms = now;
+	sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/*
+	 * Round up to the next integer if the average nr running tasks
+	 * is within NR_RUNNING_TOLERANCE/100 of the next integer.
+	 * If normal rounding up is used, it will allow a transient task
+	 * to trigger online event. By the time core is onlined, the task
+	 * has finished. 
+ * Rounding to closest suffers same problem because scheduler + * might only provide running stats per jiffy, and a transient + * task could skew the number for one jiffy. If core control + * samples every 2 jiffies, it will observe 0.5 additional running + * average which rounds up to 1 task. + */ + avg = (avg + NR_RUNNING_TOLERANCE) / 100; + big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100; + + for_each_possible_cpu(cpu) { + pcpu = &per_cpu(cpu_state, cpu); + if (!pcpu->inited || pcpu->first_cpu != cpu) + continue; + old_nrrun = pcpu->nrrun; + /* + * Big cluster only need to take care of big tasks, but if + * there are not enough big cores, big tasks need to be run + * on little as well. Thus for little's runqueue stat, it + * has to use overall runqueue average, or derive what big + * tasks would have to be run on little. The latter approach + * is not easy to get given core control reacts much slower + * than scheduler, and can't predict scheduler's behavior. + */ + pcpu->nrrun = pcpu->is_big_cluster ? big_avg : avg; + if (pcpu->nrrun != old_nrrun) { + if (trigger_update) + apply_need(pcpu); + else + pcpu->nrrun_changed = true; + } + } +} + +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(struct cpu_data *f, unsigned int new_need) +{ + /* Online all cores if there are enough tasks */ + if (f->nrrun >= f->task_thres) + return f->num_cpus; + + /* only online more cores if there are tasks to run */ + if (f->nrrun > new_need) + return new_need + 1; + + return new_need; +} + +static u64 round_to_nw_start(void) +{ + unsigned long step = msecs_to_jiffies(rq_avg_period_ms); + u64 jif = get_jiffies_64(); + + do_div(jif, step); + return (jif + 1) * step; +} + +static void rq_avg_timer_func(unsigned long not_used) +{ + update_running_avg(true); + mod_timer(&rq_avg_timer, round_to_nw_start()); +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(struct cpu_data *f, unsigned int need_cpus) +{ + return min(max(f->min_cpus, need_cpus), f->max_cpus); +} + +static bool eval_need(struct cpu_data *f) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + s64 now; + + if (unlikely(!f->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + thres_idx = f->online_cpus ? 
f->online_cpus - 1 : 0; + list_for_each_entry(c, &f->lru, sib) { + if (c->busy >= f->busy_up_thres[thres_idx]) + c->is_busy = true; + else if (c->busy < f->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(f, need_cpus); + need_flag = apply_limits(f, need_cpus) != apply_limits(f, f->need_cpus); + last_need = f->need_cpus; + + now = ktime_to_ms(ktime_get()); + + if (need_cpus == last_need) { + f->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + if (need_cpus > last_need) { + ret = 1; + } else if (need_cpus < last_need) { + s64 elapsed = now - f->need_ts; + + if (elapsed >= f->offline_delay_ms) { + ret = 1; + } else { + mod_timer(&f->timer, jiffies + + msecs_to_jiffies(f->offline_delay_ms)); + } + } + + if (ret) { + f->need_ts = now; + f->need_cpus = need_cpus; + } + + trace_core_ctl_eval_need(f->cpu, last_need, need_cpus, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cpu_data *f) +{ + if (eval_need(f)) + wake_up_hotplug_thread(f); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data *c = &per_cpu(cpu_state, cpu); + struct cpu_data *f; + unsigned int old_is_busy = c->is_busy; + + if (!c->inited) + return 0; + f = &per_cpu(cpu_state, c->first_cpu); + + update_running_avg(false); + if (c->busy == busy && !f->nrrun_changed) + return 0; + c->busy = busy; + f->nrrun_changed = false; + + apply_need(f); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +/* + * If current thread is hotplug thread, don't attempt to wake up + * itself or other hotplug threads because it will deadlock. Instead, + * schedule a timer to fire in next timer tick and wake up the thread. 
+ */ +static void wake_up_hotplug_thread(struct cpu_data *state) +{ + unsigned long flags; + int cpu; + struct cpu_data *pcpu; + bool no_wakeup = false; + + for_each_possible_cpu(cpu) { + pcpu = &per_cpu(cpu_state, cpu); + if (cpu != pcpu->first_cpu) + continue; + if (pcpu->hotplug_thread == current) { + no_wakeup = true; + break; + } + } + + spin_lock_irqsave(&state->pending_lock, flags); + state->pending = true; + spin_unlock_irqrestore(&state->pending_lock, flags); + + if (no_wakeup) { + spin_lock_irqsave(&state_lock, flags); + mod_timer(&state->timer, jiffies); + spin_unlock_irqrestore(&state_lock, flags); + } else { + wake_up_process(state->hotplug_thread); + } +} + +static void core_ctl_timer_func(unsigned long cpu) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + unsigned long flags; + + if (eval_need(state)) { + spin_lock_irqsave(&state->pending_lock, flags); + state->pending = true; + spin_unlock_irqrestore(&state->pending_lock, flags); + wake_up_process(state->hotplug_thread); + } + +} + +static int core_ctl_online_core(unsigned int cpu) +{ + int ret; + struct device *dev; + + lock_device_hotplug(); + dev = get_cpu_device(cpu); + if (!dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, cpu); + ret = -ENODEV; + } else { + ret = device_online(dev); + } + unlock_device_hotplug(); + return ret; +} + +static int core_ctl_offline_core(unsigned int cpu) +{ + int ret; + struct device *dev; + + lock_device_hotplug(); + dev = get_cpu_device(cpu); + if (!dev) { + pr_err("%s: failed to get cpu%d device\n", __func__, cpu); + ret = -ENODEV; + } else { + ret = device_offline(dev); + } + unlock_device_hotplug(); + return ret; +} + +static void __ref do_hotplug(struct cpu_data *f) +{ + unsigned int need; + struct cpu_data *c, *tmp; + + need = apply_limits(f, f->need_cpus); + pr_debug("Trying to adjust group %u to %u\n", f->first_cpu, need); + + if (f->online_cpus > need) { + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (!c->online) + continue; + + if (f->online_cpus == need) + break; + + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + pr_debug("Trying to Offline CPU%u\n", c->cpu); + if (core_ctl_offline_core(c->cpu)) + pr_debug("Unable to Offline CPU%u\n", c->cpu); + } + + /* + * If the number of online CPUs is within the limits, then + * don't force any busy CPUs offline. 
+ */ + if (f->online_cpus <= f->max_cpus) + return; + + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (!c->online) + continue; + + if (f->online_cpus <= f->max_cpus) + break; + + pr_debug("Trying to Offline CPU%u\n", c->cpu); + if (core_ctl_offline_core(c->cpu)) + pr_debug("Unable to Offline CPU%u\n", c->cpu); + } + } else if (f->online_cpus < need) { + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (c->online || c->rejected || c->not_preferred) + continue; + if (f->online_cpus == need) + break; + + pr_debug("Trying to Online CPU%u\n", c->cpu); + if (core_ctl_online_core(c->cpu)) + pr_debug("Unable to Online CPU%u\n", c->cpu); + } + + if (f->online_cpus == need) + return; + + + list_for_each_entry_safe(c, tmp, &f->lru, sib) { + if (c->online || c->rejected || !c->not_preferred) + continue; + if (f->online_cpus == need) + break; + + pr_debug("Trying to Online CPU%u\n", c->cpu); + if (core_ctl_online_core(c->cpu)) + pr_debug("Unable to Online CPU%u\n", c->cpu); + } + + } +} + +static int __ref try_hotplug(void *data) +{ + struct cpu_data *f = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&f->pending_lock, flags); + if (!f->pending) { + spin_unlock_irqrestore(&f->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&f->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + f->pending = false; + spin_unlock_irqrestore(&f->pending_lock, flags); + + do_hotplug(f); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cpu_data *f; + int ret = NOTIFY_OK; + unsigned long flags; + + /* Don't affect suspend resume */ + if (action & CPU_TASKS_FROZEN) + return NOTIFY_OK; + + if (unlikely(!state->inited)) + return NOTIFY_OK; + + f = &per_cpu(cpu_state, state->first_cpu); + + switch (action) { + case CPU_UP_PREPARE: + + /* If online state of CPU somehow got out of sync, fix it. */ + if (state->online) { + f->online_cpus--; + state->online = false; + pr_warn("CPU%d offline when state is online\n", cpu); + } + + if (state->rejected) { + state->rejected = false; + f->avail_cpus++; + } + + /* + * If a CPU is in the process of coming up, mark it as online + * so that there's no race with hotplug thread bringing up more + * CPUs than necessary. + */ + if (apply_limits(f, f->need_cpus) <= f->online_cpus) { + pr_debug("Prevent CPU%d onlining\n", cpu); + ret = NOTIFY_BAD; + } else { + state->online = true; + f->online_cpus++; + } + break; + + case CPU_ONLINE: + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + spin_lock_irqsave(&state_lock, flags); + list_del(&state->sib); + list_add_tail(&state->sib, &f->lru); + spin_unlock_irqrestore(&state_lock, flags); + break; + + case CPU_DEAD: + /* Move a CPU to the end of the LRU when it goes offline. */ + spin_lock_irqsave(&state_lock, flags); + list_del(&state->sib); + list_add_tail(&state->sib, &f->lru); + spin_unlock_irqrestore(&state_lock, flags); + + /* Fall through */ + + case CPU_UP_CANCELED: + + /* If online state of CPU somehow got out of sync, fix it. 
*/
+		if (!state->online) {
+			f->online_cpus++;
+			pr_warn("CPU%d online when state is offline\n", cpu);
+		}
+
+		if (!state->rejected && action == CPU_UP_CANCELED) {
+			state->rejected = true;
+			f->avail_cpus--;
+		}
+
+		state->online = false;
+		state->busy = 0;
+		f->online_cpus--;
+		break;
+	}
+
+	if (f->online_cpus < apply_limits(f, f->need_cpus)
+	    && f->online_cpus < f->avail_cpus
+	    && action == CPU_DEAD)
+		wake_up_hotplug_thread(f);
+
+	return ret;
+}
+
+static struct notifier_block __refdata cpu_notifier = {
+	.notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+static int group_init(struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	struct cpu_data *f = &per_cpu(cpu_state, first_cpu);
+	struct cpu_data *state;
+	unsigned int cpu;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+	if (likely(f->inited))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	f->num_cpus = cpumask_weight(mask);
+	if (f->num_cpus > MAX_CPUS_PER_GROUP) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+	f->min_cpus = 1;
+	f->max_cpus = f->num_cpus;
+	f->need_cpus = f->num_cpus;
+	f->avail_cpus = f->num_cpus;
+	f->offline_delay_ms = 100;
+	f->task_thres = UINT_MAX;
+	f->nrrun = f->num_cpus;
+	INIT_LIST_HEAD(&f->lru);
+	init_timer(&f->timer);
+	spin_lock_init(&f->pending_lock);
+	f->timer.function = core_ctl_timer_func;
+	f->timer.data = first_cpu;
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cpu = cpu;
+		state->first_cpu = first_cpu;
+
+		if (cpu_online(cpu)) {
+			f->online_cpus++;
+			state->online = true;
+		}
+
+		list_add_tail(&state->sib, &f->lru);
+	}
+
+	f->hotplug_thread = kthread_run(try_hotplug, (void *) f,
+					"core_ctl/%d", first_cpu);
+	sched_setscheduler_nocheck(f->hotplug_thread, SCHED_FIFO, &param);
+
+	for_each_cpu(cpu, mask) {
+		state = &per_cpu(cpu_state, cpu);
+		state->inited = true;
+	}
+
+	kobject_init(&f->kobj, &ktype_core_ctl);
+	return kobject_add(&f->kobj, &dev->kobj, "core_ctl");
+}
+
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+			     void *data)
+{
+	struct cpufreq_policy *policy = data;
+
+	switch (val) {
+	case CPUFREQ_CREATE_POLICY:
+		group_init(policy->related_cpus);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pol_nb = {
+	.notifier_call = cpufreq_policy_cb,
+};
+
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct cpufreq_govinfo *info = data;
+
+	switch (val) {
+	case CPUFREQ_LOAD_CHANGE:
+		core_ctl_set_busy(info->cpu, info->load);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_gov_nb = {
+	.notifier_call = cpufreq_gov_cb,
+};
+
+static int __init core_ctl_init(void)
+{
+	struct cpufreq_policy *policy;
+	unsigned int cpu;
+
+	register_cpu_notifier(&cpu_notifier);
+	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+	init_timer_deferrable(&rq_avg_timer);
+	rq_avg_timer.function = rq_avg_timer_func;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		policy = cpufreq_cpu_get(cpu);
+		if (policy) {
+			group_init(policy->related_cpus);
+			cpufreq_cpu_put(policy);
+		}
+	}
+	put_online_cpus();
+	mod_timer(&rq_avg_timer, round_to_nw_start());
+	return 0;
+}
+
+late_initcall(core_ctl_init);
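
Note (illustrative, not part of the patch): group_init() registers each group's kobject with kobject_add(&f->kobj, &dev->kobj, "core_ctl"), so the tunables land under the first CPU of each cpufreq policy group, e.g. /sys/devices/system/cpu/cpu0/core_ctl/. A minimal user-space sketch of driving that interface is below; the CPU number and the values written are assumptions for the example, only the node names come from the patch.

/*
 * Illustrative sketch: assumes CPU0 heads a core_ctl group. Keeps 2-4
 * cores online in the group and sets example busy thresholds; a single
 * threshold value is applied to every CPU in the group (the ret == 1
 * path in store_busy_up_thres/store_busy_down_thres).
 */
#include <stdio.h>

static int write_node(const char *node, const char *val)
{
	char path[128];
	FILE *fp;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu0/core_ctl/%s", node);
	fp = fopen(path, "w");
	if (!fp) {
		perror(path);
		return -1;
	}
	fputs(val, fp);
	return fclose(fp);
}

int main(void)
{
	write_node("min_cpus", "2");
	write_node("max_cpus", "4");
	write_node("busy_up_thres", "60");
	write_node("busy_down_thres", "30");
	write_node("offline_delay_ms", "100");
	return 0;
}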