Merge remote-tracking branch 'linaro-ext/EAS/v4.4-easv5.2+aosp-changes' into android-4.4

Change-Id: Ic24b43ee867bc4f70b31bedaad734717b64b86a1

commit 341e02d8bb
41 changed files with 7187 additions and 150 deletions

@@ -0,0 +1,360 @@
===========================================================
Energy cost bindings for Energy Aware Scheduling
===========================================================

===========================================================
1 - Introduction
===========================================================

This note specifies bindings required for energy-aware scheduling
(EAS)[1]. Historically, the scheduler's primary objective has been
performance. EAS aims to provide an alternative objective - energy
efficiency. EAS relies on a simple platform energy cost model to
guide scheduling decisions. The model only considers the CPU
subsystem.

This note is aligned with the definition of the layout of physical
CPUs in the system as described in the ARM topology binding
description [2]. The concept is applicable to any system so long as
the cost model data is provided for those processing elements in
that system's topology that EAS is required to service.

Processing elements refer to hardware threads, CPUs and clusters of
related CPUs in increasing order of hierarchy.

EAS requires two key cost metrics - busy costs and idle costs. Busy
costs comprise a list of compute capacities for the processing
element in question and the corresponding power consumption at that
capacity. Idle costs comprise a list of power consumption values
for each idle state (C-state) that the processing element supports.
For a detailed description of these metrics, their derivation and
their use see [3].

These cost metrics are required for processing elements in all
scheduling domain levels that EAS is required to service.

===========================================================
2 - energy-costs node
===========================================================

Energy costs for the processing elements in scheduling domains that
EAS is required to service are defined in the energy-costs node
which acts as a container for the actual per processing element cost
nodes. A single energy-costs node is required for a given system.

- energy-costs node

        Usage: Required

        Description: The energy-costs node is a container node and
        its sub-nodes describe costs for each processing element at
        all scheduling domain levels that EAS is required to
        service.

        Node name must be "energy-costs".

        The energy-costs node's parent node must be the cpus node.

        The energy-costs node's child nodes can be:

        - one or more cost nodes.

        Any other configuration is considered invalid.

        The energy-costs node can only contain a single type of child
        node whose bindings are described in paragraph 4.

===========================================================
3 - energy-costs node child nodes naming convention
===========================================================

energy-costs child nodes must follow a naming convention where the
node name must be "thread-costN", "core-costN", "cluster-costN"
depending on whether the costs in the node are for a thread, core or
cluster. N (where N = {0, 1, ...}) is the node number and has no
bearing on the OS' logical thread, core or cluster index.

===========================================================
4 - cost node bindings
===========================================================

Bindings for cost nodes are defined as follows:

- cluster-cost node

        Description: must be declared within an energy-costs node. A
        system can contain multiple clusters and each cluster
        serviced by EAS must have a corresponding cluster-cost
        node.

        The cluster-cost node name must be "cluster-costN" as
        described in 3 above.

        A cluster-cost node must be a leaf node with no children.

        Properties for cluster-cost nodes are described in paragraph
        5 below.

        Any other configuration is considered invalid.

- core-cost node

        Description: must be declared within an energy-costs node. A
        system can contain multiple cores and each core serviced by
        EAS must have a corresponding core-cost node.

        The core-cost node name must be "core-costN" as described in
        3 above.

        A core-cost node must be a leaf node with no children.

        Properties for core-cost nodes are described in paragraph
        5 below.

        Any other configuration is considered invalid.

- thread-cost node

        Description: must be declared within an energy-costs node. A
        system can contain cores with multiple hardware threads and
        each thread serviced by EAS must have a corresponding
        thread-cost node.

        The thread-cost node name must be "thread-costN" as described
        in 3 above.

        A thread-cost node must be a leaf node with no children.

        Properties for thread-cost nodes are described in paragraph
        5 below.

        Any other configuration is considered invalid.

===========================================================
5 - Cost node properties
===========================================================

All cost node types must have only the following properties:

- busy-cost-data

        Usage: required
        Value type: An array of 2-item tuples. Each item is of type
        u32.
        Definition: The first item in the tuple is the capacity
        value as described in [3]. The second item in the tuple is
        the energy cost value as described in [3].

- idle-cost-data

        Usage: required
        Value type: An array of 1-item tuples. The item is of type
        u32.
        Definition: The item in the tuple is the energy cost value
        as described in [3].

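For illustration, a minimal core-cost node using these two properties could
look like the following fragment; the values are copied from the example dts
at the end of this document and the layout follows the bindings above:

        CPU_COST_0: core-cost0 {
                busy-cost-data = <
                        417   168
                        1024  616
                >;
                idle-cost-data = <
                        15
                        0
                >;
        };
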
===========================================================
6 - Extensions to the cpu node
===========================================================

The cpu node is extended with a property that establishes the
connection between the processing element represented by the cpu
node and the cost-nodes associated with this processing element.

The connection is expressed in line with the topological hierarchy
that this processing element belongs to starting with the level in
the hierarchy that this processing element itself belongs to through
to the highest level that EAS is required to service. The
connection cannot be sparse and must be contiguous from the
processing element's level through to the highest desired level. The
highest desired level must be the same for all processing elements.

Example: Given that a cpu node may represent a thread that is a part
of a core, this property may contain multiple elements which
associate the thread with cost nodes describing the costs for the
thread itself, the core the thread belongs to, the cluster the core
belongs to and so on. The elements must be ordered from the lowest
level nodes to the highest desired level that EAS must service. The
highest desired level must be the same for all cpu nodes. The
elements must not be sparse: there must be elements for the current
thread, the next level of hierarchy (core) and so on without any
'holes'.

Example: Given that a cpu node may represent a core that is a part
of a cluster of related cpus this property may contain multiple
elements which associate the core with cost nodes describing the
costs for the core itself, the cluster the core belongs to and so
on. The elements must be ordered from the lowest level nodes to the
highest desired level that EAS must service. The highest desired
level must be the same for all cpu nodes. The elements must not be
sparse: there must be elements for the current core, the next
level of hierarchy (cluster) and so on without any 'holes'.

If the system comprises hierarchical clusters of clusters, this
property will contain multiple associations with the relevant number
of cluster elements in hierarchical order.

Property added to the cpu node:

- sched-energy-costs

        Usage: required
        Value type: List of phandles
        Definition: a list of phandles to specific cost nodes in the
        energy-costs parent node that correspond to the processing
        element represented by this cpu node in hierarchical order
        of topology.

        The order of phandles in the list is significant. The first
        phandle is to the current processing element's own cost
        node. Subsequent phandles are to higher hierarchical level
        cost nodes up until the maximum level that EAS is to
        service.

        All cpu nodes must have the same highest level cost node.

        The phandle list must not be sparsely populated with handles
        to non-contiguous hierarchical levels. See commentary above
        for clarity.

        Any other configuration is invalid.

===========================================================
7 - Example dts
===========================================================

Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):

cpus {
        #address-cells = <2>;
        #size-cells = <0>;
        .
        .
        .
        A57_0: cpu@0 {
                compatible = "arm,cortex-a57","arm,armv8";
                reg = <0x0 0x0>;
                device_type = "cpu";
                enable-method = "psci";
                next-level-cache = <&A57_L2>;
                clocks = <&scpi_dvfs 0>;
                cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
                sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
        };

        A57_1: cpu@1 {
                compatible = "arm,cortex-a57","arm,armv8";
                reg = <0x0 0x1>;
                device_type = "cpu";
                enable-method = "psci";
                next-level-cache = <&A57_L2>;
                clocks = <&scpi_dvfs 0>;
                cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
                sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
        };

        A53_0: cpu@100 {
                compatible = "arm,cortex-a53","arm,armv8";
                reg = <0x0 0x100>;
                device_type = "cpu";
                enable-method = "psci";
                next-level-cache = <&A53_L2>;
                clocks = <&scpi_dvfs 1>;
                cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
                sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
        };

        A53_1: cpu@101 {
                compatible = "arm,cortex-a53","arm,armv8";
                reg = <0x0 0x101>;
                device_type = "cpu";
                enable-method = "psci";
                next-level-cache = <&A53_L2>;
                clocks = <&scpi_dvfs 1>;
                cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
                sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
        };

        A53_2: cpu@102 {
                compatible = "arm,cortex-a53","arm,armv8";
                reg = <0x0 0x102>;
                device_type = "cpu";
                enable-method = "psci";
                next-level-cache = <&A53_L2>;
                clocks = <&scpi_dvfs 1>;
                cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
                sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
        };

        A53_3: cpu@103 {
                compatible = "arm,cortex-a53","arm,armv8";
                reg = <0x0 0x103>;
                device_type = "cpu";
                enable-method = "psci";
                next-level-cache = <&A53_L2>;
                clocks = <&scpi_dvfs 1>;
                cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
                sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
        };

        energy-costs {
                CPU_COST_0: core-cost0 {
                        busy-cost-data = <
                                417   168
                                579   251
                                744   359
                                883   479
                                1024  616
                        >;
                        idle-cost-data = <
                                15
                                0
                        >;
                };
                CPU_COST_1: core-cost1 {
                        busy-cost-data = <
                                235   33
                                302   46
                                368   61
                                406   76
                                447   93
                        >;
                        idle-cost-data = <
                                6
                                0
                        >;
                };
                CLUSTER_COST_0: cluster-cost0 {
                        busy-cost-data = <
                                417   24
                                579   32
                                744   43
                                883   49
                                1024  64
                        >;
                        idle-cost-data = <
                                65
                                24
                        >;
                };
                CLUSTER_COST_1: cluster-cost1 {
                        busy-cost-data = <
                                235  26
                                303  30
                                368  39
                                406  47
                                447  57
                        >;
                        idle-cost-data = <
                                56
                                17
                        >;
                };
        };
};

===============================================================================
[1] https://lkml.org/lkml/2015/5/12/728
[2] Documentation/devicetree/bindings/topology.txt
[3] Documentation/scheduler/sched-energy.txt

Documentation/scheduler/sched-energy.txt (new file, 362 lines)

@@ -0,0 +1,362 @@
Energy cost model for energy-aware scheduling (EXPERIMENTAL)

Introduction
=============

The basic energy model uses platform energy data stored in sched_group_energy
data structures attached to the sched_groups in the sched_domain hierarchy. The
energy cost model offers two functions that can be used to guide scheduling
decisions:

1. static unsigned int sched_group_energy(struct energy_env *eenv)
2. static int energy_diff(struct energy_env *eenv)

sched_group_energy() estimates the energy consumed by all cpus in a specific
sched_group including any shared resources owned exclusively by this group of
cpus. Resources shared with other cpus are excluded (e.g. later level caches).

energy_diff() estimates the total energy impact of a utilization change. That
is, adding, removing, or migrating utilization (tasks).

Both functions use a struct energy_env to specify the scenario to be evaluated:

        struct energy_env {
                struct sched_group *sg_top;
                struct sched_group *sg_cap;
                int cap_idx;
                int util_delta;
                int src_cpu;
                int dst_cpu;
                int energy;
        };

sg_top: sched_group to be evaluated. Not used by energy_diff().

sg_cap: sched_group covering the cpus in the same frequency domain. Set by
sched_group_energy().

cap_idx: Capacity state to be used for energy calculations. Set by
find_new_capacity().

util_delta: Amount of utilization to be added, removed, or migrated.

src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
-1 if no source (e.g. task wake-up).

dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
if utilization is removed (e.g. terminating tasks).

energy: Result of sched_group_energy().

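For illustration only (this is a sketch, not a quote from the kernel sources),
evaluating the impact of moving some utilization from one cpu to another could
fill in the structure as follows; util_to_move, prev_cpu and target_cpu are
placeholder variables:

        struct energy_env eenv = {
                .util_delta = util_to_move, /* utilization being moved */
                .src_cpu    = prev_cpu,     /* would be -1 for a pure wake-up */
                .dst_cpu    = target_cpu,   /* would be -1 for a pure removal */
        };
        int diff = energy_diff(&eenv); /* estimated energy impact of the change */

The remaining members (sg_cap, cap_idx, energy) are filled in internally as
described above, and sg_top is only needed when calling sched_group_energy()
directly.
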
The metric used to represent utilization is the actual per-entity running time
averaged over time using a geometric series. Very similar to the existing
per-entity load-tracking, but _not_ scaled by task priority and capped by the
capacity of the cpu. The latter property does mean that utilization may
underestimate the compute requirements for tasks on fully/over-utilized cpus.
The greatest potential for energy savings without affecting performance too much
is scenarios where the system isn't fully utilized. If the system is deemed
fully utilized, load-balancing should be done with task load (which includes
task priority) instead, in the interest of fairness and performance.


Background and Terminology
===========================

To make it clear from the start:

energy = [joule] (resource like a battery on powered devices)
power = energy/time = [joule/second] = [watt]

The goal of energy-aware scheduling is to minimize energy, while still getting
the job done. That is, we want to maximize:

        performance [inst/s]
        --------------------
            power [W]

which is equivalent to minimizing:

         energy [J]
        -----------
        instruction

while still getting 'good' performance. It is essentially an alternative
optimization objective to the current performance-only objective for the
scheduler. This alternative considers two objectives: energy-efficiency and
performance. Hence, there needs to be a user controllable knob to switch the
objective. Since it is early days, this is currently a sched_feature
(ENERGY_AWARE).

The idea behind introducing an energy cost model is to allow the scheduler to
evaluate the implications of its decisions rather than applying energy-saving
techniques blindly that may only have positive effects on some platforms. At
the same time, the energy cost model must be as simple as possible to minimize
the scheduler latency impact.

Platform topology
------------------

The system topology (cpus, caches, and NUMA information, not peripherals) is
represented in the scheduler by the sched_domain hierarchy which has
sched_groups attached at each level that covers one or more cpus (see
sched-domains.txt for more details). To add energy awareness to the scheduler
we need to consider power and frequency domains.

Power domain:

A power domain is a part of the system that can be powered on/off
independently. Power domains are typically organized in a hierarchy where you
may be able to power down just a cpu or a group of cpus along with any
associated resources (e.g. shared caches). Powering up a cpu means that all
power domains it is a part of in the hierarchy must be powered up. Hence, it is
more expensive to power up the first cpu that belongs to a higher level power
domain than powering up additional cpus in the same high level domain. Two
level power domain hierarchy example:

                Power source
                +-------------------------------+----...
per group PD    G                               G
                |           +----------+        |
                +---------+-| Shared   |   (other groups)
per-cpu PD      G         G | resource |
                |         | +----------+
            +-------+ +-------+
            | CPU 0 | | CPU 1 |
            +-------+ +-------+

Frequency domain:

Frequency domains (P-states) typically cover the same group of cpus as one of
the power domain levels. That is, there might be several smaller power domains
sharing the same frequency (P-state) or there might be a power domain spanning
multiple frequency domains.

From a scheduling point of view there is no need to know the actual frequencies
[Hz]. All the scheduler cares about is the compute capacity available at the
current state (P-state) the cpu is in and any other available states. For that
reason, and to also factor in any cpu micro-architecture differences, compute
capacity scaling states are called 'capacity states' in this document. For SMP
systems this is equivalent to P-states. For mixed micro-architecture systems
(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture
performance relative to the other cpus in the system.

Energy modelling:
------------------

Due to the hierarchical nature of the power domains, the most obvious way to
model energy costs is to associate power and energy costs with domains (groups
of cpus). Energy costs of shared resources are associated with the group of
cpus that share the resources; only the cost of powering the cpu itself and any
private resources (e.g. private L1 caches) is associated with the per-cpu
groups (lowest level).

For example, for an SMP system with per-cpu power domains and a cluster level
(group of cpus) power domain we get the overall energy costs to be:

        energy = energy_cluster + n * energy_cpu

where 'n' is the number of cpus powered up and energy_cluster is the cost paid
as soon as any cpu in the cluster is powered up.

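As a purely illustrative example (the numbers are made up, not measured), if
energy_cluster = 10 and energy_cpu = 5 for some time window, then:

        1 cpu busy:  energy = 10 + 1 * 5 = 15
        2 cpus busy: energy = 10 + 2 * 5 = 20

i.e. waking the first cpu in an idle cluster costs 15 units, while waking a
second cpu in the already-active cluster only adds 5.
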
The power and frequency domains can naturally be mapped onto the existing
sched_domain hierarchy and sched_groups by adding the necessary data to the
existing data structures.

The energy model considers energy consumption from two contributors (shown in
the illustration below):

1. Busy energy: Energy consumed while a cpu and the higher level groups that it
belongs to are busy running tasks. Busy energy is associated with the state of
the cpu, not an event. The time the cpu spends in this state varies. Thus, the
most obvious platform parameter for this contribution is busy power
(energy/time).

2. Idle energy: Energy consumed while a cpu and higher level groups that it
belongs to are idle (in a C-state). Like busy energy, idle energy is associated
with the state of the cpu. Thus, the platform parameter for this contribution
is idle power (energy/time).

Energy consumed during transitions from an idle-state (C-state) to a busy state
(P-state) or going the other way is ignored by the model to simplify the energy
model calculations.

   Power
   ^
   |            busy->idle             idle->busy
   |            transition             transition
   |
   |              _                      __
   |             / \                    /  \__________________
   |____________/   \                  /
   |                 \                /
   |  Busy            \     Idle     /          Busy
   |  low P-state      \____________/           high P-state
   |
   +------------------------------------------------------------> time

   Busy    |--------------|                 |-----------------|

   Wakeup                 |------|    |------|

   Idle                          |------------|


The basic algorithm
====================

The basic idea is to determine the total energy impact when utilization is
added or removed by estimating the impact at each level in the sched_domain
hierarchy starting from the bottom (sched_group contains just a single cpu).
The energy cost comes from busy time (sched_group is awake because one or more
cpus are busy) and idle time (in an idle-state). Energy model numbers account
for energy costs associated with all cpus in the sched_group as a group.

        for_each_domain(cpu, sd) {
                sg = sched_group_of(cpu)
                energy_before = curr_util(sg) * busy_power(sg)
                                + (1-curr_util(sg)) * idle_power(sg)
                energy_after = new_util(sg) * busy_power(sg)
                                + (1-new_util(sg)) * idle_power(sg)
                energy_diff += energy_before - energy_after

        }

        return energy_diff

{curr, new}_util: The cpu utilization at the lowest level and the overall
non-idle time for the entire group for higher levels. Utilization is in the
range 0.0 to 1.0 in the pseudo-code.

busy_power: The power consumption of the sched_group.

idle_power: The power consumption of the sched_group when idle.

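To make one iteration of the loop concrete, assume (made-up numbers) a
single-cpu sched_group with busy_power = 400, idle_power = 50, and a change
that raises the group's utilization from curr_util = 0.25 to new_util = 0.50:

        energy_before = 0.25 * 400 + 0.75 * 50 = 137.5
        energy_after  = 0.50 * 400 + 0.50 * 50 = 225.0

The contribution of this group to energy_diff is therefore 137.5 - 225.0 =
-87.5, i.e. under the convention of the pseudo-code above the added
utilization costs energy at this level. The same evaluation is repeated for
each higher level group the cpu belongs to.
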
Note: It is a fundamental assumption that the utilization is (roughly) scale
invariant. Task utilization tracking factors in any frequency scaling and
performance scaling differences due to different cpu microarchitectures such
that task utilization can be used across the entire system.


Platform energy data
=====================

struct sched_group_energy can be attached to sched_groups in the sched_domain
hierarchy and has the following members:

cap_states:
        List of struct capacity_state representing the supported capacity
        states (P-states). struct capacity_state has two members: cap and
        power, which represent the compute capacity and the busy power of the
        state. The list must be ordered by capacity low->high.

nr_cap_states:
        Number of capacity states in the cap_states list.

idle_states:
        List of struct idle_state containing idle_state power cost for each
        idle-state supported by the system ordered by shallowest state first.
        All states must be included at all levels in the hierarchy, i.e. a
        sched_group spanning just a single cpu must also include coupled
        idle-states (cluster states). In addition to the cpuidle idle-states,
        the list must also contain an entry for idling using the arch
        default idle (arch_cpu_idle()). Although this state may not be a true
        hardware idle-state it is considered the shallowest idle-state in the
        energy model and must be the first entry. cpus may enter this state
        (possibly 'active idling') if cpuidle decides not to enter a cpuidle
        idle-state. Default idle may not be used when cpuidle is enabled.
        In this case, it should just be a copy of the first cpuidle idle-state.

nr_idle_states:
        Number of idle states in the idle_states list.

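As a sketch of how this data can be expressed in C (the values and names below
are made up for illustration; only the structure follows the member list
above):

        static struct idle_state example_idle_states[] = {
                { .power = 5 },  /* arch default idle, shallowest, first entry */
                { .power = 5 },  /* WFI */
                { .power = 0 },  /* deepest (coupled cluster) state */
        };

        static struct capacity_state example_cap_states[] = {
                { .cap =  430, .power = 100 },  /* ordered by capacity low->high */
                { .cap = 1024, .power = 616 },
        };

        static struct sched_group_energy example_energy = {
                .nr_idle_states = ARRAY_SIZE(example_idle_states),
                .idle_states    = example_idle_states,
                .nr_cap_states  = ARRAY_SIZE(example_cap_states),
                .cap_states     = example_cap_states,
        };
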
There are no unit requirements for the energy cost data. Data can be normalized
with any reference, however, the normalization must be consistent across all
energy cost data. That is, one bogo-joule/watt must be the same quantity for
all data, but we don't care what it is.

A recipe for platform characterization
=======================================

Obtaining the actual model data for a particular platform requires some way of
measuring power/energy. There isn't a tool to help with this (yet). This
section provides a recipe for use as reference. It covers the steps used to
characterize the ARM TC2 development platform. This sort of measurement is
expected to be done anyway when tuning cpuidle and cpufreq for a given
platform.

The energy model needs two types of data (struct sched_group_energy holds
these) for each sched_group where energy costs should be taken into account:

1. Capacity state information

A list containing the compute capacity and power consumption when fully
utilized attributed to the group as a whole for each available capacity state.
At the lowest level (group contains just a single cpu) this is the power of the
cpu alone without including power consumed by resources shared with other cpus.
It basically needs to fit the basic modelling approach described in the
"Background and Terminology" section:

        energy_system = energy_shared + n * energy_cpu

for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at
the lowest level. 'energy_shared' is included at the next level which
represents the group of cpus among which the resources are shared.

This model is, of course, a simplification of reality. Thus, power/energy
attributions might not always exactly represent how the hardware is designed.
Also, busy power is likely to depend on the workload. It is therefore
recommended to use a representative mix of workloads when characterizing the
capacity states.

If the group has no capacity scaling support, the list will contain a single
state where power is the busy power attributed to the group. The capacity
should be set to a default value (1024).

When frequency domains include multiple power domains, the group representing
the frequency domain and all child groups share capacity states. This must be
indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
all levels that share the capacity state must have the list of capacity states
with the power set to the contribution of the individual group.

2. Idle power information

Stored in the idle_states list. The power number is the group idle power
consumption in each idle state as well as when the group is idle but has not
entered an idle-state ('active idle' as mentioned earlier). Due to the way the
energy model is defined, the idle power of the deepest group idle state can
alternatively be accounted for in the parent group busy power. In that case the
group idle state power values are offset such that the idle power of the
deepest state is zero. It is less intuitive, but it is easier to measure as
idle power consumed by the group and the busy/idle power of the parent group
cannot be distinguished without per group measurement points.

Measuring capacity states and idle power:

The capacity states' capacity and power can be estimated by running a benchmark
workload at each available capacity state. By restricting the benchmark to run
on subsets of cpus it is possible to extrapolate the power consumption of
shared resources.

ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a
shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
benchmark workload on just one cpu in a cluster means that power is consumed in
the cluster (higher level group) and a single cpu (lowest level group). Adding
another benchmark task to another cpu increases the power consumption by the
amount consumed by the additional cpu. Hence, it is possible to extrapolate the
cluster busy power.

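In other words, if P(1) and P(2) denote the measured cluster-level power with
one and two cpus running the benchmark at the same capacity state (notation
introduced here purely for illustration), then:

        energy_cpu     = P(2) - P(1)
        energy_cluster = P(1) - energy_cpu = 2 * P(1) - P(2)

which gives the per-cpu busy power and the cluster-only busy power for that
capacity state.
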
For platforms that don't have energy counters or equivalent instrumentation
built-in, it may be possible to use an external DAQ to acquire similar data.

If the benchmark includes some performance score (for example the sysbench cpu
benchmark), this can be used to record the compute capacity.

Measuring idle power requires insight into the idle state implementation on the
particular platform. Specifically, whether the platform has coupled idle-states
(or package states). To measure non-coupled per-cpu idle-states it is necessary
to keep one cpu busy to keep any shared resources alive, to isolate the idle
power of the cpu from the idle/busy power of the shared resources. The cpu can
be tricked into different per-cpu idle states by disabling the other states.
Based on various combinations of measurements with specific cpus busy and
disabling idle-states it is possible to extrapolate the idle-state power.

Documentation/scheduler/sched-tune.txt (new file, 366 lines)

@@ -0,0 +1,366 @@
Central, scheduler-driven, power-performance control
(EXPERIMENTAL)

Abstract
========

The topic of a single simple power-performance tunable, that is wholly
scheduler centric, and has well defined and predictable properties has come up
on several occasions in the past [1,2]. With techniques such as scheduler
driven DVFS [3], we now have a good framework for implementing such a tunable.
This document describes the overall ideas behind its design and implementation.


Table of Contents
=================

1. Motivation
2. Introduction
3. Signal Boosting Strategy
4. OPP selection using boosted CPU utilization
5. Per task group boosting
6. Questions and Answers
   - What about "auto" mode?
   - What about boosting on a congested system?
   - How are CPUs boosted when we have tasks with multiple boost values?
7. References


1. Motivation
=============

Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
scheduler to select the optimal DVFS operating point (OPP) for running a task
allocated to a CPU. The introduction of sched-DVFS enables running workloads at
the most energy efficient OPPs.

However, sometimes it may be desired to intentionally boost the performance of
a workload even if that could imply a reasonable increase in energy
consumption. For example, in order to reduce the response time of a task, we
may want to run the task at a higher OPP than the one that is actually required
by its CPU bandwidth demand.

This last requirement is especially important if we consider that one of the
main goals of the sched-DVFS component is to replace all currently available
CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
driven governors we currently have, it is already more responsive at selecting
the optimal OPP to run tasks allocated to a CPU. However, just tracking the
actual task load demand may not be enough from a performance standpoint. For
example, it is not possible to get behaviors similar to those provided by the
"performance" and "interactive" CPUFreq governors.

This document describes an implementation of a tunable, stacked on top of
sched-DVFS, which extends its functionality to support task performance
boosting.

By "performance boosting" we mean the reduction of the time required to
complete a task activation, i.e. the time elapsed from a task wakeup to its
next deactivation (e.g. because it goes back to sleep or it terminates). For
example, if we consider a simple periodic task which executes the same workload
for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
that task must complete each of its activations in less than 5[s].

A previous attempt [5] to introduce such a boosting feature has not been
successful mainly because of the complexity of the proposed solution. The
approach described in this document exposes a single simple interface to
user-space. This single tunable knob allows the tuning of system wide
scheduler behaviours ranging from energy efficiency at one end through to
incremental performance boosting at the other end. This first tunable affects
all tasks. However, a more advanced extension of the concept is also provided
which uses CGroups to boost the performance of only selected tasks while using
the energy efficient default for all others.

The rest of this document introduces in more detail the proposed solution,
which has been named SchedTune.


2. Introduction
===============

SchedTune exposes a simple user-space interface with a single power-performance
tunable:

        /proc/sys/kernel/sched_cfs_boost

This permits expressing a boost value as an integer in the range [0..100].

A value of 0 (default) configures the CFS scheduler for maximum energy
efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
required to satisfy their workload demand.
A value of 100 configures the scheduler for maximum performance, which
translates to the selection of the maximum OPP on that CPU.

The range between 0 and 100 can be set to satisfy other scenarios suitably. For
example to satisfy interactive response or depending on other system events
(battery level etc).

A CGroup based extension is also provided, which permits further user-space
defined task classification to tune the scheduler for different goals depending
on the specific nature of the task, e.g. background vs interactive vs
low-priority.

The overall design of the SchedTune module is built on top of "Per-Entity Load
Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating
Performance Point (OPP) selection.
Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
the operating frequency of that CPU to better match the workload demand. The
selection of the actual OPP being activated is influenced by the global boost
value, or the boost value for the task CGroup when in use.

This simple biasing approach leverages existing frameworks, which means minimal
modifications to the scheduler, and yet it allows a range of different
behaviours to be achieved, all from a single simple tunable knob.
The only new concept introduced is that of signal boosting.


3. Signal Boosting Strategy
===========================

The whole PELT machinery works based on the value of a few load tracking signals
which basically track the CPU bandwidth requirements for tasks and the capacity
of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
some of these load tracking signals to make a task or RQ appear more demanding
than it actually is.

Which signals have to be inflated depends on the specific "consumer". However,
independently of the specific (signal, consumer) pair, it is important to
define a simple and possibly consistent strategy for the concept of boosting a
signal.

A boosting strategy defines how the "abstract" user-space defined
sched_cfs_boost value is translated into an internal "margin" value to be added
to a signal to get its inflated value:

        margin         := boosting_strategy(sched_cfs_boost, signal)
        boosted_signal := signal + margin

Different boosting strategies were identified and analyzed before selecting the
one found to be most effective.

Signal Proportional Compensation (SPC)
--------------------------------------

In this boosting strategy the sched_cfs_boost value is used to compute a
margin which is proportional to the complement of the original signal.
When a signal has a maximum possible value, its complement is defined as
the delta between the actual value and its possible maximum.

Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
the maximum possible value, the margin becomes:

        margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)

Using this boosting strategy:
- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
- each value in the range of sched_cfs_boost effectively inflates the signal in
  question by a quantity which is proportional to the maximum value.

For example, by applying the SPC boosting strategy to the selection of the OPP
to run a task it is possible to achieve these behaviors:

- 0% boosting:   run the task at the minimum OPP required by its workload
- 100% boosting: run the task at the maximum OPP available for the CPU
- 50% boosting:  run at the half-way OPP between minimum and maximum

Which means that, at 50% boosting, a task will be scheduled to run at half of
the maximum theoretically achievable performance on the specific target
platform.

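A minimal sketch of this computation in C could look as follows; the helper
name spc_boost() and the explicit division by 100 (reading sched_cfs_boost as
a percentage) are illustrative assumptions rather than the kernel's actual
implementation:

        static unsigned long spc_boost(unsigned long signal, unsigned int boost_pct)
        {
                /* complement of the signal w.r.t. its maximum value */
                unsigned long margin = SCHED_LOAD_SCALE - signal;

                /* scale the complement by the boost percentage */
                margin *= boost_pct;
                margin /= 100;

                return signal + margin; /* boosted_signal */
        }

With this reading, a 50% boost moves a signal half-way towards
SCHED_LOAD_SCALE, which matches the 50% OPP behaviour described above.
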
A graphical representation of an SPC boosted signal is represented in the
following figure where:
 a) "-" represents the original signal
 b) "b" represents a 50% boosted signal
 c) "p" represents a 100% boosted signal

   ^
   |  SCHED_LOAD_SCALE
   +-----------------------------------------------------------------+
   |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
   |
   |                                             boosted_signal
   |                                          bbbbbbbbbbbbbbbbbbbbbbbb
   |
   |                                            original signal
   |                  bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
   |                                          |
   |bbbbbbbbbbbbbbbbbb                        |
   |                                          |
   |                                          |
   |                                          |
   |                  +-----------------------+
   |                  |
   |                  |
   |                  |
   |------------------+
   |
   |
   +----------------------------------------------------------------------->

The plot above shows a ramped load signal (titled 'original signal') and its
boosted equivalent. For each step of the original signal the boosted signal
corresponding to a 50% boost is midway between the original signal and the
upper bound. Boosting by 100% generates a boosted signal which is always
saturated to the upper bound.


4. OPP selection using boosted CPU utilization
==============================================

It is worth calling out that the implementation does not introduce any new load
signals. Instead, it provides an API to tune existing signals. This tuning is
done on demand and only in scheduler code paths where it is sensible to do so.
The new API calls are defined to return either the default signal or a boosted
one, depending on the value of sched_cfs_boost. This is a clean and
non-invasive modification of the existing code paths.

The signal representing a CPU's utilization is boosted according to the
previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
(i.e. CFS run-queue) to appear more used than it actually is.

Thus, with sched_cfs_boost enabled we have the following main functions to
get the current utilization of a CPU:

        cpu_util()
        boosted_cpu_util()

The new boosted_cpu_util() is similar to the first but returns a boosted
utilization signal which is a function of the sched_cfs_boost value.

This function is used in the CFS scheduler code paths where sched-DVFS needs to
decide the OPP to run a CPU at.
For example, this allows selecting the highest OPP for a CPU which has
the boost value set to 100%.

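Expressed as a sketch (again illustrative only, not the kernel's code),
boosted_cpu_util() can be thought of as cpu_util() inflated by the SPC margin
from the previous section; spc_boost() is the hypothetical helper sketched
there and 'boost' stands in for the currently applicable sched_cfs_boost
value:

        static unsigned long boosted_cpu_util(int cpu, unsigned int boost)
        {
                return spc_boost(cpu_util(cpu), boost);
        }

sched-DVFS then bases its OPP choice on this inflated utilization, which is
why a 100% boost ends up selecting the maximum OPP.
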
5. Per task group boosting
==========================

The availability of a single knob which is used to boost all tasks in the
system is certainly a simple solution but it quite likely doesn't fit many
utilization scenarios, especially in the mobile device space.

For example, on battery powered devices there usually are many background
services which are long running and need energy efficient scheduling. On the
other hand, some applications are more performance sensitive and require an
interactive response and/or maximum performance, regardless of the energy cost.
To better service such scenarios, the SchedTune implementation has an extension
that provides a more fine grained boosting interface.

A new CGroup controller, namely "schedtune", can be enabled, which allows task
groups with different boosting values to be defined and configured.
Tasks that require special performance can be put into separate CGroups.
The value of the boost associated with the tasks in this group can be specified
using a single knob exposed by the CGroup controller:

        schedtune.boost

This knob allows the definition of a boost value that is to be used for
SPC boosting of all tasks attached to this group.

The current schedtune controller implementation is really simple and has these
main characteristics:

1) It is only possible to create 1 level depth hierarchies

   The root control group defines the system-wide boost value to be applied
   by default to all tasks. Its direct subgroups are named "boost groups" and
   they define the boost value for a specific set of tasks.
   Further nested subgroups are not allowed since they do not have a sensible
   meaning from a user-space standpoint.

2) It is possible to define only a limited number of "boost groups"

   This number is defined at compile time and by default configured to 16.
   This is a design decision motivated by two main reasons:
   a) In a real system we do not expect utilization scenarios with more than a
      few boost groups. For example, a reasonable collection of groups could
      be just "background", "interactive" and "performance".
   b) It simplifies the implementation considerably, especially for the code
      which has to compute the per CPU boosting once there are multiple
      RUNNABLE tasks with different boost values.

Such a simple design should allow servicing the main utilization scenarios
identified so far. It provides a simple interface which can be used to manage
the power-performance of all tasks or only selected tasks.
Moreover, this interface can be easily integrated by user-space run-times (e.g.
Android, ChromeOS) to implement a QoS solution for task boosting based on task
classification, which has been a long standing requirement.

Setup and usage
---------------

0. Use a kernel with CGROUP_SCHEDTUNE support enabled

1. Check that the "schedtune" CGroup controller is available:

   root@linaro-nano:~# cat /proc/cgroups
   #subsys_name	hierarchy	num_cgroups	enabled
   cpuset		0		1		1
   cpu		0		1		1
   schedtune	0		1		1

2. Mount a tmpfs to create the CGroups mount point (Optional)

   root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup

3. Mount the "schedtune" controller

   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
   root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune

4. Setup the system-wide boost value (Optional)

   If not configured the root control group has a 0% boost value, which
   basically disables boosting for all tasks in the system thus running in
   an energy-efficient mode.

   root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost

5. Create task groups and configure their specific boost value (Optional)

   For example here we create a "performance" boost group configured to boost
   all its tasks to 100%.

   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
   root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost

6. Move tasks into the boost group

   For example, the following moves the tasks with PID $TASKPID (and all its
   threads) into the "performance" boost group.

   root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs

This simple configuration allows only the threads of the $TASKPID task to run,
when needed, at the highest OPP on the most capable CPU of the system.


6. Questions and Answers
========================

What about "auto" mode?
-----------------------

The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
with some suitable user-space element. This element could use the exposed
system-wide or cgroup based interface.

How are multiple groups of tasks with different boost values managed?
---------------------------------------------------------------------

The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization
is boosted with a value which is the maximum of the boost values of the
currently RUNNABLE tasks in its RQ.

This allows sched-DVFS to boost a CPU only while there are boosted tasks ready
to run and switch back to the energy efficient mode as soon as the last boosted
task is dequeued.


7. References
=============
[1] http://lwn.net/Articles/552889
[2] http://lkml.org/lkml/2012/5/18/91
[3] http://lkml.org/lkml/2015/6/26/620

@@ -24,6 +24,13 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#ifdef CONFIG_CPU_FREQ
+#include <linux/cpufreq.h>
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
+
 #else
 
 static inline void init_cpu_topology(void) { }

@@ -42,9 +42,15 @@
  */
 static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+#if CONFIG_CPU_FREQ
+        unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+        return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
         return per_cpu(cpu_scale, cpu);
+#endif
 }
 
 static void set_capacity_scale(unsigned int cpu, unsigned long capacity)

@@ -153,6 +159,8 @@ static void __init parse_dt_topology(void)
 
 }
 
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
 /*
  * Look for a customed capacity of a CPU in the cpu_capacity table during the
  * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the

@@ -160,10 +168,14 @@
  */
 static void update_cpu_capacity(unsigned int cpu)
 {
-        if (!cpu_capacity(cpu))
-                return;
+        unsigned long capacity = SCHED_CAPACITY_SCALE;
 
-        set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+        if (cpu_core_energy(cpu)) {
+                int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+                capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+        }
+
+        set_capacity_scale(cpu, capacity);
 
         pr_info("CPU%u: update cpu_capacity %lu\n",
                 cpu, arch_scale_cpu_capacity(NULL, cpu));

@ -275,17 +287,138 @@ void store_cpu_topology(unsigned int cpuid)
|
|||
cpu_topology[cpuid].socket_id, mpidr);
|
||||
}
|
||||
|
||||
/*
|
||||
* ARM TC2 specific energy cost model data. There are no unit requirements for
|
||||
* the data. Data can be normalized to any reference point, but the
|
||||
* normalization must be consistent. That is, one bogo-joule/watt must be the
|
||||
* same quantity for all data, but we don't care what it is.
|
||||
*/
|
||||
static struct idle_state idle_states_cluster_a7[] = {
|
||||
{ .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */
|
||||
{ .power = 25 }, /* WFI */
|
||||
{ .power = 10 }, /* cluster-sleep-l */
|
||||
};
|
||||
|
||||
static struct idle_state idle_states_cluster_a15[] = {
|
||||
{ .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */
|
||||
{ .power = 70 }, /* WFI */
|
||||
{ .power = 25 }, /* cluster-sleep-b */
|
||||
};
|
||||
|
||||
static struct capacity_state cap_states_cluster_a7[] = {
|
||||
/* Cluster only power */
|
||||
{ .cap = 150, .power = 2967, }, /* 350 MHz */
|
||||
{ .cap = 172, .power = 2792, }, /* 400 MHz */
|
||||
{ .cap = 215, .power = 2810, }, /* 500 MHz */
|
||||
{ .cap = 258, .power = 2815, }, /* 600 MHz */
|
||||
{ .cap = 301, .power = 2919, }, /* 700 MHz */
|
||||
{ .cap = 344, .power = 2847, }, /* 800 MHz */
|
||||
{ .cap = 387, .power = 3917, }, /* 900 MHz */
|
||||
{ .cap = 430, .power = 4905, }, /* 1000 MHz */
|
||||
};
|
||||
|
||||
static struct capacity_state cap_states_cluster_a15[] = {
|
||||
/* Cluster only power */
|
||||
{ .cap = 426, .power = 7920, }, /* 500 MHz */
|
||||
{ .cap = 512, .power = 8165, }, /* 600 MHz */
|
||||
{ .cap = 597, .power = 8172, }, /* 700 MHz */
|
||||
{ .cap = 682, .power = 8195, }, /* 800 MHz */
|
||||
{ .cap = 768, .power = 8265, }, /* 900 MHz */
|
||||
{ .cap = 853, .power = 8446, }, /* 1000 MHz */
|
||||
	{ .cap =  938, .power = 11426, }, /* 1100 MHz */
	{ .cap = 1024, .power = 15200, }, /* 1200 MHz */
};

static struct sched_group_energy energy_cluster_a7 = {
	.nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7),
	.idle_states    = idle_states_cluster_a7,
	.nr_cap_states  = ARRAY_SIZE(cap_states_cluster_a7),
	.cap_states     = cap_states_cluster_a7,
};

static struct sched_group_energy energy_cluster_a15 = {
	.nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15),
	.idle_states    = idle_states_cluster_a15,
	.nr_cap_states  = ARRAY_SIZE(cap_states_cluster_a15),
	.cap_states     = cap_states_cluster_a15,
};

static struct idle_state idle_states_core_a7[] = {
	{ .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
	{ .power = 0 }, /* WFI */
	{ .power = 0 }, /* cluster-sleep-l */
};

static struct idle_state idle_states_core_a15[] = {
	{ .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
	{ .power = 0 }, /* WFI */
	{ .power = 0 }, /* cluster-sleep-b */
};

static struct capacity_state cap_states_core_a7[] = {
	/* Power per cpu */
	{ .cap =  150, .power =  187, }, /*  350 MHz */
	{ .cap =  172, .power =  275, }, /*  400 MHz */
	{ .cap =  215, .power =  334, }, /*  500 MHz */
	{ .cap =  258, .power =  407, }, /*  600 MHz */
	{ .cap =  301, .power =  447, }, /*  700 MHz */
	{ .cap =  344, .power =  549, }, /*  800 MHz */
	{ .cap =  387, .power =  761, }, /*  900 MHz */
	{ .cap =  430, .power = 1024, }, /* 1000 MHz */
};

static struct capacity_state cap_states_core_a15[] = {
	/* Power per cpu */
	{ .cap =  426, .power = 2021, }, /*  500 MHz */
	{ .cap =  512, .power = 2312, }, /*  600 MHz */
	{ .cap =  597, .power = 2756, }, /*  700 MHz */
	{ .cap =  682, .power = 3125, }, /*  800 MHz */
	{ .cap =  768, .power = 3524, }, /*  900 MHz */
	{ .cap =  853, .power = 3846, }, /* 1000 MHz */
	{ .cap =  938, .power = 5177, }, /* 1100 MHz */
	{ .cap = 1024, .power = 6997, }, /* 1200 MHz */
};

static struct sched_group_energy energy_core_a7 = {
	.nr_idle_states = ARRAY_SIZE(idle_states_core_a7),
	.idle_states    = idle_states_core_a7,
	.nr_cap_states  = ARRAY_SIZE(cap_states_core_a7),
	.cap_states     = cap_states_core_a7,
};

static struct sched_group_energy energy_core_a15 = {
	.nr_idle_states = ARRAY_SIZE(idle_states_core_a15),
	.idle_states    = idle_states_core_a15,
	.nr_cap_states  = ARRAY_SIZE(cap_states_core_a15),
	.cap_states     = cap_states_core_a15,
};

/* sd energy functions */
static inline
const struct sched_group_energy * const cpu_cluster_energy(int cpu)
{
	return cpu_topology[cpu].socket_id ? &energy_cluster_a7 :
			&energy_cluster_a15;
}

static inline
const struct sched_group_energy * const cpu_core_energy(int cpu)
{
	return cpu_topology[cpu].socket_id ? &energy_core_a7 :
			&energy_core_a15;
}

static inline int cpu_corepower_flags(void)
{
	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
	       SD_SHARE_CAP_STATES;
}

static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
	{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
	{ NULL, },
};
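The table above is what the scheduler consumes at domain-build time: each level's energy callback hands back the sched_group_energy describing that CPU at that level. The function below is a minimal sketch, not the in-tree domain construction code; sketch_dump_energy_model() is a made-up name used only to show how the callbacks wired into arm_topology[] could be walked for one CPU.

static void sketch_dump_energy_model(int cpu)
{
	struct sched_domain_topology_level *tl;
	int level = 0;

	for (tl = arm_topology; tl->mask; tl++, level++) {
		const struct sched_group_energy *sge;

		if (!tl->energy)
			continue;	/* this level carries no energy data */

		sge = tl->energy(cpu);
		if (!sge || !sge->nr_cap_states)
			continue;

		/* highest capacity state is the last array entry */
		pr_info("cpu%d level%d: max cap=%lu busy power=%lu, %u idle states\n",
			cpu, level,
			sge->cap_states[sge->nr_cap_states - 1].cap,
			sge->cap_states[sge->nr_cap_states - 1].power,
			sge->nr_idle_states);
	}
}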
|
||||
|
||||
|
|
|
@@ -22,6 +22,15 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);

struct sched_domain;
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);

#include <asm-generic/topology.h>

#endif /* _ASM_ARM_TOPOLOGY_H */
|
||||
|
|
|
@@ -19,10 +19,30 @@
#include <linux/nodemask.h>
#include <linux/of.h>
#include <linux/sched.h>
#include <linux/sched_energy.h>

#include <asm/cputype.h>
#include <asm/topology.h>

static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;

unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
#ifdef CONFIG_CPU_FREQ
	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);

	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
#else
	return per_cpu(cpu_scale, cpu);
#endif
}

static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
{
	per_cpu(cpu_scale, cpu) = capacity;
}
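A short worked example of scale_cpu_capacity(); the numbers are taken from the example A7 tables earlier in this commit and combined here purely for illustration.

/*
 * Worked example (illustrative values only):
 *
 *   cpu_scale      = 430   (micro-architectural capacity of an A7 core)
 *   max_freq_scale = 1024  (policy->max == cpuinfo->max_freq)
 *   scale_cpu_capacity() = 430 * 1024 >> 10 = 430
 *
 * If a thermal cap halves policy->max, cpufreq publishes
 * max_freq_scale = 512 and the same CPU is then seen as:
 *
 *   scale_cpu_capacity() = 430 * 512 >> 10 = 215
 */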
|
||||
|
||||
static int __init get_cpu_for_node(struct device_node *node)
|
||||
{
|
||||
struct device_node *cpu_node;
|
||||
|
@@ -206,11 +226,67 @@ out:
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);

/* sd energy functions */
static inline
const struct sched_group_energy * const cpu_cluster_energy(int cpu)
{
	struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];

	if (!sge) {
		pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
		return NULL;
	}

	return sge;
}

static inline
const struct sched_group_energy * const cpu_core_energy(int cpu)
{
	struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0];

	if (!sge) {
		pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
		return NULL;
	}

	return sge;
}

const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return &cpu_topology[cpu].core_sibling;
}

static inline int cpu_corepower_flags(void)
{
	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
	       SD_SHARE_CAP_STATES;
}

static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void update_cpu_capacity(unsigned int cpu)
{
	unsigned long capacity = SCHED_CAPACITY_SCALE;

	if (cpu_core_energy(cpu)) {
		int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
		capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
	}

	set_capacity_scale(cpu, capacity);

	pr_info("CPU%d: update cpu_capacity %lu\n",
		cpu, arch_scale_cpu_capacity(NULL, cpu));
}
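For illustration only: on a hypothetical big.LITTLE device tree whose little cores' highest capacity state parses to { cap = 447 } and whose big cores' parses to { cap = 1024 } (numbers invented for this example), the hook above would report, via the pr_info() shown, something like:

/*
 *   CPU0: update cpu_capacity 447
 *   CPU4: update cpu_capacity 1024
 *
 * i.e. each CPU's scheduler capacity is simply the 'cap' value of the
 * highest capacity state parsed for that CPU.
 */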
|
||||
|
||||
static void update_siblings_masks(unsigned int cpuid)
|
||||
{
|
||||
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
|
||||
|
@ -272,6 +348,7 @@ void store_cpu_topology(unsigned int cpuid)
|
|||
|
||||
topology_populated:
|
||||
update_siblings_masks(cpuid);
|
||||
update_cpu_capacity(cpuid);
|
||||
}
|
||||
|
||||
static void __init reset_cpu_topology(void)
|
||||
|
@ -302,4 +379,8 @@ void __init init_cpu_topology(void)
|
|||
*/
|
||||
if (of_have_populated_dt() && parse_dt_topology())
|
||||
reset_cpu_topology();
|
||||
else
|
||||
set_sched_topology(arm64_topology);
|
||||
|
||||
init_sched_energy_costs();
|
||||
}
|
||||
|
|
|
@ -112,6 +112,14 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE
|
|||
loading your cpufreq low-level hardware driver, using the
|
||||
'interactive' governor for latency-sensitive workloads.
|
||||
|
||||
config CPU_FREQ_DEFAULT_GOV_SCHED
|
||||
bool "sched"
|
||||
select CPU_FREQ_GOV_SCHED
|
||||
help
|
||||
Use the CPUfreq governor 'sched' as default. This scales
|
||||
cpu frequency using CPU utilization estimates from the
|
||||
scheduler.
|
||||
|
||||
endchoice
|
||||
|
||||
config CPU_FREQ_GOV_PERFORMANCE
|
||||
|
@ -207,6 +215,19 @@ config CPU_FREQ_GOV_CONSERVATIVE
|
|||
|
||||
If in doubt, say N.
|
||||
|
||||
config CPU_FREQ_GOV_SCHED
|
||||
bool "'sched' cpufreq governor"
|
||||
depends on CPU_FREQ
|
||||
depends on SMP
|
||||
select CPU_FREQ_GOV_COMMON
|
||||
help
|
||||
'sched' - this governor scales cpu frequency from the
|
||||
scheduler as a function of cpu capacity utilization. It does
|
||||
not evaluate utilization on a periodic basis (as ondemand
|
||||
does) but instead is event-driven by the scheduler.
|
||||
|
||||
If in doubt, say N.
|
||||
|
||||
comment "CPU frequency scaling drivers"
|
||||
|
||||
config CPUFREQ_DT
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include <linux/suspend.h>
|
||||
#include <linux/syscore_ops.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/sched.h>
|
||||
#include <trace/events/power.h>
|
||||
|
||||
static LIST_HEAD(cpufreq_policy_list);
|
||||
|
@ -154,6 +155,12 @@ bool have_governor_per_policy(void)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(have_governor_per_policy);
|
||||
|
||||
bool cpufreq_driver_is_slow(void)
|
||||
{
|
||||
return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow);
|
||||
|
||||
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
|
||||
{
|
||||
if (have_governor_per_policy())
|
||||
|
@ -347,6 +354,50 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
|
|||
#endif
|
||||
}
|
||||
|
||||
/*********************************************************************
 *               FREQUENCY INVARIANT CPU CAPACITY                    *
 *********************************************************************/

static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;

static void
scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs)
{
	unsigned long cur = freqs ? freqs->new : policy->cur;
	unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max;
	struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo;
	int cpu;

	pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n",
		 cpumask_pr_args(policy->cpus), cur, policy->max, scale);

	for_each_cpu(cpu, policy->cpus)
		per_cpu(freq_scale, cpu) = scale;

	if (freqs)
		return;

	scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq;

	pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n",
		 cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq,
		 scale);

	for_each_cpu(cpu, policy->cpus)
		per_cpu(max_freq_scale, cpu) = scale;
}

unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
	return per_cpu(freq_scale, cpu);
}

unsigned long cpufreq_scale_max_freq_capacity(int cpu)
{
	return per_cpu(max_freq_scale, cpu);
}
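The per-cpu freq_scale published above is what makes utilization tracking frequency invariant once an arch maps arch_scale_freq_capacity onto cpufreq_scale_freq_capacity (as the arm topology.h hunk earlier does). The helper below is only a sketch of that consumption, not the PELT code itself: at 600 MHz on a 1200 MHz CPU, freq_scale = (600000 << 10) / 1200000 = 512, so 10 ms of busy time counts as roughly 5 ms of work at full speed.

static inline u64 sketch_freq_invariant_delta(u64 delta_ns, int cpu)
{
	/* sd argument is unused by the cpufreq implementation above */
	return (delta_ns * cpufreq_scale_freq_capacity(NULL, cpu))
			>> SCHED_CAPACITY_SHIFT;
}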
|
||||
|
||||
static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
|
||||
struct cpufreq_freqs *freqs, unsigned int state)
|
||||
{
|
||||
|
@ -423,6 +474,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy,
|
|||
void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
|
||||
struct cpufreq_freqs *freqs)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* Catch double invocations of _begin() which lead to self-deadlock.
|
||||
|
@ -450,6 +502,10 @@ wait:
|
|||
|
||||
spin_unlock(&policy->transition_lock);
|
||||
|
||||
scale_freq_capacity(policy, freqs);
|
||||
for_each_cpu(cpu, policy->cpus)
|
||||
trace_cpu_capacity(capacity_curr_of(cpu), cpu);
|
||||
|
||||
cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
|
||||
|
@ -2126,6 +2182,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
|
|||
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
|
||||
CPUFREQ_NOTIFY, new_policy);
|
||||
|
||||
scale_freq_capacity(new_policy, NULL);
|
||||
|
||||
policy->min = new_policy->min;
|
||||
policy->max = new_policy->max;
|
||||
trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);
|
||||
|
|
|
@ -192,7 +192,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
|||
}
|
||||
|
||||
/* Take note of the planned idle state. */
|
||||
sched_idle_set_state(target_state);
|
||||
sched_idle_set_state(target_state, index);
|
||||
|
||||
trace_cpu_idle_rcuidle(index, dev->cpu);
|
||||
time_start = ktime_get();
|
||||
|
@ -205,7 +205,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
|||
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
|
||||
|
||||
/* The cpu is no longer idle or about to enter idle. */
|
||||
sched_idle_set_state(NULL);
|
||||
sched_idle_set_state(NULL, -1);
|
||||
|
||||
if (broadcast) {
|
||||
if (WARN_ON_ONCE(!irqs_disabled()))
|
||||
|
|
|
@ -26,6 +26,10 @@ SUBSYS(cpu)
|
|||
SUBSYS(cpuacct)
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE)
|
||||
SUBSYS(schedtune)
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_BLK_CGROUP)
|
||||
SUBSYS(io)
|
||||
#endif
|
||||
|
|
|
@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
|
|||
int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
|
||||
int cpufreq_update_policy(unsigned int cpu);
|
||||
bool have_governor_per_policy(void);
|
||||
bool cpufreq_driver_is_slow(void);
|
||||
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
|
||||
#else
|
||||
static inline unsigned int cpufreq_get(unsigned int cpu)
|
||||
|
@ -317,6 +318,14 @@ struct cpufreq_driver {
|
|||
*/
|
||||
#define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5)
|
||||
|
||||
/*
|
||||
* Indicates that it is safe to call cpufreq_driver_target from
|
||||
* non-interruptible context in scheduler hot paths. Drivers must
|
||||
* opt-in to this flag, as the safe default is that they might sleep
|
||||
* or be too slow for hot path use.
|
||||
*/
|
||||
#define CPUFREQ_DRIVER_FAST (1 << 6)
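A minimal sketch of a driver opting in to the flag; 'my_fast_driver' and its omitted callbacks are placeholders, not part of this patch. Only drivers whose frequency-change path neither sleeps nor takes slow locks should set it, since the sched governor may invoke them from scheduler context.

static struct cpufreq_driver my_fast_driver = {
	.name	= "my-fast",
	.flags	= CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_DRIVER_FAST,
	/* .init, .verify, .target_index, ... filled in as usual */
};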
|
||||
|
||||
int cpufreq_register_driver(struct cpufreq_driver *driver_data);
|
||||
int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
|
||||
|
||||
|
@ -490,6 +499,9 @@ extern struct cpufreq_governor cpufreq_gov_conservative;
|
|||
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE)
|
||||
extern struct cpufreq_governor cpufreq_gov_interactive;
|
||||
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_interactive)
|
||||
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
|
||||
extern struct cpufreq_governor cpufreq_gov_sched;
|
||||
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched)
|
||||
#endif
|
||||
|
||||
/*********************************************************************
|
||||
|
@ -619,4 +631,8 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
|
|||
int cpufreq_generic_init(struct cpufreq_policy *policy,
|
||||
struct cpufreq_frequency_table *table,
|
||||
unsigned int transition_latency);
|
||||
|
||||
struct sched_domain;
|
||||
unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
|
||||
unsigned long cpufreq_scale_max_freq_capacity(int cpu);
|
||||
#endif /* _LINUX_CPUFREQ_H */
|
||||
|
|
|
@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
|
|||
#endif
|
||||
|
||||
/* kernel/sched/idle.c */
|
||||
extern void sched_idle_set_state(struct cpuidle_state *idle_state);
|
||||
extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
|
||||
extern void default_idle_call(void);
|
||||
|
||||
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
|
||||
|
|
|
@ -173,6 +173,9 @@ extern bool single_task_running(void);
|
|||
extern unsigned long nr_iowait(void);
|
||||
extern unsigned long nr_iowait_cpu(int cpu);
|
||||
extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
|
||||
#ifdef CONFIG_CPU_QUIET
|
||||
extern u64 nr_running_integral(unsigned int cpu);
|
||||
#endif
|
||||
|
||||
extern void calc_global_load(unsigned long ticks);
|
||||
|
||||
|
@ -314,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!(
|
|||
/* Task command name length */
|
||||
#define TASK_COMM_LEN 16
|
||||
|
||||
enum task_event {
|
||||
PUT_PREV_TASK = 0,
|
||||
PICK_NEXT_TASK = 1,
|
||||
TASK_WAKE = 2,
|
||||
TASK_MIGRATE = 3,
|
||||
TASK_UPDATE = 4,
|
||||
IRQ_UPDATE = 5,
|
||||
};
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
/*
|
||||
|
@ -927,6 +939,14 @@ enum cpu_idle_type {
|
|||
#define SCHED_CAPACITY_SHIFT 10
|
||||
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
|
||||
|
||||
struct sched_capacity_reqs {
|
||||
unsigned long cfs;
|
||||
unsigned long rt;
|
||||
unsigned long dl;
|
||||
|
||||
unsigned long total;
|
||||
};
|
||||
|
||||
/*
|
||||
* Wake-queues are lists of tasks with a pending wakeup, whose
|
||||
* callers have already marked the task as woken internally,
|
||||
|
@ -989,6 +1009,7 @@ extern void wake_up_q(struct wake_q_head *head);
|
|||
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
|
||||
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
|
||||
#define SD_NUMA 0x4000 /* cross-node balancing */
|
||||
#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
static inline int cpu_smt_flags(void)
|
||||
|
@ -1021,6 +1042,24 @@ struct sched_domain_attr {
|
|||
|
||||
extern int sched_domain_level_max;
|
||||
|
||||
struct capacity_state {
	unsigned long cap;	/* compute capacity */
	unsigned long power;	/* power consumption at this compute capacity */
};

struct idle_state {
	unsigned long power;	/* power consumption in this idle state */
};

struct sched_group_energy {
	unsigned int nr_idle_states;	/* number of idle states */
	struct idle_state *idle_states;	/* ptr to idle state array */
	unsigned int nr_cap_states;	/* number of capacity states */
	struct capacity_state *cap_states; /* ptr to capacity state array */
};

unsigned long capacity_curr_of(int cpu);
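The two tables feed the energy estimates used when comparing task placements. The real energy_diff() machinery in fair.c tracks per-group utilization and shared idle states; the function below is only a simplified sketch of the model, with sketch_group_energy(), busy_pct, cs_idx and is_idx all made up for illustration.

static unsigned long sketch_group_energy(const struct sched_group_energy *sge,
					 unsigned int busy_pct,
					 unsigned int cs_idx,
					 unsigned int is_idx)
{
	/* energy ~ time weighted mix of busy power and idle power */
	unsigned long busy_nrg = sge->cap_states[cs_idx].power * busy_pct;
	unsigned long idle_nrg = sge->idle_states[is_idx].power * (100 - busy_pct);

	return (busy_nrg + idle_nrg) / 100;
}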
|
||||
|
||||
struct sched_group;
|
||||
|
||||
struct sched_domain {
|
||||
|
@ -1119,6 +1158,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
|
|||
|
||||
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
|
||||
typedef int (*sched_domain_flags_f)(void);
|
||||
typedef
|
||||
const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu);
|
||||
|
||||
#define SDTL_OVERLAP 0x01
|
||||
|
||||
|
@ -1131,6 +1172,7 @@ struct sd_data {
|
|||
struct sched_domain_topology_level {
|
||||
sched_domain_mask_f mask;
|
||||
sched_domain_flags_f sd_flags;
|
||||
sched_domain_energy_f energy;
|
||||
int flags;
|
||||
int numa_level;
|
||||
struct sd_data data;
|
||||
|
@ -1241,6 +1283,41 @@ struct sched_statistics {
|
|||
};
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
#define RAVG_HIST_SIZE_MAX 5
|
||||
|
||||
/* ravg represents frequency scaled cpu-demand of tasks */
|
||||
struct ravg {
|
||||
/*
|
||||
* 'mark_start' marks the beginning of an event (task waking up, task
|
||||
* starting to execute, task being preempted) within a window
|
||||
*
|
||||
* 'sum' represents how runnable a task has been within current
|
||||
* window. It incorporates both running time and wait time and is
|
||||
* frequency scaled.
|
||||
*
|
||||
* 'sum_history' keeps track of history of 'sum' seen over previous
|
||||
* RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
|
||||
* ignored.
|
||||
*
|
||||
* 'demand' represents maximum sum seen over previous
|
||||
* sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
|
||||
* demand for tasks.
|
||||
*
|
||||
* 'curr_window' represents task's contribution to cpu busy time
|
||||
* statistics (rq->curr_runnable_sum) in current window
|
||||
*
|
||||
* 'prev_window' represents task's contribution to cpu busy time
|
||||
* statistics (rq->prev_runnable_sum) in previous window
|
||||
*/
|
||||
u64 mark_start;
|
||||
u32 sum, demand;
|
||||
u32 sum_history[RAVG_HIST_SIZE_MAX];
|
||||
u32 curr_window, prev_window;
|
||||
u16 active_windows;
|
||||
};
|
||||
#endif
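One simple policy for turning 'sum_history' into 'demand' is to take the maximum of the recorded windows. The in-tree walt.c supports several policies (recent, max, average and combinations), so the helper below is an illustration only, not the actual code.

static u32 sketch_walt_demand(const struct ravg *ravg)
{
	u32 demand = 0;
	int i;

	for (i = 0; i < RAVG_HIST_SIZE_MAX; i++)
		demand = max(demand, ravg->sum_history[i]);

	return demand;
}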
|
||||
|
||||
struct sched_entity {
|
||||
struct load_weight load; /* for load-balancing */
|
||||
struct rb_node run_node;
|
||||
|
@ -1398,6 +1475,15 @@ struct task_struct {
|
|||
const struct sched_class *sched_class;
|
||||
struct sched_entity se;
|
||||
struct sched_rt_entity rt;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
struct ravg ravg;
|
||||
/*
|
||||
* 'init_load_pct' represents the initial task load assigned to children
|
||||
* of this task
|
||||
*/
|
||||
u32 init_load_pct;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
struct task_group *sched_task_group;
|
||||
#endif
|
||||
|
|
|
@ -39,6 +39,16 @@ extern unsigned int sysctl_sched_latency;
|
|||
extern unsigned int sysctl_sched_min_granularity;
|
||||
extern unsigned int sysctl_sched_wakeup_granularity;
|
||||
extern unsigned int sysctl_sched_child_runs_first;
|
||||
extern unsigned int sysctl_sched_is_big_little;
|
||||
extern unsigned int sysctl_sched_sync_hint_enable;
|
||||
extern unsigned int sysctl_sched_initial_task_util;
|
||||
extern unsigned int sysctl_sched_cstate_aware;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern unsigned int sysctl_sched_use_walt_cpu_util;
|
||||
extern unsigned int sysctl_sched_use_walt_task_util;
|
||||
extern unsigned int sysctl_sched_walt_init_task_load_pct;
|
||||
extern unsigned int sysctl_sched_walt_cpu_high_irqload;
|
||||
#endif
|
||||
|
||||
enum sched_tunable_scaling {
|
||||
SCHED_TUNABLESCALING_NONE,
|
||||
|
@ -77,6 +87,22 @@ extern int sysctl_sched_rt_runtime;
|
|||
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_TUNE
|
||||
extern unsigned int sysctl_sched_cfs_boost;
|
||||
int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *length,
|
||||
loff_t *ppos);
|
||||
static inline unsigned int get_sysctl_sched_cfs_boost(void)
|
||||
{
|
||||
return sysctl_sched_cfs_boost;
|
||||
}
|
||||
#else
|
||||
static inline unsigned int get_sysctl_sched_cfs_boost(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_AUTOGROUP
|
||||
extern unsigned int sysctl_sched_autogroup_enabled;
|
||||
#endif
|
||||
|
|
44	include/linux/sched_energy.h	Normal file
@@ -0,0 +1,44 @@
#ifndef _LINUX_SCHED_ENERGY_H
#define _LINUX_SCHED_ENERGY_H

#include <linux/sched.h>
#include <linux/slab.h>

/*
 * There doesn't seem to be an NR_CPUS style max number of sched domain
 * levels so here's an arbitrary constant one for the moment.
 *
 * The levels alluded to here correspond to entries in struct
 * sched_domain_topology_level that are meant to be populated by arch
 * specific code (topology.c).
 */
#define NR_SD_LEVELS 8

#define SD_LEVEL0 0
#define SD_LEVEL1 1
#define SD_LEVEL2 2
#define SD_LEVEL3 3
#define SD_LEVEL4 4
#define SD_LEVEL5 5
#define SD_LEVEL6 6
#define SD_LEVEL7 7

/*
 * Convenience macro for iterating through said sd levels.
 */
#define for_each_possible_sd_level(level) \
	for (level = 0; level < NR_SD_LEVELS; level++)

#ifdef CONFIG_SMP

extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];

void init_sched_energy_costs(void);

#else

#define init_sched_energy_costs() do { } while (0)

#endif /* CONFIG_SMP */

#endif
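sge_array[][] is normally filled from the device-tree energy-costs data by init_sched_energy_costs(); the sketch below shows how an arch without DT data could populate the same layout directly. 'my_core_energy' and sketch_static_energy_init() are made-up names for illustration only.

static struct sched_group_energy my_core_energy;	/* filled elsewhere */

static void __init sketch_static_energy_init(void)
{
	int cpu, level;

	for_each_possible_cpu(cpu) {
		for_each_possible_sd_level(level) {
			/* only the core level gets a table in this example */
			if (level == SD_LEVEL0)
				sge_array[cpu][level] = &my_core_energy;
			else
				sge_array[cpu][level] = NULL;
		}
	}
}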
@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item);
|
|||
extern void dec_zone_state(struct zone *, enum zone_stat_item);
|
||||
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
|
||||
|
||||
void quiet_vmstat(void);
|
||||
void cpu_vm_stats_fold(int cpu);
|
||||
void refresh_zone_stat_thresholds(void);
|
||||
|
||||
|
@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page,
|
|||
|
||||
static inline void refresh_zone_stat_thresholds(void) { }
|
||||
static inline void cpu_vm_stats_fold(int cpu) { }
|
||||
static inline void quiet_vmstat(void) { }
|
||||
|
||||
static inline void drain_zonestat(struct zone *zone,
|
||||
struct per_cpu_pageset *pset) { }
|
||||
|
|
87	include/trace/events/cpufreq_sched.h	Normal file
@@ -0,0 +1,87 @@
/*
|
||||
* Copyright (C) 2015 Steve Muckle <smuckle@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#undef TRACE_SYSTEM
|
||||
#define TRACE_SYSTEM cpufreq_sched
|
||||
|
||||
#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||
#define _TRACE_CPUFREQ_SCHED_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/tracepoint.h>
|
||||
|
||||
TRACE_EVENT(cpufreq_sched_throttled,
|
||||
TP_PROTO(unsigned int rem),
|
||||
TP_ARGS(rem),
|
||||
TP_STRUCT__entry(
|
||||
__field( unsigned int, rem)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->rem = rem;
|
||||
),
|
||||
TP_printk("throttled - %d usec remaining", __entry->rem)
|
||||
);
|
||||
|
||||
TRACE_EVENT(cpufreq_sched_request_opp,
|
||||
TP_PROTO(int cpu,
|
||||
unsigned long capacity,
|
||||
unsigned int freq_new,
|
||||
unsigned int requested_freq),
|
||||
TP_ARGS(cpu, capacity, freq_new, requested_freq),
|
||||
TP_STRUCT__entry(
|
||||
__field( int, cpu)
|
||||
__field( unsigned long, capacity)
|
||||
__field( unsigned int, freq_new)
|
||||
__field( unsigned int, requested_freq)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu;
|
||||
__entry->capacity = capacity;
|
||||
__entry->freq_new = freq_new;
|
||||
__entry->requested_freq = requested_freq;
|
||||
),
|
||||
TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d "
|
||||
"(cur %d)",
|
||||
__entry->cpu, __entry->capacity, __entry->freq_new,
|
||||
__entry->requested_freq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(cpufreq_sched_update_capacity,
|
||||
TP_PROTO(int cpu,
|
||||
bool request,
|
||||
struct sched_capacity_reqs *scr,
|
||||
unsigned long new_capacity),
|
||||
TP_ARGS(cpu, request, scr, new_capacity),
|
||||
TP_STRUCT__entry(
|
||||
__field( int, cpu)
|
||||
__field( bool, request)
|
||||
__field( unsigned long, cfs)
|
||||
__field( unsigned long, rt)
|
||||
__field( unsigned long, dl)
|
||||
__field( unsigned long, total)
|
||||
__field( unsigned long, new_total)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu;
|
||||
__entry->request = request;
|
||||
__entry->cfs = scr->cfs;
|
||||
__entry->rt = scr->rt;
|
||||
__entry->dl = scr->dl;
|
||||
__entry->total = scr->total;
|
||||
__entry->new_total = new_capacity;
|
||||
),
|
||||
TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld "
|
||||
"new_tot=%ld",
|
||||
__entry->cpu, __entry->request, __entry->cfs, __entry->rt,
|
||||
__entry->dl, __entry->total, __entry->new_total)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_CPUFREQ_SCHED_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
#include <trace/define_trace.h>
|
|
@ -145,6 +145,13 @@ TRACE_EVENT(cpu_frequency_limits,
|
|||
(unsigned long)__entry->cpu_id)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(cpu, cpu_capacity,
|
||||
|
||||
TP_PROTO(unsigned int capacity, unsigned int cpu_id),
|
||||
|
||||
TP_ARGS(capacity, cpu_id)
|
||||
);
|
||||
|
||||
TRACE_EVENT(device_pm_callback_start,
|
||||
|
||||
TP_PROTO(struct device *dev, const char *pm_ops, int event),
|
||||
|
|
|
@ -611,6 +611,503 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
|
|||
|
||||
TP_printk("cpu=%d", __entry->cpu)
|
||||
);
|
||||
|
||||
TRACE_EVENT(sched_contrib_scale_f,
|
||||
|
||||
TP_PROTO(int cpu, unsigned long freq_scale_factor,
|
||||
unsigned long cpu_scale_factor),
|
||||
|
||||
TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, cpu)
|
||||
__field(unsigned long, freq_scale_factor)
|
||||
__field(unsigned long, cpu_scale_factor)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu;
|
||||
__entry->freq_scale_factor = freq_scale_factor;
|
||||
__entry->cpu_scale_factor = cpu_scale_factor;
|
||||
),
|
||||
|
||||
TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu",
|
||||
__entry->cpu, __entry->freq_scale_factor,
|
||||
__entry->cpu_scale_factor)
|
||||
);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/*
|
||||
* Tracepoint for accounting sched averages for tasks.
|
||||
*/
|
||||
TRACE_EVENT(sched_load_avg_task,
|
||||
|
||||
TP_PROTO(struct task_struct *tsk, struct sched_avg *avg),
|
||||
|
||||
TP_ARGS(tsk, avg),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, comm, TASK_COMM_LEN )
|
||||
__field( pid_t, pid )
|
||||
__field( int, cpu )
|
||||
__field( unsigned long, load_avg )
|
||||
__field( unsigned long, util_avg )
|
||||
__field( u64, load_sum )
|
||||
__field( u32, util_sum )
|
||||
__field( u32, period_contrib )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
|
||||
__entry->pid = tsk->pid;
|
||||
__entry->cpu = task_cpu(tsk);
|
||||
__entry->load_avg = avg->load_avg;
|
||||
__entry->util_avg = avg->util_avg;
|
||||
__entry->load_sum = avg->load_sum;
|
||||
__entry->util_sum = avg->util_sum;
|
||||
__entry->period_contrib = avg->period_contrib;
|
||||
),
|
||||
|
||||
TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu"
|
||||
" util_sum=%u period_contrib=%u",
|
||||
__entry->comm,
|
||||
__entry->pid,
|
||||
__entry->cpu,
|
||||
__entry->load_avg,
|
||||
__entry->util_avg,
|
||||
(u64)__entry->load_sum,
|
||||
(u32)__entry->util_sum,
|
||||
(u32)__entry->period_contrib)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for accounting sched averages for cpus.
|
||||
*/
|
||||
TRACE_EVENT(sched_load_avg_cpu,
|
||||
|
||||
TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
|
||||
|
||||
TP_ARGS(cpu, cfs_rq),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( int, cpu )
|
||||
__field( unsigned long, load_avg )
|
||||
__field( unsigned long, util_avg )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu;
|
||||
__entry->load_avg = cfs_rq->avg.load_avg;
|
||||
__entry->util_avg = cfs_rq->avg.util_avg;
|
||||
),
|
||||
|
||||
TP_printk("cpu=%d load_avg=%lu util_avg=%lu",
|
||||
__entry->cpu, __entry->load_avg, __entry->util_avg)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for sched_tune_config settings
|
||||
*/
|
||||
TRACE_EVENT(sched_tune_config,
|
||||
|
||||
TP_PROTO(int boost),
|
||||
|
||||
TP_ARGS(boost),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( int, boost )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->boost = boost;
|
||||
),
|
||||
|
||||
TP_printk("boost=%d ", __entry->boost)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for accounting CPU boosted utilization
|
||||
*/
|
||||
TRACE_EVENT(sched_boost_cpu,
|
||||
|
||||
TP_PROTO(int cpu, unsigned long util, long margin),
|
||||
|
||||
TP_ARGS(cpu, util, margin),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( int, cpu )
|
||||
__field( unsigned long, util )
|
||||
__field(long, margin )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu;
|
||||
__entry->util = util;
|
||||
__entry->margin = margin;
|
||||
),
|
||||
|
||||
TP_printk("cpu=%d util=%lu margin=%ld",
|
||||
__entry->cpu,
|
||||
__entry->util,
|
||||
__entry->margin)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for schedtune_tasks_update
|
||||
*/
|
||||
TRACE_EVENT(sched_tune_tasks_update,
|
||||
|
||||
TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
|
||||
int boost, int max_boost),
|
||||
|
||||
TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, comm, TASK_COMM_LEN )
|
||||
__field( pid_t, pid )
|
||||
__field( int, cpu )
|
||||
__field( int, tasks )
|
||||
__field( int, idx )
|
||||
__field( int, boost )
|
||||
__field( int, max_boost )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
|
||||
__entry->pid = tsk->pid;
|
||||
__entry->cpu = cpu;
|
||||
__entry->tasks = tasks;
|
||||
__entry->idx = idx;
|
||||
__entry->boost = boost;
|
||||
__entry->max_boost = max_boost;
|
||||
),
|
||||
|
||||
TP_printk("pid=%d comm=%s "
|
||||
"cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
|
||||
__entry->pid, __entry->comm,
|
||||
__entry->cpu, __entry->tasks, __entry->idx,
|
||||
__entry->boost, __entry->max_boost)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for schedtune_boostgroup_update
|
||||
*/
|
||||
TRACE_EVENT(sched_tune_boostgroup_update,
|
||||
|
||||
TP_PROTO(int cpu, int variation, int max_boost),
|
||||
|
||||
TP_ARGS(cpu, variation, max_boost),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( int, cpu )
|
||||
__field( int, variation )
|
||||
__field( int, max_boost )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu;
|
||||
__entry->variation = variation;
|
||||
__entry->max_boost = max_boost;
|
||||
),
|
||||
|
||||
TP_printk("cpu=%d variation=%d max_boost=%d",
|
||||
__entry->cpu, __entry->variation, __entry->max_boost)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for accounting task boosted utilization
|
||||
*/
|
||||
TRACE_EVENT(sched_boost_task,
|
||||
|
||||
TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
|
||||
|
||||
TP_ARGS(tsk, util, margin),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, comm, TASK_COMM_LEN )
|
||||
__field( pid_t, pid )
|
||||
__field( unsigned long, util )
|
||||
__field( long, margin )
|
||||
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
|
||||
__entry->pid = tsk->pid;
|
||||
__entry->util = util;
|
||||
__entry->margin = margin;
|
||||
),
|
||||
|
||||
TP_printk("comm=%s pid=%d util=%lu margin=%ld",
|
||||
__entry->comm, __entry->pid,
|
||||
__entry->util,
|
||||
__entry->margin)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for accounting sched group energy
|
||||
*/
|
||||
TRACE_EVENT(sched_energy_diff,
|
||||
|
||||
TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta,
|
||||
int nrgb, int nrga, int nrgd, int capb, int capa, int capd,
|
||||
int nrgn, int nrgp),
|
||||
|
||||
TP_ARGS(tsk, scpu, dcpu, udelta,
|
||||
nrgb, nrga, nrgd, capb, capa, capd,
|
||||
nrgn, nrgp),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, comm, TASK_COMM_LEN )
|
||||
__field( pid_t, pid )
|
||||
__field( int, scpu )
|
||||
__field( int, dcpu )
|
||||
__field( int, udelta )
|
||||
__field( int, nrgb )
|
||||
__field( int, nrga )
|
||||
__field( int, nrgd )
|
||||
__field( int, capb )
|
||||
__field( int, capa )
|
||||
__field( int, capd )
|
||||
__field( int, nrgn )
|
||||
__field( int, nrgp )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
|
||||
__entry->pid = tsk->pid;
|
||||
__entry->scpu = scpu;
|
||||
__entry->dcpu = dcpu;
|
||||
__entry->udelta = udelta;
|
||||
__entry->nrgb = nrgb;
|
||||
__entry->nrga = nrga;
|
||||
__entry->nrgd = nrgd;
|
||||
__entry->capb = capb;
|
||||
__entry->capa = capa;
|
||||
__entry->capd = capd;
|
||||
__entry->nrgn = nrgn;
|
||||
__entry->nrgp = nrgp;
|
||||
),
|
||||
|
||||
TP_printk("pid=%d comm=%s "
|
||||
"src_cpu=%d dst_cpu=%d usage_delta=%d "
|
||||
"nrg_before=%d nrg_after=%d nrg_diff=%d "
|
||||
"cap_before=%d cap_after=%d cap_delta=%d "
|
||||
"nrg_delta=%d nrg_payoff=%d",
|
||||
__entry->pid, __entry->comm,
|
||||
__entry->scpu, __entry->dcpu, __entry->udelta,
|
||||
__entry->nrgb, __entry->nrga, __entry->nrgd,
|
||||
__entry->capb, __entry->capa, __entry->capd,
|
||||
__entry->nrgn, __entry->nrgp)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for schedtune_tasks_update
|
||||
*/
|
||||
TRACE_EVENT(sched_tune_filter,
|
||||
|
||||
TP_PROTO(int nrg_delta, int cap_delta,
|
||||
int nrg_gain, int cap_gain,
|
||||
int payoff, int region),
|
||||
|
||||
TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( int, nrg_delta )
|
||||
__field( int, cap_delta )
|
||||
__field( int, nrg_gain )
|
||||
__field( int, cap_gain )
|
||||
__field( int, payoff )
|
||||
__field( int, region )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->nrg_delta = nrg_delta;
|
||||
__entry->cap_delta = cap_delta;
|
||||
__entry->nrg_gain = nrg_gain;
|
||||
__entry->cap_gain = cap_gain;
|
||||
__entry->payoff = payoff;
|
||||
__entry->region = region;
|
||||
),
|
||||
|
||||
TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d",
|
||||
__entry->nrg_delta, __entry->cap_delta,
|
||||
__entry->nrg_gain, __entry->cap_gain,
|
||||
__entry->payoff, __entry->region)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracepoint for system overutilized flag
|
||||
*/
|
||||
TRACE_EVENT(sched_overutilized,
|
||||
|
||||
TP_PROTO(bool overutilized),
|
||||
|
||||
TP_ARGS(overutilized),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( bool, overutilized )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->overutilized = overutilized;
|
||||
),
|
||||
|
||||
TP_printk("overutilized=%d",
|
||||
__entry->overutilized ? 1 : 0)
|
||||
);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
struct rq;
|
||||
|
||||
TRACE_EVENT(walt_update_task_ravg,
|
||||
|
||||
TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
|
||||
u64 wallclock, u64 irqtime),
|
||||
|
||||
TP_ARGS(p, rq, evt, wallclock, irqtime),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, comm, TASK_COMM_LEN )
|
||||
__field( pid_t, pid )
|
||||
__field( pid_t, cur_pid )
|
||||
__field(unsigned int, cur_freq )
|
||||
__field( u64, wallclock )
|
||||
__field( u64, mark_start )
|
||||
__field( u64, delta_m )
|
||||
__field( u64, win_start )
|
||||
__field( u64, delta )
|
||||
__field( u64, irqtime )
|
||||
__field( int, evt )
|
||||
__field(unsigned int, demand )
|
||||
__field(unsigned int, sum )
|
||||
__field( int, cpu )
|
||||
__field( u64, cs )
|
||||
__field( u64, ps )
|
||||
__field( u32, curr_window )
|
||||
__field( u32, prev_window )
|
||||
__field( u64, nt_cs )
|
||||
__field( u64, nt_ps )
|
||||
__field( u32, active_windows )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->wallclock = wallclock;
|
||||
__entry->win_start = rq->window_start;
|
||||
__entry->delta = (wallclock - rq->window_start);
|
||||
__entry->evt = evt;
|
||||
__entry->cpu = rq->cpu;
|
||||
__entry->cur_pid = rq->curr->pid;
|
||||
__entry->cur_freq = rq->cur_freq;
|
||||
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
|
||||
__entry->pid = p->pid;
|
||||
__entry->mark_start = p->ravg.mark_start;
|
||||
__entry->delta_m = (wallclock - p->ravg.mark_start);
|
||||
__entry->demand = p->ravg.demand;
|
||||
__entry->sum = p->ravg.sum;
|
||||
__entry->irqtime = irqtime;
|
||||
__entry->cs = rq->curr_runnable_sum;
|
||||
__entry->ps = rq->prev_runnable_sum;
|
||||
__entry->curr_window = p->ravg.curr_window;
|
||||
__entry->prev_window = p->ravg.prev_window;
|
||||
__entry->nt_cs = rq->nt_curr_runnable_sum;
|
||||
__entry->nt_ps = rq->nt_prev_runnable_sum;
|
||||
__entry->active_windows = p->ravg.active_windows;
|
||||
),
|
||||
|
||||
TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
|
||||
" cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
|
||||
, __entry->wallclock, __entry->win_start, __entry->delta,
|
||||
__entry->evt, __entry->cpu,
|
||||
__entry->cur_freq, __entry->cur_pid,
|
||||
__entry->pid, __entry->comm, __entry->mark_start,
|
||||
__entry->delta_m, __entry->demand,
|
||||
__entry->sum, __entry->irqtime,
|
||||
__entry->cs, __entry->ps,
|
||||
__entry->curr_window, __entry->prev_window,
|
||||
__entry->nt_cs, __entry->nt_ps,
|
||||
__entry->active_windows
|
||||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(walt_update_history,
|
||||
|
||||
TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
|
||||
int evt),
|
||||
|
||||
TP_ARGS(rq, p, runtime, samples, evt),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, comm, TASK_COMM_LEN )
|
||||
__field( pid_t, pid )
|
||||
__field(unsigned int, runtime )
|
||||
__field( int, samples )
|
||||
__field( int, evt )
|
||||
__field( u64, demand )
|
||||
__field(unsigned int, walt_avg )
|
||||
__field(unsigned int, pelt_avg )
|
||||
__array( u32, hist, RAVG_HIST_SIZE_MAX)
|
||||
__field( int, cpu )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
|
||||
__entry->pid = p->pid;
|
||||
__entry->runtime = runtime;
|
||||
__entry->samples = samples;
|
||||
__entry->evt = evt;
|
||||
__entry->demand = p->ravg.demand;
|
||||
__entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
|
||||
__entry->pelt_avg = p->se.avg.util_avg;
|
||||
memcpy(__entry->hist, p->ravg.sum_history,
|
||||
RAVG_HIST_SIZE_MAX * sizeof(u32));
|
||||
__entry->cpu = rq->cpu;
|
||||
),
|
||||
|
||||
TP_printk("%d (%s): runtime %u samples %d event %d demand %llu"
|
||||
" walt %u pelt %u (hist: %u %u %u %u %u) cpu %d",
|
||||
__entry->pid, __entry->comm,
|
||||
__entry->runtime, __entry->samples, __entry->evt,
|
||||
__entry->demand,
|
||||
__entry->walt_avg,
|
||||
__entry->pelt_avg,
|
||||
__entry->hist[0], __entry->hist[1],
|
||||
__entry->hist[2], __entry->hist[3],
|
||||
__entry->hist[4], __entry->cpu)
|
||||
);
|
||||
|
||||
TRACE_EVENT(walt_migration_update_sum,
|
||||
|
||||
TP_PROTO(struct rq *rq, struct task_struct *p),
|
||||
|
||||
TP_ARGS(rq, p),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, cpu )
|
||||
__field(int, pid )
|
||||
__field( u64, cs )
|
||||
__field( u64, ps )
|
||||
__field( s64, nt_cs )
|
||||
__field( s64, nt_ps )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->cpu = cpu_of(rq);
|
||||
__entry->cs = rq->curr_runnable_sum;
|
||||
__entry->ps = rq->prev_runnable_sum;
|
||||
__entry->nt_cs = (s64)rq->nt_curr_runnable_sum;
|
||||
__entry->nt_ps = (s64)rq->nt_prev_runnable_sum;
|
||||
__entry->pid = p->pid;
|
||||
),
|
||||
|
||||
TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
|
||||
__entry->cpu, __entry->cs, __entry->ps,
|
||||
__entry->nt_cs, __entry->nt_ps, __entry->pid)
|
||||
);
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#endif /* _TRACE_SCHED_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
|
53	init/Kconfig
@@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING
|
|||
|
||||
endchoice
|
||||
|
||||
config SCHED_WALT
	bool "Support window based load tracking"
	depends on SMP
	help
	  This feature will allow the scheduler to maintain a tunable window
	  based set of metrics for tasks and runqueues. These metrics can be
	  used to guide task placement as well as task frequency requirements
	  for cpufreq governors.
|
||||
|
||||
config BSD_PROCESS_ACCT
|
||||
bool "BSD Process Accounting"
|
||||
depends on MULTIUSER
|
||||
|
@ -999,6 +1008,23 @@ config CGROUP_CPUACCT
|
|||
Provides a simple Resource Controller for monitoring the
|
||||
total CPU consumed by the tasks in a cgroup.
|
||||
|
||||
config CGROUP_SCHEDTUNE
|
||||
bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)"
|
||||
depends on SCHED_TUNE
|
||||
help
|
||||
This option provides the "schedtune" controller which improves the
|
||||
flexibility of the task boosting mechanism by introducing support
for defining "per task" boost values.
|
||||
|
||||
This new controller:
|
||||
1. allows only a two-layer hierarchy, where the root defines the
system-wide boost value and each of its direct children defines a
|
||||
different "class of tasks" to be boosted with a different value
|
||||
2. supports up to 16 different task classes, each one which could be
|
||||
configured with a different boost value
|
||||
|
||||
Say N if unsure.
|
||||
|
||||
config PAGE_COUNTER
|
||||
bool
|
||||
|
||||
|
@ -1237,6 +1263,33 @@ config SCHED_AUTOGROUP
|
|||
desktop applications. Task group autogeneration is currently based
|
||||
upon task session.
|
||||
|
||||
config SCHED_TUNE
|
||||
bool "Boosting for CFS tasks (EXPERIMENTAL)"
|
||||
depends on SMP
|
||||
help
|
||||
This option enables the system-wide support for task boosting.
|
||||
When this support is enabled a new sysctl interface is exposed to
|
||||
userspace via:
|
||||
/proc/sys/kernel/sched_cfs_boost
|
||||
which allows setting a system-wide boost value in the range [0..100].
|
||||
|
||||
The current boosting strategy is implemented in such a way that:
|
||||
- a 0% boost value requires operating in "standard" mode by
|
||||
scheduling all tasks at the minimum capacities required by their
|
||||
workload demand
|
||||
- a 100% boost value requires pushing task performance to the
maximum, "regardless" of the incurred energy consumption
|
||||
|
||||
A boost value in between these two boundaries is used to bias the
|
||||
power/performance trade-off, the higher the boost value the more the
|
||||
scheduler is biased toward performance boosting instead of energy
|
||||
efficiency.
|
||||
|
||||
Since this support exposes a single system-wide knob, the specified
|
||||
boost value is applied to all (CFS) tasks in the system.
|
||||
|
||||
If unsure, say N.
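The boosting idea described above can be summarised as inflating a utilization signal part of the way towards full capacity. The helper below is only a sketch of that bias, not the exact tune.c code; sketch_boosted_util() is a made-up name.

static unsigned long sketch_boosted_util(unsigned long util,
					 unsigned long capacity,
					 int boost_pct)
{
	/* boost_pct in [0..100]: 0 leaves util untouched, 100 reaches capacity */
	unsigned long margin = (capacity - util) * boost_pct / 100;

	return util + margin;
}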
|
||||
|
||||
config SYSFS_DEPRECATED
|
||||
bool "Enable deprecated sysfs features to support old userspace tools"
|
||||
depends on SYSFS
|
||||
|
|
|
@ -54,6 +54,8 @@
|
|||
#include <linux/writeback.h>
|
||||
#include <linux/shm.h>
|
||||
|
||||
#include "sched/tune.h"
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/unistd.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
@ -699,6 +701,9 @@ void do_exit(long code)
|
|||
}
|
||||
|
||||
exit_signals(tsk); /* sets PF_EXITING */
|
||||
|
||||
schedtune_exit_task(tsk);
|
||||
|
||||
/*
|
||||
* tsk->flags are checked in the futex code to protect against
|
||||
* an exiting task cleaning up the robust pi futexes.
|
||||
|
|
|
@ -14,8 +14,11 @@ endif
|
|||
obj-y += core.o loadavg.o clock.o cputime.o
|
||||
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
|
||||
obj-y += wait.o completion.o idle.o
|
||||
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
|
||||
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
|
||||
obj-$(CONFIG_SCHED_WALT) += walt.o
|
||||
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
|
||||
obj-$(CONFIG_SCHEDSTATS) += stats.o
|
||||
obj-$(CONFIG_SCHED_DEBUG) += debug.o
|
||||
obj-$(CONFIG_SCHED_TUNE) += tune.o
|
||||
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
|
||||
obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
|
||||
|
|
|
@ -89,6 +89,7 @@
|
|||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/sched.h>
|
||||
#include "walt.h"
|
||||
|
||||
DEFINE_MUTEX(sched_domains_mutex);
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
||||
|
@ -287,6 +288,18 @@ int sysctl_sched_rt_runtime = 950000;
|
|||
/* cpus with isolated domains */
|
||||
cpumask_var_t cpu_isolated_map;
|
||||
|
||||
struct rq *
|
||||
lock_rq_of(struct task_struct *p, unsigned long *flags)
|
||||
{
|
||||
return task_rq_lock(p, flags);
|
||||
}
|
||||
|
||||
void
|
||||
unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
|
||||
{
|
||||
task_rq_unlock(rq, p, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* this_rq_lock - lock this runqueue and disable interrupts.
|
||||
*/
|
||||
|
@ -1073,7 +1086,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
|
|||
|
||||
dequeue_task(rq, p, 0);
|
||||
p->on_rq = TASK_ON_RQ_MIGRATING;
|
||||
double_lock_balance(rq, cpu_rq(new_cpu));
|
||||
set_task_cpu(p, new_cpu);
|
||||
double_unlock_balance(rq, cpu_rq(new_cpu));
|
||||
raw_spin_unlock(&rq->lock);
|
||||
|
||||
rq = cpu_rq(new_cpu);
|
||||
|
@ -1297,6 +1312,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
|
|||
p->sched_class->migrate_task_rq(p);
|
||||
p->se.nr_migrations++;
|
||||
perf_event_task_migrate(p);
|
||||
|
||||
walt_fixup_busy_time(p, new_cpu);
|
||||
}
|
||||
|
||||
__set_task_cpu(p, new_cpu);
|
||||
|
@ -1925,6 +1942,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
|||
{
|
||||
unsigned long flags;
|
||||
int cpu, success = 0;
|
||||
#ifdef CONFIG_SMP
|
||||
struct rq *rq;
|
||||
u64 wallclock;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If we are going to wake up a thread waiting for CONDITION we
|
||||
|
@ -1982,6 +2003,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
|||
*/
|
||||
smp_rmb();
|
||||
|
||||
rq = cpu_rq(task_cpu(p));
|
||||
|
||||
raw_spin_lock(&rq->lock);
|
||||
wallclock = walt_ktime_clock();
|
||||
walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
|
||||
walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
|
||||
p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
||||
p->state = TASK_WAKING;
|
||||
|
||||
|
@ -1989,10 +2018,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
|||
p->sched_class->task_waking(p);
|
||||
|
||||
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
|
||||
|
||||
if (task_cpu(p) != cpu) {
|
||||
wake_flags |= WF_MIGRATED;
|
||||
set_task_cpu(p, cpu);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
ttwu_queue(p, cpu);
|
||||
|
@ -2041,8 +2072,13 @@ static void try_to_wake_up_local(struct task_struct *p)
|
|||
|
||||
trace_sched_waking(p);
|
||||
|
||||
if (!task_on_rq_queued(p))
|
||||
if (!task_on_rq_queued(p)) {
|
||||
u64 wallclock = walt_ktime_clock();
|
||||
|
||||
walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
|
||||
walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
|
||||
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
|
||||
}
|
||||
|
||||
ttwu_do_wakeup(rq, p, 0);
|
||||
ttwu_stat(p, smp_processor_id(), 0);
|
||||
|
@ -2108,6 +2144,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|||
p->se.nr_migrations = 0;
|
||||
p->se.vruntime = 0;
|
||||
INIT_LIST_HEAD(&p->se.group_node);
|
||||
walt_init_new_task_load(p);
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
|
||||
|
@ -2375,6 +2412,9 @@ void wake_up_new_task(struct task_struct *p)
|
|||
struct rq *rq;
|
||||
|
||||
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
||||
|
||||
walt_init_new_task_load(p);
|
||||
|
||||
/* Initialize new task's runnable average */
|
||||
init_entity_runnable_average(&p->se);
|
||||
#ifdef CONFIG_SMP
|
||||
|
@ -2387,7 +2427,8 @@ void wake_up_new_task(struct task_struct *p)
|
|||
#endif
|
||||
|
||||
rq = __task_rq_lock(p);
|
||||
activate_task(rq, p, 0);
|
||||
walt_mark_task_starting(p);
|
||||
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
|
||||
p->on_rq = TASK_ON_RQ_QUEUED;
|
||||
trace_sched_wakeup_new(p);
|
||||
check_preempt_curr(rq, p, WF_FORK);
|
||||
|
@ -2768,6 +2809,36 @@ unsigned long nr_iowait_cpu(int cpu)
|
|||
return atomic_read(&this->nr_iowait);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CPU_QUIET
|
||||
u64 nr_running_integral(unsigned int cpu)
|
||||
{
|
||||
unsigned int seqcnt;
|
||||
u64 integral;
|
||||
struct rq *q;
|
||||
|
||||
if (cpu >= nr_cpu_ids)
|
||||
return 0;
|
||||
|
||||
q = cpu_rq(cpu);
|
||||
|
||||
/*
|
||||
* Update average to avoid reading stalled value if there were
|
||||
* no run-queue changes for a long time. On the other hand if
|
||||
* the changes are happening right now, just read current value
|
||||
* directly.
|
||||
*/
|
||||
|
||||
seqcnt = read_seqcount_begin(&q->ave_seqcnt);
|
||||
integral = do_nr_running_integral(q);
|
||||
if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
|
||||
read_seqcount_begin(&q->ave_seqcnt);
|
||||
integral = q->nr_running_integral;
|
||||
}
|
||||
|
||||
return integral;
|
||||
}
|
||||
#endif
|
||||
|
||||
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
|
||||
{
|
||||
struct rq *rq = this_rq();
|
||||
|
@ -2854,6 +2925,93 @@ unsigned long long task_sched_runtime(struct task_struct *p)
|
|||
return ns;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CPU_FREQ_GOV_SCHED
|
||||
|
||||
static inline
|
||||
unsigned long add_capacity_margin(unsigned long cpu_capacity)
|
||||
{
|
||||
cpu_capacity = cpu_capacity * capacity_margin;
|
||||
cpu_capacity /= SCHED_CAPACITY_SCALE;
|
||||
return cpu_capacity;
|
||||
}
|
||||
|
||||
static inline
|
||||
unsigned long sum_capacity_reqs(unsigned long cfs_cap,
|
||||
struct sched_capacity_reqs *scr)
|
||||
{
|
||||
unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
|
||||
return total += scr->dl;
|
||||
}
|
||||
|
||||
static void sched_freq_tick_pelt(int cpu)
|
||||
{
|
||||
unsigned long cpu_utilization = capacity_max;
|
||||
unsigned long capacity_curr = capacity_curr_of(cpu);
|
||||
struct sched_capacity_reqs *scr;
|
||||
|
||||
scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
|
||||
if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
|
||||
return;
|
||||
|
||||
/*
|
||||
* To make free room for a task that is building up its "real"
|
||||
* utilization and to harm its performance the least, request
|
||||
* a jump to a higher OPP as soon as the margin of free capacity
|
||||
* is impacted (specified by capacity_margin).
|
||||
*/
|
||||
set_cfs_cpu_capacity(cpu, true, cpu_utilization);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
static void sched_freq_tick_walt(int cpu)
|
||||
{
|
||||
unsigned long cpu_utilization = cpu_util(cpu);
|
||||
unsigned long capacity_curr = capacity_curr_of(cpu);
|
||||
|
||||
if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
|
||||
return sched_freq_tick_pelt(cpu);
|
||||
|
||||
/*
|
||||
* Add a margin to the WALT utilization.
|
||||
* NOTE: WALT tracks a single CPU signal for all the scheduling
|
||||
* classes, thus this margin is going to be added to the DL class as
|
||||
* well, which is something we do not do in sched_freq_tick_pelt case.
|
||||
*/
|
||||
cpu_utilization = add_capacity_margin(cpu_utilization);
|
||||
if (cpu_utilization <= capacity_curr)
|
||||
return;
|
||||
|
||||
/*
|
||||
* It is likely that the load is growing so we
|
||||
* keep the added margin in our request as an
|
||||
* extra boost.
|
||||
*/
|
||||
set_cfs_cpu_capacity(cpu, true, cpu_utilization);
|
||||
|
||||
}
|
||||
#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
|
||||
#else
|
||||
#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
static void sched_freq_tick(int cpu)
|
||||
{
|
||||
unsigned long capacity_orig, capacity_curr;
|
||||
|
||||
if (!sched_freq())
|
||||
return;
|
||||
|
||||
capacity_orig = capacity_orig_of(cpu);
|
||||
capacity_curr = capacity_curr_of(cpu);
|
||||
if (capacity_curr == capacity_orig)
|
||||
return;
|
||||
|
||||
_sched_freq_tick(cpu);
|
||||
}
|
||||
#else
|
||||
static inline void sched_freq_tick(int cpu) { }
|
||||
#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
|
||||
|
||||
/*
|
||||
* This function gets called by the timer code, with HZ frequency.
|
||||
* We call it with interrupts disabled.
|
||||
|
@ -2867,10 +3025,14 @@ void scheduler_tick(void)
|
|||
sched_clock_tick();
|
||||
|
||||
raw_spin_lock(&rq->lock);
|
||||
walt_set_window_start(rq);
|
||||
update_rq_clock(rq);
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
update_cpu_load_active(rq);
|
||||
walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
|
||||
walt_ktime_clock(), 0);
|
||||
calc_global_load_tick(rq);
|
||||
sched_freq_tick(cpu);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
|
||||
perf_event_task_tick();
|
||||
|
@ -3107,6 +3269,7 @@ static void __sched notrace __schedule(bool preempt)
|
|||
unsigned long *switch_count;
|
||||
struct rq *rq;
|
||||
int cpu;
|
||||
u64 wallclock;
|
||||
|
||||
cpu = smp_processor_id();
|
||||
rq = cpu_rq(cpu);
|
||||
|
@ -3168,6 +3331,9 @@ static void __sched notrace __schedule(bool preempt)
|
|||
update_rq_clock(rq);
|
||||
|
||||
next = pick_next_task(rq, prev);
|
||||
wallclock = walt_ktime_clock();
|
||||
walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
|
||||
walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
|
||||
clear_tsk_need_resched(prev);
|
||||
clear_preempt_need_resched();
|
||||
rq->clock_skip_update = 0;
|
||||
|
@ -4992,6 +5158,7 @@ void init_idle(struct task_struct *idle, int cpu)
|
|||
raw_spin_lock(&rq->lock);
|
||||
|
||||
__sched_fork(0, idle);
|
||||
|
||||
idle->state = TASK_RUNNING;
|
||||
idle->se.exec_start = sched_clock();
|
||||
|
||||
|
@ -5373,10 +5540,61 @@ set_table_entry(struct ctl_table *entry,
|
|||
}
|
||||
}
|
||||
|
||||
static struct ctl_table *
|
||||
sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
|
||||
{
|
||||
struct ctl_table *table = sd_alloc_ctl_entry(5);
|
||||
|
||||
if (table == NULL)
|
||||
return NULL;
|
||||
|
||||
set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
|
||||
sizeof(int), 0644, proc_dointvec_minmax, false);
|
||||
set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
|
||||
sge->nr_idle_states*sizeof(struct idle_state), 0644,
|
||||
proc_doulongvec_minmax, false);
|
||||
set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
|
||||
sizeof(int), 0644, proc_dointvec_minmax, false);
|
||||
set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
|
||||
sge->nr_cap_states*sizeof(struct capacity_state), 0644,
|
||||
proc_doulongvec_minmax, false);
|
||||
|
||||
return table;
|
||||
}
|
||||
|
||||
static struct ctl_table *
|
||||
sd_alloc_ctl_group_table(struct sched_group *sg)
|
||||
{
|
||||
struct ctl_table *table = sd_alloc_ctl_entry(2);
|
||||
|
||||
if (table == NULL)
|
||||
return NULL;
|
||||
|
||||
table->procname = kstrdup("energy", GFP_KERNEL);
|
||||
table->mode = 0555;
|
||||
table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
|
||||
|
||||
return table;
|
||||
}
|
||||
|
||||
static struct ctl_table *
|
||||
sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
||||
{
|
||||
struct ctl_table *table = sd_alloc_ctl_entry(14);
|
||||
struct ctl_table *table;
|
||||
unsigned int nr_entries = 14;
|
||||
|
||||
int i = 0;
|
||||
struct sched_group *sg = sd->groups;
|
||||
|
||||
if (sg->sge) {
|
||||
int nr_sgs = 0;
|
||||
|
||||
do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
|
||||
|
||||
nr_entries += nr_sgs;
|
||||
}
|
||||
|
||||
table = sd_alloc_ctl_entry(nr_entries);
|
||||
|
||||
if (table == NULL)
|
||||
return NULL;
|
||||
|
@ -5409,7 +5627,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
|||
sizeof(long), 0644, proc_doulongvec_minmax, false);
|
||||
set_table_entry(&table[12], "name", sd->name,
|
||||
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
|
||||
/* &table[13] is terminator */
|
||||
sg = sd->groups;
|
||||
if (sg->sge) {
|
||||
char buf[32];
|
||||
struct ctl_table *entry = &table[13];
|
||||
|
||||
do {
|
||||
snprintf(buf, 32, "group%d", i);
|
||||
entry->procname = kstrdup(buf, GFP_KERNEL);
|
||||
entry->mode = 0555;
|
||||
entry->child = sd_alloc_ctl_group_table(sg);
|
||||
} while (entry++, i++, sg = sg->next, sg != sd->groups);
|
||||
}
|
||||
/* &table[nr_entries-1] is terminator */
|
||||
|
||||
return table;
|
||||
}
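Taken together, these helpers hang the energy tables off the existing sched_domain sysctl tree. Assuming the usual kernel.sched_domain root exposed under CONFIG_SCHED_DEBUG, the resulting layout looks roughly like the listing below; the exact domain and group numbering depends on the machine's topology.

/proc/sys/kernel/sched_domain/
    cpu0/
        domain0/
            ... existing per-domain entries ...
            group0/
                energy/
                    nr_idle_states
                    idle_states
                    nr_cap_states
                    cap_states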
|
||||
|
@ -5525,6 +5755,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
|
||||
case CPU_UP_PREPARE:
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
walt_set_window_start(rq);
|
||||
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
||||
rq->calc_load_update = calc_load_update;
|
||||
break;
|
||||
|
||||
|
@ -5544,6 +5777,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|||
sched_ttwu_pending();
|
||||
/* Update our root-domain */
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
walt_migrate_sync_cpu(cpu);
|
||||
if (rq->rd) {
|
||||
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
||||
set_rq_offline(rq);
|
||||
|
@ -5715,7 +5949,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
|
|||
printk(KERN_CONT " %*pbl",
|
||||
cpumask_pr_args(sched_group_cpus(group)));
|
||||
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
|
||||
printk(KERN_CONT " (cpu_capacity = %d)",
|
||||
printk(KERN_CONT " (cpu_capacity = %lu)",
|
||||
group->sgc->capacity);
|
||||
}
|
||||
|
||||
|
@ -5776,7 +6010,8 @@ static int sd_degenerate(struct sched_domain *sd)
|
|||
SD_BALANCE_EXEC |
|
||||
SD_SHARE_CPUCAPACITY |
|
||||
SD_SHARE_PKG_RESOURCES |
|
||||
SD_SHARE_POWERDOMAIN)) {
|
||||
SD_SHARE_POWERDOMAIN |
|
||||
SD_SHARE_CAP_STATES)) {
|
||||
if (sd->groups != sd->groups->next)
|
||||
return 0;
|
||||
}
|
||||
|
@ -5808,7 +6043,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
|||
SD_SHARE_CPUCAPACITY |
|
||||
SD_SHARE_PKG_RESOURCES |
|
||||
SD_PREFER_SIBLING |
|
||||
SD_SHARE_POWERDOMAIN);
|
||||
SD_SHARE_POWERDOMAIN |
|
||||
SD_SHARE_CAP_STATES);
|
||||
if (nr_node_ids == 1)
|
||||
pflags &= ~SD_SERIALIZE;
|
||||
}
|
||||
|
@ -5887,6 +6123,8 @@ static int init_rootdomain(struct root_domain *rd)
|
|||
|
||||
if (cpupri_init(&rd->cpupri) != 0)
|
||||
goto free_rto_mask;
|
||||
|
||||
init_max_cpu_capacity(&rd->max_cpu_capacity);
|
||||
return 0;
|
||||
|
||||
free_rto_mask:
|
||||
|
@ -5992,11 +6230,13 @@ DEFINE_PER_CPU(int, sd_llc_id);
|
|||
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
|
||||
DEFINE_PER_CPU(struct sched_domain *, sd_busy);
|
||||
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
|
||||
DEFINE_PER_CPU(struct sched_domain *, sd_ea);
|
||||
DEFINE_PER_CPU(struct sched_domain *, sd_scs);
|
||||
|
||||
static void update_top_cache_domain(int cpu)
|
||||
{
|
||||
struct sched_domain *sd;
|
||||
struct sched_domain *busy_sd = NULL;
|
||||
struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
|
||||
int id = cpu;
|
||||
int size = 1;
|
||||
|
||||
|
@ -6017,6 +6257,17 @@ static void update_top_cache_domain(int cpu)
|
|||
|
||||
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
|
||||
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
|
||||
|
||||
for_each_domain(cpu, sd) {
|
||||
if (sd->groups->sge)
|
||||
ea_sd = sd;
|
||||
else
|
||||
break;
|
||||
}
|
||||
rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
|
||||
|
||||
sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
|
||||
rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -6177,6 +6428,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
|
|||
* die on a /0 trap.
|
||||
*/
|
||||
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
|
||||
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
|
||||
|
||||
/*
|
||||
* Make sure the first group of this domain contains the
|
||||
|
@ -6305,6 +6557,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
|
|||
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that the per-cpu provided sd energy data is consistent for all cpus
|
||||
* within the mask.
|
||||
*/
|
||||
static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
|
||||
const struct cpumask *cpumask)
|
||||
{
|
||||
const struct sched_group_energy * const sge = fn(cpu);
|
||||
struct cpumask mask;
|
||||
int i;
|
||||
|
||||
if (cpumask_weight(cpumask) <= 1)
|
||||
return;
|
||||
|
||||
cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
|
||||
|
||||
for_each_cpu(i, &mask) {
|
||||
const struct sched_group_energy * const e = fn(i);
|
||||
int y;
|
||||
|
||||
BUG_ON(e->nr_idle_states != sge->nr_idle_states);
|
||||
|
||||
for (y = 0; y < (e->nr_idle_states); y++) {
|
||||
BUG_ON(e->idle_states[y].power !=
|
||||
sge->idle_states[y].power);
|
||||
}
|
||||
|
||||
BUG_ON(e->nr_cap_states != sge->nr_cap_states);
|
||||
|
||||
for (y = 0; y < (e->nr_cap_states); y++) {
|
||||
BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
|
||||
BUG_ON(e->cap_states[y].power !=
|
||||
sge->cap_states[y].power);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void init_sched_energy(int cpu, struct sched_domain *sd,
|
||||
sched_domain_energy_f fn)
|
||||
{
|
||||
if (!(fn && fn(cpu)))
|
||||
return;
|
||||
|
||||
if (cpu != group_balance_cpu(sd->groups))
|
||||
return;
|
||||
|
||||
if (sd->child && !sd->child->groups->sge) {
|
||||
pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
pr_err(" energy data on %s but not on %s domain\n",
|
||||
sd->name, sd->child->name);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
|
||||
|
||||
sd->groups->sge = fn(cpu);
|
||||
}
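
The check_sched_energy_data() pass above is effectively an assertion that the energy model is symmetric within a scheduling group: every CPU covered by sd->groups must report the same number of idle and capacity states and identical power/capacity values for each of them, otherwise the BUG_ON()s fire at domain build time rather than letting an inconsistent table feed the energy calculations later.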
|
||||
|
||||
/*
|
||||
* Initializers for schedule domains
|
||||
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
|
||||
|
@ -6413,6 +6725,7 @@ static int sched_domains_curr_level;
|
|||
* SD_SHARE_PKG_RESOURCES - describes shared caches
|
||||
* SD_NUMA - describes NUMA topologies
|
||||
* SD_SHARE_POWERDOMAIN - describes shared power domain
|
||||
* SD_SHARE_CAP_STATES - describes shared capacity states
|
||||
*
|
||||
* Odd one out:
|
||||
* SD_ASYM_PACKING - describes SMT quirks
|
||||
|
@ -6422,7 +6735,8 @@ static int sched_domains_curr_level;
|
|||
SD_SHARE_PKG_RESOURCES | \
|
||||
SD_NUMA | \
|
||||
SD_ASYM_PACKING | \
|
||||
SD_SHARE_POWERDOMAIN)
|
||||
SD_SHARE_POWERDOMAIN | \
|
||||
SD_SHARE_CAP_STATES)
|
||||
|
||||
static struct sched_domain *
|
||||
sd_init(struct sched_domain_topology_level *tl, int cpu)
|
||||
|
@ -6972,6 +7286,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
|
|||
enum s_alloc alloc_state;
|
||||
struct sched_domain *sd;
|
||||
struct s_data d;
|
||||
struct rq *rq = NULL;
|
||||
int i, ret = -ENOMEM;
|
||||
|
||||
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
|
||||
|
@ -7010,10 +7325,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
|
|||
|
||||
/* Calculate CPU capacity for physical packages and nodes */
|
||||
for (i = nr_cpumask_bits-1; i >= 0; i--) {
|
||||
struct sched_domain_topology_level *tl = sched_domain_topology;
|
||||
|
||||
if (!cpumask_test_cpu(i, cpu_map))
|
||||
continue;
|
||||
|
||||
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
|
||||
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
|
||||
init_sched_energy(i, sd, tl->energy);
|
||||
claim_allocations(i, sd);
|
||||
init_sched_groups_capacity(i, sd);
|
||||
}
|
||||
|
@ -7022,6 +7340,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
|
|||
/* Attach the domains */
|
||||
rcu_read_lock();
|
||||
for_each_cpu(i, cpu_map) {
|
||||
rq = cpu_rq(i);
|
||||
sd = *per_cpu_ptr(d.sd, i);
|
||||
cpu_attach_domain(sd, d.rd, i);
|
||||
}
|
||||
|
@ -7303,6 +7622,7 @@ void __init sched_init_smp(void)
|
|||
{
|
||||
cpumask_var_t non_isolated_cpus;
|
||||
|
||||
walt_init_cpu_efficiency();
|
||||
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
|
||||
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
|
||||
|
||||
|
@ -7480,6 +7800,11 @@ void __init sched_init(void)
|
|||
rq->idle_stamp = 0;
|
||||
rq->avg_idle = 2*sysctl_sched_migration_cost;
|
||||
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
rq->cur_irqload = 0;
|
||||
rq->avg_irqload = 0;
|
||||
rq->irqload_ts = 0;
|
||||
#endif
|
||||
|
||||
INIT_LIST_HEAD(&rq->cfs_tasks);
|
||||
|
||||
|
|
499
kernel/sched/cpufreq_sched.c
Normal file
|
@ -0,0 +1,499 @@
|
|||
/*
|
||||
* Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/cpufreq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/irq_work.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/cpufreq_sched.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
|
||||
#define THROTTLE_UP_NSEC 500000 /* 500us default */
|
||||
|
||||
struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
|
||||
static bool __read_mostly cpufreq_driver_slow;
|
||||
|
||||
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
|
||||
static struct cpufreq_governor cpufreq_gov_sched;
|
||||
#endif
|
||||
|
||||
static DEFINE_PER_CPU(unsigned long, enabled);
|
||||
DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
|
||||
|
||||
/**
|
||||
* gov_data - per-policy data internal to the governor
|
||||
* @up_throttle: next throttling period expiry if increasing OPP
|
||||
* @down_throttle: next throttling period expiry if decreasing OPP
|
||||
* @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP
|
||||
* @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP
|
||||
* @task: worker thread for dvfs transition that may block/sleep
|
||||
* @irq_work: callback used to wake up worker thread
|
||||
* @requested_freq: last frequency requested by the sched governor
|
||||
*
|
||||
* struct gov_data is the per-policy cpufreq_sched-specific data structure. A
|
||||
* per-policy instance of it is created when the cpufreq_sched governor receives
|
||||
* the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
|
||||
* member of struct cpufreq_policy.
|
||||
*
|
||||
* Readers of this data must call down_read(policy->rwsem). Writers must
|
||||
* call down_write(policy->rwsem).
|
||||
*/
|
||||
struct gov_data {
|
||||
ktime_t up_throttle;
|
||||
ktime_t down_throttle;
|
||||
unsigned int up_throttle_nsec;
|
||||
unsigned int down_throttle_nsec;
|
||||
struct task_struct *task;
|
||||
struct irq_work irq_work;
|
||||
unsigned int requested_freq;
|
||||
};
|
||||
|
||||
static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
|
||||
unsigned int freq)
|
||||
{
|
||||
struct gov_data *gd = policy->governor_data;
|
||||
|
||||
/* avoid race with cpufreq_sched_stop */
|
||||
if (!down_write_trylock(&policy->rwsem))
|
||||
return;
|
||||
|
||||
__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
|
||||
|
||||
gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
|
||||
gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
|
||||
up_write(&policy->rwsem);
|
||||
}
|
||||
|
||||
static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
|
||||
{
|
||||
ktime_t now = ktime_get();
|
||||
|
||||
ktime_t throttle = gd->requested_freq < cur_freq ?
|
||||
gd->down_throttle : gd->up_throttle;
|
||||
|
||||
if (ktime_after(now, throttle))
|
||||
return false;
|
||||
|
||||
while (1) {
|
||||
int usec_left = ktime_to_ns(ktime_sub(throttle, now));
|
||||
|
||||
usec_left /= NSEC_PER_USEC;
|
||||
trace_cpufreq_sched_throttled(usec_left);
|
||||
usleep_range(usec_left, usec_left + 100);
|
||||
now = ktime_get();
|
||||
if (ktime_after(now, throttle))
|
||||
return true;
|
||||
}
|
||||
}
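
To make the throttling windows concrete: both throttles are re-armed after every successful frequency change, and finish_last_request() honours the down window when the new request is below the current frequency and the up window otherwise. With the defaults in this file (down_throttle_nsec = THROTTLE_DOWN_NSEC = 50ms; up_throttle_nsec taken from the driver's transition latency, falling back to THROTTLE_UP_NSEC = 500us), a request to lower the OPP can therefore be delayed by up to ~50ms after the last transition, while a request to raise it waits at most the up window plus the ~100us usleep_range() slack.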
|
||||
|
||||
/*
|
||||
* we pass in struct cpufreq_policy. This is safe because changing out the
|
||||
* policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
|
||||
* which tears down all of the data structures and __cpufreq_governor(policy,
|
||||
* CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
|
||||
* new policy pointer
|
||||
*/
|
||||
static int cpufreq_sched_thread(void *data)
|
||||
{
|
||||
struct sched_param param;
|
||||
struct cpufreq_policy *policy;
|
||||
struct gov_data *gd;
|
||||
unsigned int new_request = 0;
|
||||
unsigned int last_request = 0;
|
||||
int ret;
|
||||
|
||||
policy = (struct cpufreq_policy *) data;
|
||||
gd = policy->governor_data;
|
||||
|
||||
param.sched_priority = 50;
|
||||
ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m);
|
||||
if (ret) {
|
||||
pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
|
||||
do_exit(-EINVAL);
|
||||
} else {
|
||||
pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
|
||||
__func__, gd->task->pid);
|
||||
}
|
||||
|
||||
do {
|
||||
new_request = gd->requested_freq;
|
||||
if (new_request == last_request) {
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
if (kthread_should_stop())
|
||||
break;
|
||||
schedule();
|
||||
} else {
|
||||
/*
|
||||
* if the frequency thread sleeps while waiting to be
|
||||
* unthrottled, start over to check for a newer request
|
||||
*/
|
||||
if (finish_last_request(gd, policy->cur))
|
||||
continue;
|
||||
last_request = new_request;
|
||||
cpufreq_sched_try_driver_target(policy, new_request);
|
||||
}
|
||||
} while (!kthread_should_stop());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cpufreq_sched_irq_work(struct irq_work *irq_work)
|
||||
{
|
||||
struct gov_data *gd;
|
||||
|
||||
gd = container_of(irq_work, struct gov_data, irq_work);
|
||||
if (!gd)
|
||||
return;
|
||||
|
||||
wake_up_process(gd->task);
|
||||
}
|
||||
|
||||
static void update_fdomain_capacity_request(int cpu)
|
||||
{
|
||||
unsigned int freq_new, index_new, cpu_tmp;
|
||||
struct cpufreq_policy *policy;
|
||||
struct gov_data *gd;
|
||||
unsigned long capacity = 0;
|
||||
|
||||
/*
|
||||
* Avoid grabbing the policy if possible. A test is still
|
||||
* required after locking the CPU's policy to avoid racing
|
||||
* with the governor changing.
|
||||
*/
|
||||
if (!per_cpu(enabled, cpu))
|
||||
return;
|
||||
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (IS_ERR_OR_NULL(policy))
|
||||
return;
|
||||
|
||||
if (policy->governor != &cpufreq_gov_sched ||
|
||||
!policy->governor_data)
|
||||
goto out;
|
||||
|
||||
gd = policy->governor_data;
|
||||
|
||||
/* find max capacity requested by cpus in this policy */
|
||||
for_each_cpu(cpu_tmp, policy->cpus) {
|
||||
struct sched_capacity_reqs *scr;
|
||||
|
||||
scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
|
||||
capacity = max(capacity, scr->total);
|
||||
}
|
||||
|
||||
/* Convert the new maximum capacity request into a cpu frequency */
|
||||
freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
|
||||
if (cpufreq_frequency_table_target(policy, policy->freq_table,
|
||||
freq_new, CPUFREQ_RELATION_L,
|
||||
&index_new))
|
||||
goto out;
|
||||
freq_new = policy->freq_table[index_new].frequency;
|
||||
|
||||
if (freq_new > policy->max)
|
||||
freq_new = policy->max;
|
||||
|
||||
if (freq_new < policy->min)
|
||||
freq_new = policy->min;
|
||||
|
||||
trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
|
||||
gd->requested_freq);
|
||||
if (freq_new == gd->requested_freq)
|
||||
goto out;
|
||||
|
||||
gd->requested_freq = freq_new;
|
||||
|
||||
/*
|
||||
* Throttling is not yet supported on platforms with fast cpufreq
|
||||
* drivers.
|
||||
*/
|
||||
if (cpufreq_driver_slow)
|
||||
irq_work_queue_on(&gd->irq_work, cpu);
|
||||
else
|
||||
cpufreq_sched_try_driver_target(policy, freq_new);
|
||||
|
||||
out:
|
||||
cpufreq_cpu_put(policy);
|
||||
}
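
The capacity-to-frequency conversion above is plain fixed-point arithmetic: the largest per-CPU capacity request in the policy (0..SCHED_CAPACITY_SCALE) scales policy->max, and the result is then snapped to a real OPP by the frequency-table lookup with CPUFREQ_RELATION_L. A standalone sketch of just that arithmetic, with invented numbers (a 1.8 GHz policy->max and an illustrative frequency table), not the kernel code itself:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10   /* capacity scale = 1024 */

/* pick the lowest table frequency at or above the target (CPUFREQ_RELATION_L) */
static unsigned int relation_l(const unsigned int *table, int n, unsigned int target)
{
        for (int i = 0; i < n; i++)
                if (table[i] >= target)
                        return table[i];
        return table[n - 1];
}

int main(void)
{
        /* illustrative OPPs in kHz, ascending */
        const unsigned int freq_table[] = { 400000, 800000, 1100000, 1500000, 1800000 };
        const int n = sizeof(freq_table) / sizeof(freq_table[0]);
        unsigned int policy_max = 1800000;     /* kHz */
        unsigned long capacity = 520;          /* max of the per-cpu scr->total values */

        unsigned int freq_new = capacity * policy_max >> SCHED_CAPACITY_SHIFT;
        printf("raw request : %u kHz\n", freq_new);                              /* 520*1800000/1024 = 914062 */
        printf("snapped OPP : %u kHz\n", relation_l(freq_table, n, freq_new));   /* 1100000 */
        return 0;
}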
|
||||
|
||||
void update_cpu_capacity_request(int cpu, bool request)
|
||||
{
|
||||
unsigned long new_capacity;
|
||||
struct sched_capacity_reqs *scr;
|
||||
|
||||
/* The rq lock serializes access to the CPU's sched_capacity_reqs. */
|
||||
lockdep_assert_held(&cpu_rq(cpu)->lock);
|
||||
|
||||
scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
|
||||
|
||||
new_capacity = scr->cfs + scr->rt;
|
||||
new_capacity = new_capacity * capacity_margin
|
||||
/ SCHED_CAPACITY_SCALE;
|
||||
new_capacity += scr->dl;
|
||||
|
||||
if (new_capacity == scr->total)
|
||||
return;
|
||||
|
||||
trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity);
|
||||
|
||||
scr->total = new_capacity;
|
||||
if (request)
|
||||
update_fdomain_capacity_request(cpu);
|
||||
}
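
update_cpu_capacity_request() folds the per-class requests into one number: the CFS and RT contributions are summed and inflated by capacity_margin so the chosen OPP has some headroom, while the DL contribution is added unscaled. The value of capacity_margin is not visible in this hunk; the sketch below assumes the ~25% margin (1280/1024) these patches commonly use, purely for illustration:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* assumed ~25% headroom; the real value comes from capacity_margin in fair.c */
static const unsigned long capacity_margin = 1280;

int main(void)
{
        unsigned long cfs = 300, rt = 100, dl = 64;   /* illustrative per-class requests */

        unsigned long total = (cfs + rt) * capacity_margin / SCHED_CAPACITY_SCALE + dl;
        printf("requested capacity = %lu of %lu\n", total, SCHED_CAPACITY_SCALE);
        /* (300 + 100) * 1280 / 1024 + 64 = 500 + 64 = 564 */
        return 0;
}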
|
||||
|
||||
static inline void set_sched_freq(void)
|
||||
{
|
||||
static_key_slow_inc(&__sched_freq);
|
||||
}
|
||||
|
||||
static inline void clear_sched_freq(void)
|
||||
{
|
||||
static_key_slow_dec(&__sched_freq);
|
||||
}
|
||||
|
||||
static struct attribute_group sched_attr_group_gov_pol;
|
||||
static struct attribute_group *get_sysfs_attr(void)
|
||||
{
|
||||
return &sched_attr_group_gov_pol;
|
||||
}
|
||||
|
||||
static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct gov_data *gd;
|
||||
int cpu;
|
||||
int rc;
|
||||
|
||||
for_each_cpu(cpu, policy->cpus)
|
||||
memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
|
||||
sizeof(struct sched_capacity_reqs));
|
||||
|
||||
gd = kzalloc(sizeof(*gd), GFP_KERNEL);
|
||||
if (!gd)
|
||||
return -ENOMEM;
|
||||
|
||||
gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
|
||||
policy->cpuinfo.transition_latency :
|
||||
THROTTLE_UP_NSEC;
|
||||
gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
|
||||
pr_debug("%s: throttle threshold = %u [ns]\n",
|
||||
__func__, gd->up_throttle_nsec);
|
||||
|
||||
rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
|
||||
if (rc) {
|
||||
pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
|
||||
goto err;
|
||||
}
|
||||
|
||||
policy->governor_data = gd;
|
||||
if (cpufreq_driver_is_slow()) {
|
||||
cpufreq_driver_slow = true;
|
||||
gd->task = kthread_create(cpufreq_sched_thread, policy,
|
||||
"kschedfreq:%d",
|
||||
cpumask_first(policy->related_cpus));
|
||||
if (IS_ERR_OR_NULL(gd->task)) {
|
||||
pr_err("%s: failed to create kschedfreq thread\n",
|
||||
__func__);
|
||||
goto err;
|
||||
}
|
||||
get_task_struct(gd->task);
|
||||
kthread_bind_mask(gd->task, policy->related_cpus);
|
||||
wake_up_process(gd->task);
|
||||
init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
|
||||
}
|
||||
|
||||
set_sched_freq();
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
policy->governor_data = NULL;
|
||||
kfree(gd);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct gov_data *gd = policy->governor_data;
|
||||
|
||||
clear_sched_freq();
|
||||
if (cpufreq_driver_slow) {
|
||||
kthread_stop(gd->task);
|
||||
put_task_struct(gd->task);
|
||||
}
|
||||
|
||||
sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
|
||||
|
||||
policy->governor_data = NULL;
|
||||
|
||||
kfree(gd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cpufreq_sched_start(struct cpufreq_policy *policy)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_cpu(cpu, policy->cpus)
|
||||
per_cpu(enabled, cpu) = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cpufreq_sched_limits(struct cpufreq_policy *policy)
|
||||
{
|
||||
unsigned int clamp_freq;
|
||||
struct gov_data *gd = policy->governor_data;
|
||||
|
||||
pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
|
||||
policy->cpu, policy->min, policy->max,
|
||||
policy->cur);
|
||||
|
||||
clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
|
||||
|
||||
if (policy->cur != clamp_freq)
|
||||
__cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
|
||||
}
|
||||
|
||||
static int cpufreq_sched_stop(struct cpufreq_policy *policy)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_cpu(cpu, policy->cpus)
|
||||
per_cpu(enabled, cpu) = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cpufreq_sched_setup(struct cpufreq_policy *policy,
|
||||
unsigned int event)
|
||||
{
|
||||
switch (event) {
|
||||
case CPUFREQ_GOV_POLICY_INIT:
|
||||
return cpufreq_sched_policy_init(policy);
|
||||
case CPUFREQ_GOV_POLICY_EXIT:
|
||||
return cpufreq_sched_policy_exit(policy);
|
||||
case CPUFREQ_GOV_START:
|
||||
return cpufreq_sched_start(policy);
|
||||
case CPUFREQ_GOV_STOP:
|
||||
return cpufreq_sched_stop(policy);
|
||||
case CPUFREQ_GOV_LIMITS:
|
||||
cpufreq_sched_limits(policy);
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Tunables */
|
||||
static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%u\n", gd->up_throttle_nsec);
|
||||
}
|
||||
|
||||
static ssize_t store_up_throttle_nsec(struct gov_data *gd,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
int ret;
|
||||
unsigned long val;
|
||||
|
||||
ret = kstrtoul(buf, 0, &val);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
gd->up_throttle_nsec = val;
|
||||
return count;
|
||||
}
|
||||
|
||||
static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%u\n", gd->down_throttle_nsec);
|
||||
}
|
||||
|
||||
static ssize_t store_down_throttle_nsec(struct gov_data *gd,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
int ret;
|
||||
unsigned long val;
|
||||
|
||||
ret = kstrtoul(buf, 0, &val);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
gd->down_throttle_nsec = val;
|
||||
return count;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create show/store routines
|
||||
* - sys: One governor instance for complete SYSTEM
|
||||
* - pol: One governor instance per struct cpufreq_policy
|
||||
*/
|
||||
#define show_gov_pol_sys(file_name) \
|
||||
static ssize_t show_##file_name##_gov_pol \
|
||||
(struct cpufreq_policy *policy, char *buf) \
|
||||
{ \
|
||||
return show_##file_name(policy->governor_data, buf); \
|
||||
}
|
||||
|
||||
#define store_gov_pol_sys(file_name) \
|
||||
static ssize_t store_##file_name##_gov_pol \
|
||||
(struct cpufreq_policy *policy, const char *buf, size_t count) \
|
||||
{ \
|
||||
return store_##file_name(policy->governor_data, buf, count); \
|
||||
}
|
||||
|
||||
#define gov_pol_attr_rw(_name) \
|
||||
static struct freq_attr _name##_gov_pol = \
|
||||
__ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
|
||||
|
||||
#define show_store_gov_pol_sys(file_name) \
|
||||
show_gov_pol_sys(file_name); \
|
||||
store_gov_pol_sys(file_name)
|
||||
#define tunable_handlers(file_name) \
|
||||
show_gov_pol_sys(file_name); \
|
||||
store_gov_pol_sys(file_name); \
|
||||
gov_pol_attr_rw(file_name)
|
||||
|
||||
tunable_handlers(down_throttle_nsec);
|
||||
tunable_handlers(up_throttle_nsec);
|
||||
|
||||
/* Per policy governor instance */
|
||||
static struct attribute *sched_attributes_gov_pol[] = {
|
||||
&up_throttle_nsec_gov_pol.attr,
|
||||
&down_throttle_nsec_gov_pol.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group sched_attr_group_gov_pol = {
|
||||
.attrs = sched_attributes_gov_pol,
|
||||
.name = "sched",
|
||||
};
|
||||
|
||||
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
|
||||
static
|
||||
#endif
|
||||
struct cpufreq_governor cpufreq_gov_sched = {
|
||||
.name = "sched",
|
||||
.governor = cpufreq_sched_setup,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init cpufreq_sched_init(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_cpu(cpu, cpu_possible_mask)
|
||||
per_cpu(enabled, cpu) = 0;
|
||||
return cpufreq_register_governor(&cpufreq_gov_sched);
|
||||
}
|
||||
|
||||
/* Try to make this the default governor */
|
||||
fs_initcall(cpufreq_sched_init);
|
|
@ -5,6 +5,7 @@
|
|||
#include <linux/static_key.h>
|
||||
#include <linux/context_tracking.h>
|
||||
#include "sched.h"
|
||||
#include "walt.h"
|
||||
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
|
@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr)
|
|||
unsigned long flags;
|
||||
s64 delta;
|
||||
int cpu;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
u64 wallclock;
|
||||
bool account = true;
|
||||
#endif
|
||||
|
||||
if (!sched_clock_irqtime)
|
||||
return;
|
||||
|
@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr)
|
|||
local_irq_save(flags);
|
||||
|
||||
cpu = smp_processor_id();
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
wallclock = sched_clock_cpu(cpu);
|
||||
#endif
|
||||
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
|
||||
__this_cpu_add(irq_start_time, delta);
|
||||
|
||||
|
@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr)
|
|||
__this_cpu_add(cpu_hardirq_time, delta);
|
||||
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
|
||||
__this_cpu_add(cpu_softirq_time, delta);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
else
|
||||
account = false;
|
||||
#endif
|
||||
|
||||
irq_time_write_end();
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (account)
|
||||
walt_account_irqtime(cpu, curr, delta, wallclock);
|
||||
#endif
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irqtime_account_irq);
|
||||
|
|
|
@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
|
|||
return !RB_EMPTY_NODE(&dl_se->rb_node);
|
||||
}
|
||||
|
||||
static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
{
|
||||
u64 se_bw = dl_se->dl_bw;
|
||||
|
||||
dl_rq->avg_bw += se_bw;
|
||||
}
|
||||
|
||||
static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
{
|
||||
u64 se_bw = dl_se->dl_bw;
|
||||
|
||||
dl_rq->avg_bw -= se_bw;
|
||||
if (dl_rq->avg_bw < 0) {
|
||||
WARN_ON(1);
|
||||
dl_rq->avg_bw = 0;
|
||||
}
|
||||
}
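
avg_bw is simply the sum of the admitted bandwidths (dl_se->dl_bw) of the deadline entities that have passed through this runqueue, and the push/pull paths below move a task's share along with it. dl_bw itself is a fixed-point runtime/period ratio; in this kernel it is produced by to_ratio() with a 20-bit fractional shift, which the sketch below takes as an assumption:

#include <stdio.h>

#define BW_SHIFT 20   /* assumed fixed-point shift used by to_ratio() */

/* fixed-point runtime/period, mirroring to_ratio(period, runtime) */
static unsigned long long to_ratio(unsigned long long period, unsigned long long runtime)
{
        return (runtime << BW_SHIFT) / period;
}

int main(void)
{
        /* a 10ms/100ms deadline task: 10% utilization */
        unsigned long long bw = to_ratio(100000000ULL, 10000000ULL);

        printf("dl_bw = %llu (%.3f of one CPU)\n", bw, (double)bw / (1 << BW_SHIFT));
        /* ~104857, i.e. ~0.100 */
        return 0;
}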
|
||||
|
||||
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
|
||||
{
|
||||
struct sched_dl_entity *dl_se = &p->dl;
|
||||
|
@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
|
|||
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
|
||||
struct rq *rq = rq_of_dl_rq(dl_rq);
|
||||
|
||||
if (dl_se->dl_new)
|
||||
add_average_bw(dl_se, dl_rq);
|
||||
|
||||
/*
|
||||
* The arrival of a new instance needs special treatment, i.e.,
|
||||
* the actual scheduling parameters have to be "renewed".
|
||||
|
@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq)
|
|||
curr->se.exec_start = rq_clock_task(rq);
|
||||
cpuacct_charge(curr, delta_exec);
|
||||
|
||||
sched_rt_avg_update(rq, delta_exec);
|
||||
|
||||
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
|
||||
if (dl_runtime_exceeded(dl_se)) {
|
||||
dl_se->dl_throttled = 1;
|
||||
|
@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p)
|
|||
static void task_dead_dl(struct task_struct *p)
|
||||
{
|
||||
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
|
||||
struct dl_rq *dl_rq = dl_rq_of_se(&p->dl);
|
||||
struct rq *rq = rq_of_dl_rq(dl_rq);
|
||||
|
||||
/*
|
||||
* Since we are TASK_DEAD we won't slip out of the domain!
|
||||
|
@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p)
|
|||
/* XXX we should retain the bw until 0-lag */
|
||||
dl_b->total_bw -= p->dl.dl_bw;
|
||||
raw_spin_unlock_irq(&dl_b->lock);
|
||||
|
||||
clear_average_bw(&p->dl, &rq->dl);
|
||||
}
|
||||
|
||||
static void set_curr_task_dl(struct rq *rq)
|
||||
|
@ -1556,7 +1579,9 @@ retry:
|
|||
}
|
||||
|
||||
deactivate_task(rq, next_task, 0);
|
||||
clear_average_bw(&next_task->dl, &rq->dl);
|
||||
set_task_cpu(next_task, later_rq->cpu);
|
||||
add_average_bw(&next_task->dl, &later_rq->dl);
|
||||
activate_task(later_rq, next_task, 0);
|
||||
ret = 1;
|
||||
|
||||
|
@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq)
|
|||
resched = true;
|
||||
|
||||
deactivate_task(src_rq, p, 0);
|
||||
clear_average_bw(&p->dl, &src_rq->dl);
|
||||
set_task_cpu(p, this_cpu);
|
||||
add_average_bw(&p->dl, &this_rq->dl);
|
||||
activate_task(this_rq, p, 0);
|
||||
dmin = p->dl.deadline;
|
||||
|
||||
|
@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
|
|||
if (!start_dl_timer(p))
|
||||
__dl_clear_params(p);
|
||||
|
||||
clear_average_bw(&p->dl, &rq->dl);
|
||||
|
||||
/*
|
||||
* Since this might be the only -deadline task on the rq,
|
||||
* this is the right place to try to pull some other one
|
||||
|
|
124
kernel/sched/energy.c
Normal file
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Obtain energy cost data from DT and populate relevant scheduler data
|
||||
* structures.
|
||||
*
|
||||
* Copyright (C) 2015 ARM Ltd.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#define pr_fmt(fmt) "sched-energy: " fmt
|
||||
|
||||
#define DEBUG
|
||||
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/of.h>
|
||||
#include <linux/printk.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched_energy.h>
|
||||
#include <linux/stddef.h>
|
||||
|
||||
struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
|
||||
|
||||
static void free_resources(void)
|
||||
{
|
||||
int cpu, sd_level;
|
||||
struct sched_group_energy *sge;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
for_each_possible_sd_level(sd_level) {
|
||||
sge = sge_array[cpu][sd_level];
|
||||
if (sge) {
|
||||
kfree(sge->cap_states);
|
||||
kfree(sge->idle_states);
|
||||
kfree(sge);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void init_sched_energy_costs(void)
|
||||
{
|
||||
struct device_node *cn, *cp;
|
||||
struct capacity_state *cap_states;
|
||||
struct idle_state *idle_states;
|
||||
struct sched_group_energy *sge;
|
||||
const struct property *prop;
|
||||
int sd_level, i, nstates, cpu;
|
||||
const __be32 *val;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
cn = of_get_cpu_node(cpu, NULL);
|
||||
if (!cn) {
|
||||
pr_warn("CPU device node missing for CPU %d\n", cpu);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!of_find_property(cn, "sched-energy-costs", NULL)) {
|
||||
pr_warn("CPU device node has no sched-energy-costs\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for_each_possible_sd_level(sd_level) {
|
||||
cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
|
||||
if (!cp)
|
||||
break;
|
||||
|
||||
prop = of_find_property(cp, "busy-cost-data", NULL);
|
||||
if (!prop || !prop->value) {
|
||||
pr_warn("No busy-cost data, skipping sched_energy init\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
sge = kcalloc(1, sizeof(struct sched_group_energy),
|
||||
GFP_NOWAIT);
|
||||
|
||||
nstates = (prop->length / sizeof(u32)) / 2;
|
||||
cap_states = kcalloc(nstates,
|
||||
sizeof(struct capacity_state),
|
||||
GFP_NOWAIT);
|
||||
|
||||
for (i = 0, val = prop->value; i < nstates; i++) {
|
||||
cap_states[i].cap = be32_to_cpup(val++);
|
||||
cap_states[i].power = be32_to_cpup(val++);
|
||||
}
|
||||
|
||||
sge->nr_cap_states = nstates;
|
||||
sge->cap_states = cap_states;
|
||||
|
||||
prop = of_find_property(cp, "idle-cost-data", NULL);
|
||||
if (!prop || !prop->value) {
|
||||
pr_warn("No idle-cost data, skipping sched_energy init\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
nstates = (prop->length / sizeof(u32));
|
||||
idle_states = kcalloc(nstates,
|
||||
sizeof(struct idle_state),
|
||||
GFP_NOWAIT);
|
||||
|
||||
for (i = 0, val = prop->value; i < nstates; i++)
|
||||
idle_states[i].power = be32_to_cpup(val++);
|
||||
|
||||
sge->nr_idle_states = nstates;
|
||||
sge->idle_states = idle_states;
|
||||
|
||||
sge_array[cpu][sd_level] = sge;
|
||||
}
|
||||
}
|
||||
|
||||
pr_info("Sched-energy-costs installed from DT\n");
|
||||
return;
|
||||
|
||||
out:
|
||||
free_resources();
|
||||
}
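
The property parsing above derives the number of states purely from the property length: busy-cost-data is read as <capacity power> cell pairs, so nstates = length / sizeof(u32) / 2, while idle-cost-data contributes one u32 per state. A small userspace sketch of that decode over a fake big-endian cell array (names and values invented for illustration):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>   /* ntohl(), standing in for be32_to_cpup() */

int main(void)
{
        /* pretend "busy-cost-data": three <cap power> pairs, stored big-endian */
        uint32_t prop[] = {
                htonl(235), htonl(33),
                htonl(417), htonl(74),
                htonl(446), htonl(90),
        };
        int nstates = (int)(sizeof(prop) / sizeof(uint32_t)) / 2;

        for (int i = 0; i < nstates; i++) {
                uint32_t cap   = ntohl(prop[2 * i]);
                uint32_t power = ntohl(prop[2 * i + 1]);
                printf("cap-state %d: cap=%u power=%u\n", i, cap, power);
        }
        return 0;
}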
|
1300
kernel/sched/fair.c
File diff suppressed because it is too large
|
@ -69,3 +69,8 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
|
|||
SCHED_FEAT(LB_MIN, false)
|
||||
SCHED_FEAT(ATTACH_AGE_LOAD, true)
|
||||
|
||||
/*
|
||||
* Energy aware scheduling. Use platform energy model to guide scheduling
|
||||
* decisions optimizing for energy efficiency.
|
||||
*/
|
||||
SCHED_FEAT(ENERGY_AWARE, false)
|
||||
|
|
|
@ -19,9 +19,10 @@
|
|||
* sched_idle_set_state - Record idle state for the current CPU.
|
||||
* @idle_state: State to record.
|
||||
*/
|
||||
void sched_idle_set_state(struct cpuidle_state *idle_state)
|
||||
void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
|
||||
{
|
||||
idle_set_state(this_rq(), idle_state);
|
||||
idle_set_state_idx(this_rq(), index);
|
||||
}
|
||||
|
||||
static int __read_mostly cpu_idle_force_poll;
|
||||
|
@ -219,6 +220,7 @@ static void cpu_idle_loop(void)
|
|||
*/
|
||||
|
||||
__current_set_polling();
|
||||
quiet_vmstat();
|
||||
tick_nohz_idle_enter();
|
||||
|
||||
while (!need_resched()) {
|
||||
|
|
|
@ -8,6 +8,8 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/irq_work.h>
|
||||
|
||||
#include "walt.h"
|
||||
|
||||
int sched_rr_timeslice = RR_TIMESLICE;
|
||||
|
||||
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
|
||||
|
@ -889,6 +891,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
|
|||
return rt_task_of(rt_se)->prio;
|
||||
}
|
||||
|
||||
static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
|
||||
{
|
||||
struct rt_prio_array *array = &rt_rq->active;
|
||||
struct sched_rt_entity *rt_se;
|
||||
char buf[500];
|
||||
char *pos = buf;
|
||||
char *end = buf + sizeof(buf);
|
||||
int idx;
|
||||
|
||||
pos += snprintf(pos, sizeof(buf),
|
||||
"sched: RT throttling activated for rt_rq %p (cpu %d)\n",
|
||||
rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
|
||||
|
||||
if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
|
||||
goto out;
|
||||
|
||||
pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
|
||||
idx = sched_find_first_bit(array->bitmap);
|
||||
while (idx < MAX_RT_PRIO) {
|
||||
list_for_each_entry(rt_se, array->queue + idx, run_list) {
|
||||
struct task_struct *p;
|
||||
|
||||
if (!rt_entity_is_task(rt_se))
|
||||
continue;
|
||||
|
||||
p = rt_task_of(rt_se);
|
||||
if (pos < end)
|
||||
pos += snprintf(pos, end - pos, "\t%s (%d)\n",
|
||||
p->comm, p->pid);
|
||||
}
|
||||
idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
|
||||
}
|
||||
out:
|
||||
#ifdef CONFIG_PANIC_ON_RT_THROTTLING
|
||||
/*
|
||||
* Use pr_err() in the BUG() case since printk_sched() will
|
||||
* not get flushed and deadlock is not a concern.
|
||||
*/
|
||||
pr_err("%s", buf);
|
||||
BUG();
|
||||
#else
|
||||
printk_deferred("%s", buf);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
|
||||
{
|
||||
u64 runtime = sched_rt_runtime(rt_rq);
|
||||
|
@ -912,8 +959,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
|
|||
* but accrue some time due to boosting.
|
||||
*/
|
||||
if (likely(rt_b->rt_runtime)) {
|
||||
static bool once = false;
|
||||
|
||||
rt_rq->rt_throttled = 1;
|
||||
printk_deferred_once("sched: RT throttling activated\n");
|
||||
|
||||
if (!once) {
|
||||
once = true;
|
||||
dump_throttled_rt_tasks(rt_rq);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* In case we did anyway, make it go away,
|
||||
|
@ -1261,6 +1314,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
|||
rt_se->timeout = 0;
|
||||
|
||||
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
|
||||
walt_inc_cumulative_runnable_avg(rq, p);
|
||||
|
||||
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
|
||||
enqueue_pushable_task(rq, p);
|
||||
|
@ -1272,6 +1326,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
|||
|
||||
update_curr_rt(rq);
|
||||
dequeue_rt_entity(rt_se);
|
||||
walt_dec_cumulative_runnable_avg(rq, p);
|
||||
|
||||
dequeue_pushable_task(rq, p);
|
||||
}
|
||||
|
@ -1426,6 +1481,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
|
|||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static void sched_rt_update_capacity_req(struct rq *rq)
|
||||
{
|
||||
u64 total, used, age_stamp, avg;
|
||||
s64 delta;
|
||||
|
||||
if (!sched_freq())
|
||||
return;
|
||||
|
||||
sched_avg_update(rq);
|
||||
/*
|
||||
* Since we're reading these variables without serialization make sure
|
||||
* we read them once before doing sanity checks on them.
|
||||
*/
|
||||
age_stamp = READ_ONCE(rq->age_stamp);
|
||||
avg = READ_ONCE(rq->rt_avg);
|
||||
delta = rq_clock(rq) - age_stamp;
|
||||
|
||||
if (unlikely(delta < 0))
|
||||
delta = 0;
|
||||
|
||||
total = sched_avg_period() + delta;
|
||||
|
||||
used = div_u64(avg, total);
|
||||
if (unlikely(used > SCHED_CAPACITY_SCALE))
|
||||
used = SCHED_CAPACITY_SCALE;
|
||||
|
||||
set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
|
||||
}
|
||||
#else
|
||||
static inline void sched_rt_update_capacity_req(struct rq *rq)
|
||||
{ }
|
||||
|
||||
#endif
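
sched_rt_update_capacity_req() turns rq->rt_avg into a 0..SCHED_CAPACITY_SCALE figure by dividing it by the averaging window (sched_avg_period() plus however far the clock has run past age_stamp) and hands that to the schedfreq governor as the RT capacity request. A sketch of the ratio with made-up numbers:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

int main(void)
{
        /* rq->rt_avg accumulates rt runtime pre-scaled by the frequency capacity
         * factor, so it is roughly "busy nanoseconds * capacity" */
        unsigned long long busy_ns    = 260000000ULL;                     /* 260ms of RT work */
        unsigned long long avg        = busy_ns * SCHED_CAPACITY_SCALE;   /* stands in for rq->rt_avg */
        unsigned long long avg_period = 500000000ULL;                     /* assumed sched_avg_period() */
        unsigned long long delta      = 120000000ULL;                     /* rq_clock(rq) - age_stamp */

        unsigned long long used = avg / (avg_period + delta);
        if (used > SCHED_CAPACITY_SCALE)
                used = SCHED_CAPACITY_SCALE;

        printf("rt capacity request = %llu / %llu\n", used, SCHED_CAPACITY_SCALE);
        /* 260000000*1024 / 620000000 ~= 429, i.e. roughly 42% of full capacity */
        return 0;
}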
|
||||
|
||||
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
|
||||
struct rt_rq *rt_rq)
|
||||
{
|
||||
|
@ -1494,8 +1584,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
|
|||
if (prev->sched_class == &rt_sched_class)
|
||||
update_curr_rt(rq);
|
||||
|
||||
if (!rt_rq->rt_queued)
|
||||
if (!rt_rq->rt_queued) {
|
||||
/*
|
||||
* The next task to be picked on this rq will have a lower
|
||||
* priority than rt tasks so we can spend some time to update
|
||||
* the capacity used by rt tasks based on the last activity.
|
||||
* This value will then be used as an estimation of the next
|
||||
* activity.
|
||||
*/
|
||||
sched_rt_update_capacity_req(rq);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
put_prev_task(rq, prev);
|
||||
|
||||
|
@ -2212,6 +2311,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
|
|||
|
||||
update_curr_rt(rq);
|
||||
|
||||
if (rq->rt.rt_nr_running)
|
||||
sched_rt_update_capacity_req(rq);
|
||||
|
||||
watchdog(rq, p);
|
||||
|
||||
/*
|
||||
|
|
|
@ -410,6 +410,10 @@ struct cfs_rq {
|
|||
struct list_head leaf_cfs_rq_list;
|
||||
struct task_group *tg; /* group that "owns" this runqueue */
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
u64 cumulative_runnable_avg;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
int runtime_enabled;
|
||||
u64 runtime_expires;
|
||||
|
@ -506,10 +510,18 @@ struct dl_rq {
|
|||
#else
|
||||
struct dl_bw dl_bw;
|
||||
#endif
|
||||
/* This is the "average utilization" for this runqueue */
|
||||
s64 avg_bw;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
struct max_cpu_capacity {
|
||||
raw_spinlock_t lock;
|
||||
unsigned long val;
|
||||
int cpu;
|
||||
};
|
||||
|
||||
/*
|
||||
* We add the notion of a root-domain which will be used to define per-domain
|
||||
* variables. Each exclusive cpuset essentially defines an island domain by
|
||||
|
@ -528,6 +540,9 @@ struct root_domain {
|
|||
/* Indicate more than one runnable task for any CPU */
|
||||
bool overload;
|
||||
|
||||
/* Indicate one or more cpus over-utilized (tipping point) */
|
||||
bool overutilized;
|
||||
|
||||
/*
|
||||
* The bit corresponding to a CPU gets set here if such CPU has more
|
||||
* than one runnable -deadline task (as it is below for RT tasks).
|
||||
|
@ -543,6 +558,9 @@ struct root_domain {
|
|||
*/
|
||||
cpumask_var_t rto_mask;
|
||||
struct cpupri cpupri;
|
||||
|
||||
/* Maximum cpu capacity in the system. */
|
||||
struct max_cpu_capacity max_cpu_capacity;
|
||||
};
|
||||
|
||||
extern struct root_domain def_root_domain;
|
||||
|
@ -572,6 +590,7 @@ struct rq {
|
|||
#define CPU_LOAD_IDX_MAX 5
|
||||
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
|
||||
unsigned long last_load_update_tick;
|
||||
unsigned int misfit_task;
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
u64 nohz_stamp;
|
||||
unsigned long nohz_flags;
|
||||
|
@ -579,6 +598,14 @@ struct rq {
|
|||
#ifdef CONFIG_NO_HZ_FULL
|
||||
unsigned long last_sched_tick;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CPU_QUIET
|
||||
/* time-based average load */
|
||||
u64 nr_last_stamp;
|
||||
u64 nr_running_integral;
|
||||
seqcount_t ave_seqcnt;
|
||||
#endif
|
||||
|
||||
/* capture load from *all* tasks on this cpu: */
|
||||
struct load_weight load;
|
||||
unsigned long nr_load_updates;
|
||||
|
@ -640,6 +667,30 @@ struct rq {
|
|||
u64 max_idle_balance_cost;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
/*
|
||||
* max_freq = user or thermal defined maximum
|
||||
* max_possible_freq = maximum supported by hardware
|
||||
*/
|
||||
unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
|
||||
struct cpumask freq_domain_cpumask;
|
||||
|
||||
u64 cumulative_runnable_avg;
|
||||
int efficiency; /* Differentiate cpus with different IPC capability */
|
||||
int load_scale_factor;
|
||||
int capacity;
|
||||
int max_possible_capacity;
|
||||
u64 window_start;
|
||||
u64 curr_runnable_sum;
|
||||
u64 prev_runnable_sum;
|
||||
u64 nt_curr_runnable_sum;
|
||||
u64 nt_prev_runnable_sum;
|
||||
u64 cur_irqload;
|
||||
u64 avg_irqload;
|
||||
u64 irqload_ts;
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
u64 prev_irq_time;
|
||||
#endif
|
||||
|
@ -687,6 +738,7 @@ struct rq {
|
|||
#ifdef CONFIG_CPU_IDLE
|
||||
/* Must be inspected within a rcu lock section */
|
||||
struct cpuidle_state *idle_state;
|
||||
int idle_state_idx;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -836,6 +888,8 @@ DECLARE_PER_CPU(int, sd_llc_id);
|
|||
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
|
||||
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
|
||||
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
|
||||
DECLARE_PER_CPU(struct sched_domain *, sd_ea);
|
||||
DECLARE_PER_CPU(struct sched_domain *, sd_scs);
|
||||
|
||||
struct sched_group_capacity {
|
||||
atomic_t ref;
|
||||
|
@ -843,7 +897,8 @@ struct sched_group_capacity {
|
|||
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
|
||||
* for a single CPU.
|
||||
*/
|
||||
unsigned int capacity;
|
||||
unsigned long capacity;
|
||||
unsigned long max_capacity; /* Max per-cpu capacity in group */
|
||||
unsigned long next_update;
|
||||
int imbalance; /* XXX unrelated to capacity but shared group state */
|
||||
/*
|
||||
|
@ -860,6 +915,7 @@ struct sched_group {
|
|||
|
||||
unsigned int group_weight;
|
||||
struct sched_group_capacity *sgc;
|
||||
const struct sched_group_energy *sge;
|
||||
|
||||
/*
|
||||
* The CPUs this group covers.
|
||||
|
@ -1163,6 +1219,7 @@ static const u32 prio_to_wmult[40] = {
|
|||
#endif
|
||||
#define ENQUEUE_REPLENISH 0x08
|
||||
#define ENQUEUE_RESTORE 0x10
|
||||
#define ENQUEUE_WAKEUP_NEW 0x20
|
||||
|
||||
#define DEQUEUE_SLEEP 0x01
|
||||
#define DEQUEUE_SAVE 0x02
|
||||
|
@ -1248,6 +1305,7 @@ extern const struct sched_class idle_sched_class;
|
|||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
|
||||
extern void update_group_capacity(struct sched_domain *sd, int cpu);
|
||||
|
||||
extern void trigger_load_balance(struct rq *rq);
|
||||
|
@ -1276,6 +1334,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
|
|||
WARN_ON(!rcu_read_lock_held());
|
||||
return rq->idle_state;
|
||||
}
|
||||
|
||||
static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
|
||||
{
|
||||
rq->idle_state_idx = idle_state_idx;
|
||||
}
|
||||
|
||||
static inline int idle_get_state_idx(struct rq *rq)
|
||||
{
|
||||
WARN_ON(!rcu_read_lock_held());
|
||||
return rq->idle_state_idx;
|
||||
}
|
||||
#else
|
||||
static inline void idle_set_state(struct rq *rq,
|
||||
struct cpuidle_state *idle_state)
|
||||
|
@ -1286,6 +1355,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
|
|||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int idle_get_state_idx(struct rq *rq)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
extern void sysrq_sched_debug_show(void);
|
||||
|
@ -1310,7 +1388,7 @@ unsigned long to_ratio(u64 period, u64 runtime);
|
|||
|
||||
extern void init_entity_runnable_average(struct sched_entity *se);
|
||||
|
||||
static inline void add_nr_running(struct rq *rq, unsigned count)
|
||||
static inline void __add_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
unsigned prev_nr = rq->nr_running;
|
||||
|
||||
|
@ -1338,11 +1416,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
|
|||
}
|
||||
}
|
||||
|
||||
static inline void sub_nr_running(struct rq *rq, unsigned count)
|
||||
static inline void __sub_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
rq->nr_running -= count;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CPU_QUIET
|
||||
#define NR_AVE_SCALE(x) ((x) << FSHIFT)
|
||||
static inline u64 do_nr_running_integral(struct rq *rq)
|
||||
{
|
||||
s64 nr, deltax;
|
||||
u64 nr_running_integral = rq->nr_running_integral;
|
||||
|
||||
deltax = rq->clock_task - rq->nr_last_stamp;
|
||||
nr = NR_AVE_SCALE(rq->nr_running);
|
||||
|
||||
nr_running_integral += nr * deltax;
|
||||
|
||||
return nr_running_integral;
|
||||
}
|
||||
|
||||
static inline void add_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
write_seqcount_begin(&rq->ave_seqcnt);
|
||||
rq->nr_running_integral = do_nr_running_integral(rq);
|
||||
rq->nr_last_stamp = rq->clock_task;
|
||||
__add_nr_running(rq, count);
|
||||
write_seqcount_end(&rq->ave_seqcnt);
|
||||
}
|
||||
|
||||
static inline void sub_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
write_seqcount_begin(&rq->ave_seqcnt);
|
||||
rq->nr_running_integral = do_nr_running_integral(rq);
|
||||
rq->nr_last_stamp = rq->clock_task;
|
||||
__sub_nr_running(rq, count);
|
||||
write_seqcount_end(&rq->ave_seqcnt);
|
||||
}
|
||||
#else
|
||||
#define add_nr_running __add_nr_running
|
||||
#define sub_nr_running __sub_nr_running
|
||||
#endif
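
The CONFIG_CPU_QUIET wrappers above keep a time-weighted integral of nr_running so a consumer can later compute the average number of runnable tasks over an arbitrary window: sample the integral and the timestamp at both ends, and the average is the integral delta over the time delta, still carrying FSHIFT fractional bits. A sketch of that consumer-side arithmetic (FSHIFT assumed to be the kernel's usual 11):

#include <stdio.h>

#define FSHIFT 11                      /* assumed fixed-point shift, as in NR_AVE_SCALE() */
#define NR_AVE_SCALE(x) ((x) << FSHIFT)

int main(void)
{
        /* two samples of (clock_task, nr_running_integral), 100ms apart */
        unsigned long long t0 = 0,            i0 = 0;
        unsigned long long t1 = 100000000ULL, i1;

        /* pretend 3 tasks were runnable for 60ms and 1 task for the remaining 40ms */
        i1 = i0 + NR_AVE_SCALE(3ULL) * 60000000ULL + NR_AVE_SCALE(1ULL) * 40000000ULL;

        unsigned long long ave_fixed = (i1 - i0) / (t1 - t0);   /* average, FSHIFT fractional bits */
        printf("average nr_running = %llu.%02llu\n",
               ave_fixed >> FSHIFT, (ave_fixed & ((1 << FSHIFT) - 1)) * 100 >> FSHIFT);
        /* 3*0.6 + 1*0.4 = 2.2 */
        return 0;
}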
|
||||
|
||||
static inline void rq_last_tick_reset(struct rq *rq)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
|
@ -1415,10 +1530,145 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long capacity_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity;
|
||||
}
|
||||
|
||||
static inline unsigned long capacity_orig_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
|
||||
extern unsigned int sysctl_sched_use_walt_cpu_util;
|
||||
extern unsigned int walt_ravg_window;
|
||||
extern unsigned int walt_disabled;
|
||||
|
||||
/*
|
||||
* cpu_util returns the amount of capacity of a CPU that is used by CFS
|
||||
* tasks. The unit of the return value must be the one of capacity so we can
|
||||
* compare the utilization with the capacity of the CPU that is available for
|
||||
* CFS task (ie cpu_capacity).
|
||||
*
|
||||
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
|
||||
* recent utilization of currently non-runnable tasks on a CPU. It represents
|
||||
* the amount of utilization of a CPU in the range [0..capacity_orig] where
|
||||
* capacity_orig is the cpu_capacity available at the highest frequency
|
||||
* (arch_scale_freq_capacity()).
|
||||
* The utilization of a CPU converges towards a sum equal to or less than the
|
||||
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
|
||||
* the running time on this CPU scaled by capacity_curr.
|
||||
*
|
||||
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
|
||||
* higher than capacity_orig because of unfortunate rounding in
|
||||
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
|
||||
* the average stabilizes with the new running time. We need to check that the
|
||||
* utilization stays within the range of [0..capacity_orig] and cap it if
|
||||
* necessary. Without utilization capping, a group could be seen as overloaded
|
||||
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
|
||||
* available capacity. We allow utilization to overshoot capacity_curr (but not
|
||||
* capacity_orig) as it is useful for predicting the capacity required after task
|
||||
* migrations (scheduler-driven DVFS).
|
||||
*/
|
||||
static inline unsigned long __cpu_util(int cpu, int delta)
|
||||
{
|
||||
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
|
||||
unsigned long capacity = capacity_orig_of(cpu);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
|
||||
util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
|
||||
walt_ravg_window;
|
||||
#endif
|
||||
delta += util;
|
||||
if (delta < 0)
|
||||
return 0;
|
||||
|
||||
return (delta >= capacity) ? capacity : delta;
|
||||
}
|
||||
|
||||
static inline unsigned long cpu_util(int cpu)
|
||||
{
|
||||
return __cpu_util(cpu, 0);
|
||||
}
|
||||
|
||||
#endif
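
A worked example of the clamping in __cpu_util(): with PELT the raw util_avg can transiently exceed the CPU's original capacity, and the delta argument lets callers ask what the utilization would be if a given task were moved here; under the WALT path the utilization is instead derived from prev_runnable_sum scaled by the window size. A minimal standalone sketch of the PELT-side arithmetic only:

#include <stdio.h>

/* clamp util+delta into [0, capacity_orig], mirroring __cpu_util() */
static unsigned long cpu_util_with(unsigned long util_avg, long delta,
                                   unsigned long capacity_orig)
{
        long v = (long)util_avg + delta;

        if (v < 0)
                return 0;
        return ((unsigned long)v >= capacity_orig) ? capacity_orig : (unsigned long)v;
}

int main(void)
{
        unsigned long cap = 1024;   /* illustrative capacity_orig of a big CPU */

        printf("%lu\n", cpu_util_with(800, 0, cap));     /* 800: plain cpu_util() */
        printf("%lu\n", cpu_util_with(800, 300, cap));   /* 1024: capped at capacity_orig */
        printf("%lu\n", cpu_util_with(200, -300, cap));  /* 0: negative delta floors at zero */
        return 0;
}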
|
||||
|
||||
#ifdef CONFIG_CPU_FREQ_GOV_SCHED
|
||||
#define capacity_max SCHED_CAPACITY_SCALE
|
||||
extern unsigned int capacity_margin;
|
||||
extern struct static_key __sched_freq;
|
||||
|
||||
static inline bool sched_freq(void)
|
||||
{
|
||||
return static_key_false(&__sched_freq);
|
||||
}
|
||||
|
||||
DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
|
||||
void update_cpu_capacity_request(int cpu, bool request);
|
||||
|
||||
static inline void set_cfs_cpu_capacity(int cpu, bool request,
|
||||
unsigned long capacity)
|
||||
{
|
||||
struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
|
||||
int rtdl = scr->rt + scr->dl;
|
||||
/*
|
||||
* WALT tracks the utilization of a CPU considering the load
|
||||
* generated by all the scheduling classes.
|
||||
* Since the following call to:
|
||||
* update_cpu_capacity
|
||||
* is already adding the RT and DL utilizations let's remove
|
||||
* these contributions from the WALT signal.
|
||||
*/
|
||||
if (capacity > rtdl)
|
||||
capacity -= rtdl;
|
||||
else
|
||||
capacity = 0;
|
||||
}
|
||||
#endif
|
||||
if (scr->cfs != capacity) {
|
||||
scr->cfs = capacity;
|
||||
update_cpu_capacity_request(cpu, request);
|
||||
}
|
||||
}
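
With WALT enabled the capacity passed into set_cfs_cpu_capacity() is the whole-CPU signal, so the RT and DL requests already tracked in scr are subtracted before it is stored as the CFS share; e.g. a WALT reading of 700 with scr->rt = 100 and scr->dl = 50 ends up as a CFS request of 550, and anything smaller than rt + dl collapses to zero rather than going negative.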
|
||||
|
||||
static inline void set_rt_cpu_capacity(int cpu, bool request,
|
||||
unsigned long capacity)
|
||||
{
|
||||
if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
|
||||
per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
|
||||
update_cpu_capacity_request(cpu, request);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void set_dl_cpu_capacity(int cpu, bool request,
|
||||
unsigned long capacity)
|
||||
{
|
||||
if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
|
||||
per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
|
||||
update_cpu_capacity_request(cpu, request);
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline bool sched_freq(void) { return false; }
|
||||
static inline void set_cfs_cpu_capacity(int cpu, bool request,
|
||||
unsigned long capacity)
|
||||
{ }
|
||||
static inline void set_rt_cpu_capacity(int cpu, bool request,
|
||||
unsigned long capacity)
|
||||
{ }
|
||||
static inline void set_dl_cpu_capacity(int cpu, bool request,
|
||||
unsigned long capacity)
|
||||
{ }
|
||||
#endif
|
||||
|
||||
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
|
||||
{
|
||||
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
|
||||
sched_avg_update(rq);
|
||||
}
|
||||
#else
|
||||
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
|
||||
|
@ -1507,6 +1757,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
|
|||
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
|
||||
}
|
||||
|
||||
extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
|
||||
extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#ifdef CONFIG_PREEMPT
|
||||
|
||||
|
@ -1579,7 +1832,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
|
|||
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
|
||||
__releases(busiest->lock)
|
||||
{
|
||||
raw_spin_unlock(&busiest->lock);
|
||||
if (this_rq != busiest)
|
||||
raw_spin_unlock(&busiest->lock);
|
||||
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "sched.h"
|
||||
#include "walt.h"
|
||||
|
||||
/*
|
||||
* stop-task scheduling class.
|
||||
|
@ -42,12 +43,14 @@ static void
|
|||
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
add_nr_running(rq, 1);
|
||||
walt_inc_cumulative_runnable_avg(rq, p);
|
||||
}
|
||||
|
||||
static void
|
||||
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
sub_nr_running(rq, 1);
|
||||
walt_dec_cumulative_runnable_avg(rq, p);
|
||||
}
|
||||
|
||||
static void yield_task_stop(struct rq *rq)
|
||||
|
|
949
kernel/sched/tune.c
Normal file
|
@ -0,0 +1,949 @@
|
|||
#include <linux/cgroup.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/printk.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <trace/events/sched.h>
|
||||
|
||||
#include "sched.h"
|
||||
#include "tune.h"
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHEDTUNE
|
||||
static bool schedtune_initialized = false;
|
||||
#endif
|
||||
|
||||
unsigned int sysctl_sched_cfs_boost __read_mostly;
|
||||
|
||||
extern struct target_nrg schedtune_target_nrg;
|
||||
|
||||
/* Performance Boost region (B) threshold params */
|
||||
static int perf_boost_idx;
|
||||
|
||||
/* Performance Constraint region (C) threshold params */
|
||||
static int perf_constrain_idx;
|
||||
|
||||
/**
|
||||
* Performance-Energy (P-E) Space thresholds constants
|
||||
*/
|
||||
struct threshold_params {
|
||||
int nrg_gain;
|
||||
int cap_gain;
|
||||
};
|
||||
|
||||
/*
|
||||
* System-specific P-E space threshold constants
|
||||
*/
|
||||
static struct threshold_params
|
||||
threshold_gains[] = {
|
||||
{ 0, 5 }, /* < 10% */
|
||||
{ 1, 5 }, /* < 20% */
|
||||
{ 2, 5 }, /* < 30% */
|
||||
{ 3, 5 }, /* < 40% */
|
||||
{ 4, 5 }, /* < 50% */
|
||||
{ 5, 4 }, /* < 60% */
|
||||
{ 5, 3 }, /* < 70% */
|
||||
{ 5, 2 }, /* < 80% */
|
||||
{ 5, 1 }, /* < 90% */
|
||||
{ 5, 0 } /* <= 100% */
|
||||
};
|
||||
|
||||
static int
|
||||
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
|
||||
int perf_boost_idx, int perf_constrain_idx)
|
||||
{
|
||||
int payoff = -INT_MAX;
|
||||
int gain_idx = -1;
|
||||
|
||||
/* Performance Boost (B) region */
|
||||
if (nrg_delta >= 0 && cap_delta > 0)
|
||||
gain_idx = perf_boost_idx;
|
||||
/* Performance Constraint (C) region */
|
||||
else if (nrg_delta < 0 && cap_delta <= 0)
|
||||
gain_idx = perf_constrain_idx;
|
||||
|
||||
/* Default: reject schedule candidate */
|
||||
if (gain_idx == -1)
|
||||
return payoff;
|
||||
|
||||
/*
|
||||
* Evaluate "Performance Boost" vs "Energy Increase"
|
||||
*
|
||||
* - Performance Boost (B) region
|
||||
*
|
||||
* Condition: nrg_delta > 0 && cap_delta > 0
|
||||
* Payoff criteria:
|
||||
* cap_gain / nrg_gain < cap_delta / nrg_delta =
|
||||
* cap_gain * nrg_delta < cap_delta * nrg_gain
|
||||
* Note that since both nrg_gain and nrg_delta are positive, the
|
||||
* inequality does not change. Thus:
|
||||
*
|
||||
* payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
|
||||
*
|
||||
* - Performance Constraint (C) region
|
||||
*
|
||||
* Condition: nrg_delta < 0 && cap_delta < 0
|
||||
* payoff criteria:
|
||||
* cap_gain / nrg_gain > cap_delta / nrg_delta =
|
||||
* cap_gain * nrg_delta < cap_delta * nrg_gain
|
||||
* Note that since nrg_gain > 0 while nrg_delta < 0, the
|
||||
* inequality changes. Thus:
|
||||
*
|
||||
* payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
|
||||
*
|
||||
* This means that, in case of same positive defined {cap,nrg}_gain
|
||||
* for both the B and C regions, we can use the same payoff formula
|
||||
* where a positive value represents the accept condition.
|
||||
*/
|
||||
payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
|
||||
payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
|
||||
|
||||
return payoff;
|
||||
}
|
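For reference, here is a minimal userspace restatement of the payoff rule implemented above, using the same gain pairs; it is an illustrative sketch only (plain C, no kernel dependencies), and the sample deltas are invented numbers rather than measured energy-model values.

#include <limits.h>
#include <stdio.h>

/* Same { nrg_gain, cap_gain } pairs as threshold_gains[] above */
static const int gains[][2] = {
	{ 0, 5 }, { 1, 5 }, { 2, 5 }, { 3, 5 }, { 4, 5 },
	{ 5, 4 }, { 5, 3 }, { 5, 2 }, { 5, 1 }, { 5, 0 },
};

/* Mirrors __schedtune_accept_deltas(): a positive payoff means "accept" */
static int payoff(int nrg_delta, int cap_delta, int gain_idx)
{
	/* Outside the B and C regions the candidate is rejected outright */
	if (!(nrg_delta >= 0 && cap_delta > 0) &&
	    !(nrg_delta < 0 && cap_delta <= 0))
		return -INT_MAX;

	return cap_delta * gains[gain_idx][0] - nrg_delta * gains[gain_idx][1];
}

int main(void)
{
	/* Using gain index 2 from the table above: nrg_gain=2, cap_gain=5 */
	printf("B: +5 energy, +20 capacity -> payoff %d\n", payoff(5, 20, 2));   /* 15 */
	printf("B: +30 energy, +10 capacity -> payoff %d\n", payoff(30, 10, 2)); /* -130 */
	printf("C: -20 energy, -10 capacity -> payoff %d\n", payoff(-20, -10, 2)); /* 80 */
	return 0;
}

The two accepted cases illustrate the intent: either a large capacity gain for little extra energy, or a large energy saving for a small capacity loss.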
||||
|
||||
#ifdef CONFIG_CGROUP_SCHEDTUNE
|
||||
|
||||
/*
|
||||
* EAS scheduler tunables for task groups.
|
||||
*/
|
||||
|
||||
/* SchedTune tunables for a group of tasks */
|
||||
struct schedtune {
|
||||
/* SchedTune CGroup subsystem */
|
||||
struct cgroup_subsys_state css;
|
||||
|
||||
/* Boost group allocated ID */
|
||||
int idx;
|
||||
|
||||
/* Boost value for tasks on that SchedTune CGroup */
|
||||
int boost;
|
||||
|
||||
/* Performance Boost (B) region threshold params */
|
||||
int perf_boost_idx;
|
||||
|
||||
/* Performance Constraint (C) region threshold params */
|
||||
int perf_constrain_idx;
|
||||
|
||||
/* Hint to bias scheduling of tasks on that SchedTune CGroup
|
||||
* towards idle CPUs */
|
||||
int prefer_idle;
|
||||
};
|
||||
|
||||
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
|
||||
{
|
||||
return css ? container_of(css, struct schedtune, css) : NULL;
|
||||
}
|
||||
|
||||
static inline struct schedtune *task_schedtune(struct task_struct *tsk)
|
||||
{
|
||||
return css_st(task_css(tsk, schedtune_cgrp_id));
|
||||
}
|
||||
|
||||
static inline struct schedtune *parent_st(struct schedtune *st)
|
||||
{
|
||||
return css_st(st->css.parent);
|
||||
}
|
||||
|
||||
/*
|
||||
* SchedTune root control group
|
||||
* The root control group is used to define a system-wide boosting tuning,
|
||||
* which is applied to all tasks in the system.
|
||||
* Task specific boost tuning could be specified by creating and
|
||||
* configuring a child control group under the root one.
|
||||
* By default, system-wide boosting is disabled, i.e. no boosting is applied
|
||||
* to tasks which are not in a child control group.
|
||||
*/
|
||||
static struct schedtune
|
||||
root_schedtune = {
|
||||
.boost = 0,
|
||||
.perf_boost_idx = 0,
|
||||
.perf_constrain_idx = 0,
|
||||
.prefer_idle = 0,
|
||||
};
|
||||
|
||||
int
|
||||
schedtune_accept_deltas(int nrg_delta, int cap_delta,
|
||||
struct task_struct *task)
|
||||
{
|
||||
struct schedtune *ct;
|
||||
int perf_boost_idx;
|
||||
int perf_constrain_idx;
|
||||
|
||||
/* Optimal (O) region */
|
||||
if (nrg_delta < 0 && cap_delta > 0) {
|
||||
trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
|
||||
return INT_MAX;
|
||||
}
|
||||
|
||||
/* Suboptimal (S) region */
|
||||
if (nrg_delta > 0 && cap_delta < 0) {
|
||||
trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
|
||||
return -INT_MAX;
|
||||
}
|
||||
|
||||
/* Get task specific perf Boost/Constraints indexes */
|
||||
rcu_read_lock();
|
||||
ct = task_schedtune(task);
|
||||
perf_boost_idx = ct->perf_boost_idx;
|
||||
perf_constrain_idx = ct->perf_constrain_idx;
|
||||
rcu_read_unlock();
|
||||
|
||||
return __schedtune_accept_deltas(nrg_delta, cap_delta,
|
||||
perf_boost_idx, perf_constrain_idx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Maximum number of boost groups to support
|
||||
* When per-task boosting is used we still allow only a limited number of
|
||||
* boost groups for two main reasons:
|
||||
* 1. on a real system we usually have only a few classes of workloads which
|
||||
* it makes sense to boost with different values (e.g. background vs foreground
|
||||
* tasks, interactive vs low-priority tasks)
|
||||
* 2. a limited number allows for a simpler and more memory/time efficient
|
||||
* implementation especially for the computation of the per-CPU boost
|
||||
* value
|
||||
*/
|
||||
#define BOOSTGROUPS_COUNT 4
|
||||
|
||||
/* Array of configured boostgroups */
|
||||
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
|
||||
&root_schedtune,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* SchedTune boost groups
|
||||
* Keep track of all the boost groups which impact a CPU, for example when a
|
||||
* CPU has two RUNNABLE tasks belonging to two different boost groups and thus
|
||||
* likely with different boost values.
|
||||
* Since on each system we expect only a limited number of boost groups, here
|
||||
* we use a simple array to keep track of the metrics required to compute the
|
||||
* maximum per-CPU boosting value.
|
||||
*/
|
||||
struct boost_groups {
|
||||
/* Maximum boost value for all RUNNABLE tasks on a CPU */
|
||||
bool idle;
|
||||
int boost_max;
|
||||
struct {
|
||||
/* The boost for tasks on that boost group */
|
||||
int boost;
|
||||
/* Count of RUNNABLE tasks on that boost group */
|
||||
unsigned tasks;
|
||||
} group[BOOSTGROUPS_COUNT];
|
||||
/* CPU's boost group locking */
|
||||
raw_spinlock_t lock;
|
||||
};
|
||||
|
||||
/* Boost groups affecting each CPU in the system */
|
||||
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
|
||||
|
||||
static void
|
||||
schedtune_cpu_update(int cpu)
|
||||
{
|
||||
struct boost_groups *bg;
|
||||
int boost_max;
|
||||
int idx;
|
||||
|
||||
bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
|
||||
/* The root boost group is always active */
|
||||
boost_max = bg->group[0].boost;
|
||||
for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
|
||||
/*
|
||||
* A boost group affects a CPU only if it has
|
||||
* RUNNABLE tasks on that CPU
|
||||
*/
|
||||
if (bg->group[idx].tasks == 0)
|
||||
continue;
|
||||
|
||||
boost_max = max(boost_max, bg->group[idx].boost);
|
||||
}
|
||||
/* Ensures boost_max is non-negative when all cgroup boost values
|
||||
* are negative. Avoids under-accounting of cpu capacity which may cause
|
||||
* task stacking and frequency spikes. */
|
||||
boost_max = max(boost_max, 0);
|
||||
bg->boost_max = boost_max;
|
||||
}
|
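To make the aggregation rule above concrete, here is a small standalone sketch of the same computation: the per-CPU boost is the maximum boost among the groups that currently have RUNNABLE tasks on that CPU, with the root group always counted and the result floored at zero. The group values below are made up for illustration only.

#include <stdio.h>

#define NGROUPS 4

struct grp { int boost; unsigned tasks; };

/* Mirrors schedtune_cpu_update(): max boost over active groups, floored at 0 */
static int cpu_boost_max(const struct grp *g)
{
	int max = g[0].boost;	/* the root group is always active */
	int i;

	for (i = 1; i < NGROUPS; i++) {
		if (g[i].tasks == 0)	/* inactive groups are ignored */
			continue;
		if (g[i].boost > max)
			max = g[i].boost;
	}
	return max > 0 ? max : 0;	/* never report a negative boost */
}

int main(void)
{
	/* Hypothetical CPU: root at -20, a +50 group with no runnable tasks,
	 * a -10 group with one task -> raw max is -10, clamped to 0. */
	struct grp g[NGROUPS] = {
		{ -20, 0 }, { 50, 0 }, { -10, 1 }, { 0, 0 },
	};

	printf("boost_max = %d\n", cpu_boost_max(g));	/* prints 0 */
	return 0;
}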
||||
|
||||
static int
|
||||
schedtune_boostgroup_update(int idx, int boost)
|
||||
{
|
||||
struct boost_groups *bg;
|
||||
int cur_boost_max;
|
||||
int old_boost;
|
||||
int cpu;
|
||||
|
||||
/* Update per CPU boost groups */
|
||||
for_each_possible_cpu(cpu) {
|
||||
bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
|
||||
/*
|
||||
* Keep track of current boost values to compute the per CPU
|
||||
* maximum only when it has been affected by the new value of
|
||||
* the updated boost group
|
||||
*/
|
||||
cur_boost_max = bg->boost_max;
|
||||
old_boost = bg->group[idx].boost;
|
||||
|
||||
/* Update the boost value of this boost group */
|
||||
bg->group[idx].boost = boost;
|
||||
|
||||
/* Check if this update increases the current max */
|
||||
if (boost > cur_boost_max && bg->group[idx].tasks) {
|
||||
bg->boost_max = boost;
|
||||
trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Check if this update has decreased the current max */
|
||||
if (cur_boost_max == old_boost && old_boost > boost) {
|
||||
schedtune_cpu_update(cpu);
|
||||
trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
|
||||
continue;
|
||||
}
|
||||
|
||||
trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define ENQUEUE_TASK 1
|
||||
#define DEQUEUE_TASK -1
|
||||
|
||||
static inline void
|
||||
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
|
||||
{
|
||||
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
int tasks = bg->group[idx].tasks + task_count;
|
||||
|
||||
/* Update boosted tasks count, never letting it go negative */
|
||||
bg->group[idx].tasks = max(0, tasks);
|
||||
|
||||
trace_sched_tune_tasks_update(p, cpu, tasks, idx,
|
||||
bg->group[idx].boost, bg->boost_max);
|
||||
|
||||
/* Boost group activation or deactivation on that RQ */
|
||||
if (tasks == 1 || tasks == 0)
|
||||
schedtune_cpu_update(cpu);
|
||||
}
|
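In other words, enqueues and dequeues normally just adjust the per-group counter; the per-CPU maximum is only recomputed on the activation (count reaches 1) and deactivation (count reaches 0) edges. A tiny illustrative check of that edge condition, with made-up names:

#include <stdio.h>

/* Sketch of the edge test used by schedtune_tasks_update() above */
static int needs_cpu_update(int tasks_before, int delta)
{
	int tasks_after = tasks_before + delta;

	return tasks_after == 1 || tasks_after == 0;
}

int main(void)
{
	printf("%d\n", needs_cpu_update(0, +1));	/* 0 -> 1: activate, prints 1 */
	printf("%d\n", needs_cpu_update(3, +1));	/* 3 -> 4: no change, prints 0 */
	printf("%d\n", needs_cpu_update(1, -1));	/* 1 -> 0: deactivate, prints 1 */
	return 0;
}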
||||
|
||||
/*
|
||||
* NOTE: This function must be called while holding the lock on the CPU RQ
|
||||
*/
|
||||
void schedtune_enqueue_task(struct task_struct *p, int cpu)
|
||||
{
|
||||
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
unsigned long irq_flags;
|
||||
struct schedtune *st;
|
||||
int idx;
|
||||
|
||||
if (unlikely(!schedtune_initialized))
|
||||
return;
|
||||
|
||||
/*
|
||||
* When a task is marked PF_EXITING by do_exit() it's going to be
|
||||
* dequeued and enqueued multiple times in the exit path.
|
||||
* Thus we avoid any further update, since we do not want to change
|
||||
* CPU boosting while the task is exiting.
|
||||
*/
|
||||
if (p->flags & PF_EXITING)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Boost group accounting is protected by a per-cpu lock and requires
|
||||
* interrupts to be disabled to avoid race conditions, for example on
|
||||
* do_exit()::cgroup_exit() and task migration.
|
||||
*/
|
||||
raw_spin_lock_irqsave(&bg->lock, irq_flags);
|
||||
rcu_read_lock();
|
||||
|
||||
st = task_schedtune(p);
|
||||
idx = st->idx;
|
||||
|
||||
schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
|
||||
|
||||
rcu_read_unlock();
|
||||
raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
|
||||
}
|
||||
|
||||
int schedtune_allow_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
/* We always allow tasks to be moved between existing CGroups */
|
||||
return 0;
|
||||
}
|
||||
|
||||
int schedtune_can_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
struct task_struct *task;
|
||||
struct cgroup_subsys_state *css;
|
||||
struct boost_groups *bg;
|
||||
unsigned long irq_flags;
|
||||
unsigned int cpu;
|
||||
struct rq *rq;
|
||||
int src_bg; /* Source boost group index */
|
||||
int dst_bg; /* Destination boost group index */
|
||||
int tasks;
|
||||
|
||||
if (unlikely(!schedtune_initialized))
|
||||
return 0;
|
||||
|
||||
|
||||
cgroup_taskset_for_each(task, css, tset) {
|
||||
|
||||
/*
|
||||
* Lock the RQ of the CPU the task is enqueued on, to avoid race
|
||||
* conditions with migration code while the task is being
|
||||
* accounted
|
||||
*/
|
||||
rq = lock_rq_of(task, &irq_flags);
|
||||
|
||||
if (!task->on_rq) {
|
||||
unlock_rq_of(rq, task, &irq_flags);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Boost group accounting is protected by a per-cpu lock and requires
|
||||
* interrupts to be disabled to avoid race conditions on...
|
||||
*/
|
||||
cpu = cpu_of(rq);
|
||||
bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
raw_spin_lock(&bg->lock);
|
||||
|
||||
dst_bg = css_st(css)->idx;
|
||||
src_bg = task_schedtune(task)->idx;
|
||||
|
||||
/*
|
||||
* Current task is not changing boostgroup, which can
|
||||
* happen when the new hierarchy is in use.
|
||||
*/
|
||||
if (unlikely(dst_bg == src_bg)) {
|
||||
raw_spin_unlock(&bg->lock);
|
||||
unlock_rq_of(rq, task, &irq_flags);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the case of a RUNNABLE task which is switching its
|
||||
* current boost group.
|
||||
*/
|
||||
|
||||
/* Move task from src to dst boost group */
|
||||
tasks = bg->group[src_bg].tasks - 1;
|
||||
bg->group[src_bg].tasks = max(0, tasks);
|
||||
bg->group[dst_bg].tasks += 1;
|
||||
|
||||
raw_spin_unlock(&bg->lock);
|
||||
unlock_rq_of(rq, task, &irq_flags);
|
||||
|
||||
/* Update CPU boost group */
|
||||
if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
|
||||
schedtune_cpu_update(task_cpu(task));
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void schedtune_cancel_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
/* This can happen only if SchedTune controller is mounted with
|
||||
* other hierarchies and one of them fails. Since usually SchedTune is
|
||||
* mounted on its own hierarchy, for the time being we do not implement
|
||||
* a proper rollback mechanism */
|
||||
WARN(1, "SchedTune cancel attach not implemented");
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: This function must be called while holding the lock on the CPU RQ
|
||||
*/
|
||||
void schedtune_dequeue_task(struct task_struct *p, int cpu)
|
||||
{
|
||||
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
unsigned long irq_flags;
|
||||
struct schedtune *st;
|
||||
int idx;
|
||||
|
||||
if (unlikely(!schedtune_initialized))
|
||||
return;
|
||||
|
||||
/*
|
||||
* When a task is marked PF_EXITING by do_exit() it's going to be
|
||||
* dequeued and enqueued multiple times in the exit path.
|
||||
* Thus we avoid any further update, since we do not want to change
|
||||
* CPU boosting while the task is exiting.
|
||||
* The last dequeue is already enforced by the do_exit() code path
|
||||
* via schedtune_exit_task().
|
||||
*/
|
||||
if (p->flags & PF_EXITING)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Boost group accounting is protected by a per-cpu lock and requires
|
||||
* interrupts to be disabled to avoid race conditions on...
|
||||
*/
|
||||
raw_spin_lock_irqsave(&bg->lock, irq_flags);
|
||||
rcu_read_lock();
|
||||
|
||||
st = task_schedtune(p);
|
||||
idx = st->idx;
|
||||
|
||||
schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
|
||||
|
||||
rcu_read_unlock();
|
||||
raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
|
||||
}
|
||||
|
||||
void schedtune_exit_task(struct task_struct *tsk)
|
||||
{
|
||||
struct schedtune *st;
|
||||
unsigned long irq_flags;
|
||||
unsigned int cpu;
|
||||
struct rq *rq;
|
||||
int idx;
|
||||
|
||||
if (unlikely(!schedtune_initialized))
|
||||
return;
|
||||
|
||||
rq = lock_rq_of(tsk, &irq_flags);
|
||||
rcu_read_lock();
|
||||
|
||||
cpu = cpu_of(rq);
|
||||
st = task_schedtune(tsk);
|
||||
idx = st->idx;
|
||||
schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
|
||||
|
||||
rcu_read_unlock();
|
||||
unlock_rq_of(rq, tsk, &irq_flags);
|
||||
}
|
||||
|
||||
int schedtune_cpu_boost(int cpu)
|
||||
{
|
||||
struct boost_groups *bg;
|
||||
|
||||
bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
return bg->boost_max;
|
||||
}
|
||||
|
||||
int schedtune_task_boost(struct task_struct *p)
|
||||
{
|
||||
struct schedtune *st;
|
||||
int task_boost;
|
||||
|
||||
/* Get task boost value */
|
||||
rcu_read_lock();
|
||||
st = task_schedtune(p);
|
||||
task_boost = st->boost;
|
||||
rcu_read_unlock();
|
||||
|
||||
return task_boost;
|
||||
}
|
||||
|
||||
int schedtune_prefer_idle(struct task_struct *p)
|
||||
{
|
||||
struct schedtune *st;
|
||||
int prefer_idle;
|
||||
|
||||
/* Get prefer_idle value */
|
||||
rcu_read_lock();
|
||||
st = task_schedtune(p);
|
||||
prefer_idle = st->prefer_idle;
|
||||
rcu_read_unlock();
|
||||
|
||||
return prefer_idle;
|
||||
}
|
||||
|
||||
static u64
|
||||
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
struct schedtune *st = css_st(css);
|
||||
|
||||
return st->prefer_idle;
|
||||
}
|
||||
|
||||
static int
|
||||
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
u64 prefer_idle)
|
||||
{
|
||||
struct schedtune *st = css_st(css);
|
||||
st->prefer_idle = prefer_idle;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static s64
|
||||
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
struct schedtune *st = css_st(css);
|
||||
|
||||
return st->boost;
|
||||
}
|
||||
|
||||
static int
|
||||
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
s64 boost)
|
||||
{
|
||||
struct schedtune *st = css_st(css);
|
||||
unsigned threshold_idx;
|
||||
int boost_pct;
|
||||
|
||||
if (boost < -100 || boost > 100)
|
||||
return -EINVAL;
|
||||
boost_pct = boost;
|
||||
|
||||
/*
|
||||
* Update threshold params for Performance Boost (B)
|
||||
* and Performance Constraint (C) regions.
|
||||
* The current implementation uses the same cuts for both
|
||||
* B and C regions.
|
||||
*/
|
||||
threshold_idx = clamp(boost_pct, 0, 99) / 10;
|
||||
st->perf_boost_idx = threshold_idx;
|
||||
st->perf_constrain_idx = threshold_idx;
|
||||
|
||||
st->boost = boost;
|
||||
if (css == &root_schedtune.css) {
|
||||
sysctl_sched_cfs_boost = boost;
|
||||
perf_boost_idx = threshold_idx;
|
||||
perf_constrain_idx = threshold_idx;
|
||||
}
|
||||
|
||||
/* Update CPU boost */
|
||||
schedtune_boostgroup_update(st->idx, st->boost);
|
||||
|
||||
trace_sched_tune_config(st->boost);
|
||||
|
||||
return 0;
|
||||
}
|
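The boost-to-threshold mapping used above is simply clamp(boost, 0, 99) / 10, so for example a 25% boost selects threshold_gains[2] ({ 2, 5 }) and any negative boost falls back to index 0. A minimal userspace restatement of just that mapping:

#include <stdio.h>

/* Same cut points as boost_write() and sysctl_sched_cfs_boost_handler():
 * one threshold_gains[] row per 10% of boost, clamped to [0..99]. */
static unsigned threshold_index(int boost_pct)
{
	if (boost_pct < 0)
		boost_pct = 0;
	if (boost_pct > 99)
		boost_pct = 99;
	return boost_pct / 10;
}

int main(void)
{
	printf("boost  25 -> idx %u\n", threshold_index(25));	/* 2 */
	printf("boost 100 -> idx %u\n", threshold_index(100));	/* 9 */
	printf("boost -40 -> idx %u\n", threshold_index(-40));	/* 0 */
	return 0;
}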
||||
|
||||
static struct cftype files[] = {
|
||||
{
|
||||
.name = "boost",
|
||||
.read_s64 = boost_read,
|
||||
.write_s64 = boost_write,
|
||||
},
|
||||
{
|
||||
.name = "prefer_idle",
|
||||
.read_u64 = prefer_idle_read,
|
||||
.write_u64 = prefer_idle_write,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
static int
|
||||
schedtune_boostgroup_init(struct schedtune *st)
|
||||
{
|
||||
struct boost_groups *bg;
|
||||
int cpu;
|
||||
|
||||
/* Keep track of allocated boost groups */
|
||||
allocated_group[st->idx] = st;
|
||||
|
||||
/* Initialize the per CPU boost groups */
|
||||
for_each_possible_cpu(cpu) {
|
||||
bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
bg->group[st->idx].boost = 0;
|
||||
bg->group[st->idx].tasks = 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct schedtune *st;
|
||||
int idx;
|
||||
|
||||
if (!parent_css)
|
||||
return &root_schedtune.css;
|
||||
|
||||
/* Allow only single-level hierarchies */
|
||||
if (parent_css != &root_schedtune.css) {
|
||||
pr_err("Nested SchedTune boosting groups not allowed\n");
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* Allow only a limited number of boosting groups */
|
||||
for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
|
||||
if (!allocated_group[idx])
|
||||
break;
|
||||
if (idx == BOOSTGROUPS_COUNT) {
|
||||
pr_err("Trying to create more than %d SchedTune boosting groups\n",
|
||||
BOOSTGROUPS_COUNT);
|
||||
return ERR_PTR(-ENOSPC);
|
||||
}
|
||||
|
||||
st = kzalloc(sizeof(*st), GFP_KERNEL);
|
||||
if (!st)
|
||||
goto out;
|
||||
|
||||
/* Initialize per-CPU boost group support */
|
||||
st->idx = idx;
|
||||
if (schedtune_boostgroup_init(st))
|
||||
goto release;
|
||||
|
||||
return &st->css;
|
||||
|
||||
release:
|
||||
kfree(st);
|
||||
out:
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
static void
|
||||
schedtune_boostgroup_release(struct schedtune *st)
|
||||
{
|
||||
/* Reset this boost group */
|
||||
schedtune_boostgroup_update(st->idx, 0);
|
||||
|
||||
/* Remove this boost group from the tracked set */
|
||||
allocated_group[st->idx] = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
schedtune_css_free(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct schedtune *st = css_st(css);
|
||||
|
||||
schedtune_boostgroup_release(st);
|
||||
kfree(st);
|
||||
}
|
||||
|
||||
struct cgroup_subsys schedtune_cgrp_subsys = {
|
||||
.css_alloc = schedtune_css_alloc,
|
||||
.css_free = schedtune_css_free,
|
||||
// .allow_attach = schedtune_allow_attach,
|
||||
.can_attach = schedtune_can_attach,
|
||||
.cancel_attach = schedtune_cancel_attach,
|
||||
.legacy_cftypes = files,
|
||||
.early_init = 1,
|
||||
};
|
||||
|
||||
static inline void
|
||||
schedtune_init_cgroups(void)
|
||||
{
|
||||
struct boost_groups *bg;
|
||||
int cpu;
|
||||
|
||||
/* Initialize the per CPU boost groups */
|
||||
for_each_possible_cpu(cpu) {
|
||||
bg = &per_cpu(cpu_boost_groups, cpu);
|
||||
memset(bg, 0, sizeof(struct boost_groups));
|
||||
}
|
||||
|
||||
pr_info("schedtune: configured to support %d boost groups\n",
|
||||
BOOSTGROUPS_COUNT);
|
||||
}
|
||||
|
||||
#else /* CONFIG_CGROUP_SCHEDTUNE */
|
||||
|
||||
int
|
||||
schedtune_accept_deltas(int nrg_delta, int cap_delta,
|
||||
struct task_struct *task)
|
||||
{
|
||||
/* Optimal (O) region */
|
||||
if (nrg_delta < 0 && cap_delta > 0) {
|
||||
trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
|
||||
return INT_MAX;
|
||||
}
|
||||
|
||||
/* Suboptimal (S) region */
|
||||
if (nrg_delta > 0 && cap_delta < 0) {
|
||||
trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
|
||||
return -INT_MAX;
|
||||
}
|
||||
|
||||
return __schedtune_accept_deltas(nrg_delta, cap_delta,
|
||||
perf_boost_idx, perf_constrain_idx);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHEDTUNE */
|
||||
|
||||
int
|
||||
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
unsigned threshold_idx;
|
||||
int boost_pct;
|
||||
|
||||
if (ret || !write)
|
||||
return ret;
|
||||
|
||||
if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
|
||||
return -EINVAL;
|
||||
boost_pct = sysctl_sched_cfs_boost;
|
||||
|
||||
/*
|
||||
* Update threshold params for Performance Boost (B)
|
||||
* and Performance Constraint (C) regions.
|
||||
* The current implementation uses the same cuts for both
|
||||
* B and C regions.
|
||||
*/
|
||||
threshold_idx = clamp(boost_pct, 0, 99) / 10;
|
||||
perf_boost_idx = threshold_idx;
|
||||
perf_constrain_idx = threshold_idx;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
static void
|
||||
schedtune_test_nrg(unsigned long delta_pwr)
|
||||
{
|
||||
unsigned long test_delta_pwr;
|
||||
unsigned long test_norm_pwr;
|
||||
int idx;
|
||||
|
||||
/*
|
||||
* Check normalization constants using some constant system
|
||||
* energy values
|
||||
*/
|
||||
pr_info("schedtune: verify normalization constants...\n");
|
||||
for (idx = 0; idx < 6; ++idx) {
|
||||
test_delta_pwr = delta_pwr >> idx;
|
||||
|
||||
/* Normalize on max energy for target platform */
|
||||
test_norm_pwr = reciprocal_divide(
|
||||
test_delta_pwr << SCHED_LOAD_SHIFT,
|
||||
schedtune_target_nrg.rdiv);
|
||||
|
||||
pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
|
||||
idx, test_delta_pwr, test_norm_pwr);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#define schedtune_test_nrg(delta_pwr)
|
||||
#endif
|
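Stripped of the reciprocal_divide() machinery, schedtune_test_nrg() above prints the normalization of successively halved energy deltas: a delta scaled by SCHED_LOAD_SCALE and divided by the platform's max-min power range, which always lands in [0..SCHED_LOAD_SCALE]. The sketch below uses plain integer division and assumes SCHED_LOAD_SHIFT is 10 (SCHED_LOAD_SCALE 1024), the usual configuration; the power figures are invented.

#include <stdio.h>

#define EXAMPLE_LOAD_SHIFT 10	/* stand-in for SCHED_LOAD_SHIFT */

/* Plain-division stand-in for the reciprocal-based normalization above */
static unsigned long normalize_energy(unsigned long delta_pwr,
				      unsigned long min_pwr,
				      unsigned long max_pwr)
{
	return (delta_pwr << EXAMPLE_LOAD_SHIFT) / (max_pwr - min_pwr);
}

int main(void)
{
	/* Hypothetical platform-wide power range: 100..4196 (range 4096) */
	unsigned long min_pwr = 100, max_pwr = 4196;

	printf("norm(2048) = %lu\n", normalize_energy(2048, min_pwr, max_pwr)); /* 512 */
	printf("norm(4096) = %lu\n", normalize_energy(4096, min_pwr, max_pwr)); /* 1024 */
	return 0;
}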
||||
|
||||
/*
|
||||
* Compute the min/max power consumption of a cluster and all its CPUs
|
||||
*/
|
||||
static void
|
||||
schedtune_add_cluster_nrg(
|
||||
struct sched_domain *sd,
|
||||
struct sched_group *sg,
|
||||
struct target_nrg *ste)
|
||||
{
|
||||
struct sched_domain *sd2;
|
||||
struct sched_group *sg2;
|
||||
|
||||
struct cpumask *cluster_cpus;
|
||||
char str[32];
|
||||
|
||||
unsigned long min_pwr;
|
||||
unsigned long max_pwr;
|
||||
int cpu;
|
||||
|
||||
/* Get Cluster energy using EM data for the first CPU */
|
||||
cluster_cpus = sched_group_cpus(sg);
|
||||
snprintf(str, 32, "CLUSTER[%*pbl]",
|
||||
cpumask_pr_args(cluster_cpus));
|
||||
|
||||
min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
|
||||
max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
|
||||
pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
|
||||
str, min_pwr, max_pwr);
|
||||
|
||||
/*
|
||||
* Keep track of this cluster's energy in the computation of the
|
||||
* overall system energy
|
||||
*/
|
||||
ste->min_power += min_pwr;
|
||||
ste->max_power += max_pwr;
|
||||
|
||||
/* Get CPU energy using EM data for each CPU in the group */
|
||||
for_each_cpu(cpu, cluster_cpus) {
|
||||
/* Get a SD view for the specific CPU */
|
||||
for_each_domain(cpu, sd2) {
|
||||
/* Get the CPU group */
|
||||
sg2 = sd2->groups;
|
||||
min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
|
||||
max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
|
||||
|
||||
ste->min_power += min_pwr;
|
||||
ste->max_power += max_pwr;
|
||||
|
||||
snprintf(str, 32, "CPU[%d]", cpu);
|
||||
pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
|
||||
str, min_pwr, max_pwr);
|
||||
|
||||
/*
|
||||
* Assume we have EM data only at the CPU and
|
||||
* the upper CLUSTER level
|
||||
*/
|
||||
BUG_ON(!cpumask_equal(
|
||||
sched_group_cpus(sg),
|
||||
sched_group_cpus(sd2->parent->groups)
|
||||
));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the constants required to compute normalized energy.
|
||||
* The values of these constants depend on the EM data for the specific
|
||||
* target system and topology.
|
||||
* Thus, this function is expected to be called by the code
|
||||
* that binds the EM to the topology information.
|
||||
*/
|
||||
static int
|
||||
schedtune_init(void)
|
||||
{
|
||||
struct target_nrg *ste = &schedtune_target_nrg;
|
||||
unsigned long delta_pwr = 0;
|
||||
struct sched_domain *sd;
|
||||
struct sched_group *sg;
|
||||
|
||||
pr_info("schedtune: init normalization constants...\n");
|
||||
ste->max_power = 0;
|
||||
ste->min_power = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
/*
|
||||
* When EAS is in use, we always have a pointer to the highest SD
|
||||
* which provides EM data.
|
||||
*/
|
||||
sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
|
||||
if (!sd) {
|
||||
pr_info("schedtune: no energy model data\n");
|
||||
goto nodata;
|
||||
}
|
||||
|
||||
sg = sd->groups;
|
||||
do {
|
||||
schedtune_add_cluster_nrg(sd, sg, ste);
|
||||
} while (sg = sg->next, sg != sd->groups);
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
|
||||
"SYSTEM", ste->min_power, ste->max_power);
|
||||
|
||||
/* Compute normalization constants */
|
||||
delta_pwr = ste->max_power - ste->min_power;
|
||||
ste->rdiv = reciprocal_value(delta_pwr);
|
||||
pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
|
||||
ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
|
||||
|
||||
schedtune_test_nrg(delta_pwr);
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHEDTUNE
|
||||
schedtune_init_cgroups();
|
||||
#else
|
||||
pr_info("schedtune: configured to support global boosting only\n");
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
||||
nodata:
|
||||
rcu_read_unlock();
|
||||
return -EINVAL;
|
||||
}
|
||||
postcore_initcall(schedtune_init);
|
55
kernel/sched/tune.h
Normal file
|
@@ -0,0 +1,55 @@
|
|||
|
||||
#ifdef CONFIG_SCHED_TUNE
|
||||
|
||||
#include <linux/reciprocal_div.h>
|
||||
|
||||
/*
|
||||
* System energy normalization constants
|
||||
*/
|
||||
struct target_nrg {
|
||||
unsigned long min_power;
|
||||
unsigned long max_power;
|
||||
struct reciprocal_value rdiv;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHEDTUNE
|
||||
|
||||
int schedtune_cpu_boost(int cpu);
|
||||
int schedtune_task_boost(struct task_struct *tsk);
|
||||
|
||||
int schedtune_prefer_idle(struct task_struct *tsk);
|
||||
|
||||
void schedtune_exit_task(struct task_struct *tsk);
|
||||
|
||||
void schedtune_enqueue_task(struct task_struct *p, int cpu);
|
||||
void schedtune_dequeue_task(struct task_struct *p, int cpu);
|
||||
|
||||
#else /* CONFIG_CGROUP_SCHEDTUNE */
|
||||
|
||||
#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
|
||||
#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
|
||||
|
||||
#define schedtune_exit_task(task) do { } while (0)
|
||||
|
||||
#define schedtune_enqueue_task(task, cpu) do { } while (0)
|
||||
#define schedtune_dequeue_task(task, cpu) do { } while (0)
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHEDTUNE */
|
||||
|
||||
int schedtune_normalize_energy(int energy);
|
||||
int schedtune_accept_deltas(int nrg_delta, int cap_delta,
|
||||
struct task_struct *task);
|
||||
|
||||
#else /* CONFIG_SCHED_TUNE */
|
||||
|
||||
#define schedtune_cpu_boost(cpu) 0
|
||||
#define schedtune_task_boost(tsk) 0
|
||||
|
||||
#define schedtune_exit_task(task) do { } while (0)
|
||||
|
||||
#define schedtune_enqueue_task(task, cpu) do { } while (0)
|
||||
#define schedtune_dequeue_task(task, cpu) do { } while (0)
|
||||
|
||||
#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
|
||||
|
||||
#endif /* CONFIG_SCHED_TUNE */
|
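One detail worth noting about the stub variants above: the do { } while (0) form keeps each no-op macro a single statement, so call sites behave identically whether the real function or the stub is compiled in. A tiny standalone check of that property, with a hypothetical macro name:

#include <stdio.h>

/* Hypothetical no-op stub in the style of the header above */
#define demo_dequeue_task(task, cpu) do { } while (0)

int main(void)
{
	int on_rq = 1;

	/* Expands to one statement, so it nests safely in a brace-less
	 * if/else - the reason for the do/while(0) idiom. */
	if (on_rq)
		demo_dequeue_task(NULL, 0);
	else
		printf("not runnable\n");

	printf("ok\n");
	return 0;
}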
1170
kernel/sched/walt.c
Normal file
File diff suppressed because it is too large
62
kernel/sched/walt.h
Normal file
|
@@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 and
|
||||
* only version 2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#ifndef __WALT_H
|
||||
#define __WALT_H
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
|
||||
void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
|
||||
u64 wallclock, u64 irqtime);
|
||||
void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
|
||||
void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
|
||||
void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
|
||||
struct task_struct *p);
|
||||
void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
|
||||
struct task_struct *p);
|
||||
void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
|
||||
void walt_init_new_task_load(struct task_struct *p);
|
||||
void walt_mark_task_starting(struct task_struct *p);
|
||||
void walt_set_window_start(struct rq *rq);
|
||||
void walt_migrate_sync_cpu(int cpu);
|
||||
void walt_init_cpu_efficiency(void);
|
||||
u64 walt_ktime_clock(void);
|
||||
void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
|
||||
u64 wallclock);
|
||||
|
||||
u64 walt_irqload(int cpu);
|
||||
int walt_cpu_high_irqload(int cpu);
|
||||
|
||||
#else /* CONFIG_SCHED_WALT */
|
||||
|
||||
static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
|
||||
int event, u64 wallclock, u64 irqtime) { }
|
||||
static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
|
||||
static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
|
||||
static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
|
||||
struct task_struct *p) { }
|
||||
static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
|
||||
struct task_struct *p) { }
|
||||
static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
|
||||
static inline void walt_init_new_task_load(struct task_struct *p) { }
|
||||
static inline void walt_mark_task_starting(struct task_struct *p) { }
|
||||
static inline void walt_set_window_start(struct rq *rq) { }
|
||||
static inline void walt_migrate_sync_cpu(int cpu) { }
|
||||
static inline void walt_init_cpu_efficiency(void) { }
|
||||
static inline u64 walt_ktime_clock(void) { return 0; }
|
||||
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
extern unsigned int walt_disabled;
|
||||
|
||||
#endif
|
|
@@ -304,6 +304,64 @@ static struct ctl_table kern_table[] = {
|
|||
.extra1 = &min_sched_granularity_ns,
|
||||
.extra2 = &max_sched_granularity_ns,
|
||||
},
|
||||
{
|
||||
.procname = "sched_is_big_little",
|
||||
.data = &sysctl_sched_is_big_little,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
{
|
||||
.procname = "sched_use_walt_cpu_util",
|
||||
.data = &sysctl_sched_use_walt_cpu_util,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_use_walt_task_util",
|
||||
.data = &sysctl_sched_use_walt_task_util,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_walt_init_task_load_pct",
|
||||
.data = &sysctl_sched_walt_init_task_load_pct,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_walt_cpu_high_irqload",
|
||||
.data = &sysctl_sched_walt_cpu_high_irqload,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.procname = "sched_sync_hint_enable",
|
||||
.data = &sysctl_sched_sync_hint_enable,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_initial_task_util",
|
||||
.data = &sysctl_sched_initial_task_util,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_cstate_aware",
|
||||
.data = &sysctl_sched_cstate_aware,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_wakeup_granularity_ns",
|
||||
.data = &sysctl_sched_wakeup_granularity,
|
||||
|
@@ -435,6 +493,21 @@ static struct ctl_table kern_table[] = {
|
|||
.extra1 = &one,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_TUNE
|
||||
{
|
||||
.procname = "sched_cfs_boost",
|
||||
.data = &sysctl_sched_cfs_boost,
|
||||
.maxlen = sizeof(sysctl_sched_cfs_boost),
|
||||
#ifdef CONFIG_CGROUP_SCHEDTUNE
|
||||
.mode = 0444,
|
||||
#else
|
||||
.mode = 0644,
|
||||
#endif
|
||||
.proc_handler = &sysctl_sched_cfs_boost_handler,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one_hundred,
|
||||
},
|
||||
#endif
|
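Given the table entry above, the global boost knob should appear as /proc/sys/kernel/sched_cfs_boost, read-only (0444) when CONFIG_CGROUP_SCHEDTUNE is enabled and writable (0644) otherwise, accepting values in 0..100. A hedged userspace example of setting it; the path follows the usual kern_table convention and is an assumption, not something verified on a particular device.

#include <stdio.h>

/* Assumed procfs path for the "sched_cfs_boost" sysctl registered above */
#define BOOST_PATH "/proc/sys/kernel/sched_cfs_boost"

int main(void)
{
	FILE *f = fopen(BOOST_PATH, "w");

	if (!f) {
		/* Expected when the knob is read-only (CGROUP_SCHEDTUNE) or
		 * the kernel was built without CONFIG_SCHED_TUNE. */
		perror("fopen " BOOST_PATH);
		return 1;
	}
	fprintf(f, "10\n");	/* request a 10% global CFS boost */
	fclose(f);
	return 0;
}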
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
{
|
||||
.procname = "prove_locking",
|
||||
|
|
|
@@ -867,6 +867,15 @@ config SCHED_INFO
|
|||
bool
|
||||
default n
|
||||
|
||||
config PANIC_ON_RT_THROTTLING
|
||||
bool "Panic on RT throttling"
|
||||
help
|
||||
Say Y here to enable the kernel to panic when a realtime
|
||||
runqueue is throttled. This may be useful for detecting
|
||||
and debugging RT throttling issues.
|
||||
|
||||
Say N if unsure.
|
||||
|
||||
config SCHEDSTATS
|
||||
bool "Collect scheduler statistics"
|
||||
depends on DEBUG_KERNEL && PROC_FS
|
||||
|
|
69
mm/vmstat.c
|
@@ -460,7 +460,7 @@ static int fold_diff(int *diff)
|
|||
*
|
||||
* The function returns the number of global counters updated.
|
||||
*/
|
||||
static int refresh_cpu_vm_stats(void)
|
||||
static int refresh_cpu_vm_stats(bool do_pagesets)
|
||||
{
|
||||
struct zone *zone;
|
||||
int i;
|
||||
|
@@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void)
|
|||
#endif
|
||||
}
|
||||
}
|
||||
cond_resched();
|
||||
#ifdef CONFIG_NUMA
|
||||
/*
|
||||
* Deal with draining the remote pageset of this
|
||||
* processor
|
||||
*
|
||||
* Check if there are pages remaining in this pageset
|
||||
* if not then there is nothing to expire.
|
||||
*/
|
||||
if (!__this_cpu_read(p->expire) ||
|
||||
if (do_pagesets) {
|
||||
cond_resched();
|
||||
/*
|
||||
* Deal with draining the remote pageset of this
|
||||
* processor
|
||||
*
|
||||
* Check if there are pages remaining in this pageset
|
||||
* if not then there is nothing to expire.
|
||||
*/
|
||||
if (!__this_cpu_read(p->expire) ||
|
||||
!__this_cpu_read(p->pcp.count))
|
||||
continue;
|
||||
continue;
|
||||
|
||||
/*
|
||||
* We never drain zones local to this processor.
|
||||
*/
|
||||
if (zone_to_nid(zone) == numa_node_id()) {
|
||||
__this_cpu_write(p->expire, 0);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* We never drain zones local to this processor.
|
||||
*/
|
||||
if (zone_to_nid(zone) == numa_node_id()) {
|
||||
__this_cpu_write(p->expire, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (__this_cpu_dec_return(p->expire))
|
||||
continue;
|
||||
if (__this_cpu_dec_return(p->expire))
|
||||
continue;
|
||||
|
||||
if (__this_cpu_read(p->pcp.count)) {
|
||||
drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
|
||||
changes++;
|
||||
if (__this_cpu_read(p->pcp.count)) {
|
||||
drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
|
||||
changes++;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off;
|
|||
|
||||
static void vmstat_update(struct work_struct *w)
|
||||
{
|
||||
if (refresh_cpu_vm_stats()) {
|
||||
if (refresh_cpu_vm_stats(true)) {
|
||||
/*
|
||||
* Counters were updated so we expect more updates
|
||||
* to occur in the future. Keep on running the
|
||||
|
@@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Switch off vmstat processing and then fold all the remaining differentials
|
||||
* until the diffs stay at zero. The function is used by NOHZ and can only be
|
||||
* invoked when tick processing is not active.
|
||||
*/
|
||||
void quiet_vmstat(void)
|
||||
{
|
||||
if (system_state != SYSTEM_RUNNING)
|
||||
return;
|
||||
|
||||
do {
|
||||
if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
|
||||
cancel_delayed_work(this_cpu_ptr(&vmstat_work));
|
||||
|
||||
} while (refresh_cpu_vm_stats(false));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the diffs for a certain cpu indicate that
|
||||
* an update is needed.
|
||||
|
@@ -1449,7 +1468,7 @@ static bool need_update(int cpu)
|
|||
*/
|
||||
static void vmstat_shepherd(struct work_struct *w);
|
||||
|
||||
static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
|
||||
static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
|
||||
|
||||
static void vmstat_shepherd(struct work_struct *w)
|
||||
{
|
||||
|
|