Merge branch 'v4.4-16.09-android-tmp' into lsk-v4.4-16.09-android

* v4.4-16.09-android-tmp:
  unsafe_[get|put]_user: change interface to use a error target label
  usercopy: remove page-spanning test for now
  usercopy: fix overlap check for kernel text
  mm/slub: support left redzone
  Linux 4.4.21
  lib/mpi: mpi_write_sgl(): fix skipping of leading zero limbs
  regulator: anatop: allow regulator to be in bypass mode
  hwrng: exynos - Disable runtime PM on probe failure
  cpufreq: Fix GOV_LIMITS handling for the userspace governor
  metag: Fix atomic_*_return inline asm constraints
  scsi: fix upper bounds check of sense key in scsi_sense_key_string()
  ALSA: timer: fix NULL pointer dereference on memory allocation failure
  ALSA: timer: fix division by zero after SNDRV_TIMER_IOCTL_CONTINUE
  ALSA: timer: fix NULL pointer dereference in read()/ioctl() race
  ALSA: hda - Enable subwoofer on Dell Inspiron 7559
  ALSA: hda - Add headset mic quirk for Dell Inspiron 5468
  ALSA: rawmidi: Fix possible deadlock with virmidi registration
  ALSA: fireworks: accessing to user space outside spinlock
  ALSA: firewire-tascam: accessing to user space outside spinlock
  ALSA: usb-audio: Add sample rate inquiry quirk for B850V3 CP2114
  crypto: caam - fix IV loading for authenc (giv)decryption
  uprobes: Fix the memcg accounting
  x86/apic: Do not init irq remapping if ioapic is disabled
  vhost/scsi: fix reuse of &vq->iov[out] in response
  bcache: RESERVE_PRIO is too small by one when prio_buckets() is a power of two.
  ubifs: Fix assertion in layout_in_gaps()
  ovl: fix workdir creation
  ovl: listxattr: use strnlen()
  ovl: remove posix_acl_default from workdir
  ovl: don't copy up opaqueness
  wrappers for ->i_mutex access
  lustre: remove unused declaration
  timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING
  timekeeping: Cap array access in timekeeping_debug
  xfs: fix superblock inprogress check
  ASoC: atmel_ssc_dai: Don't unconditionally reset SSC on stream startup
  drm/msm: fix use of copy_from_user() while holding spinlock
  drm: Reject page_flip for !DRIVER_MODESET
  drm/radeon: fix radeon_move_blit on 32bit systems
  s390/sclp_ctl: fix potential information leak with /dev/sclp
  rds: fix an infoleak in rds_inc_info_copy
  powerpc/tm: Avoid SLB faults in treclaim/trecheckpoint when RI=0
  nvme: Call pci_disable_device on the error path.
  cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork
  block: make sure a big bio is split into at most 256 bvecs
  block: Fix race triggered by blk_set_queue_dying()
  ext4: avoid modifying checksum fields directly during checksum verification
  ext4: avoid deadlock when expanding inode size
  ext4: properly align shifted xattrs when expanding inodes
  ext4: fix xattr shifting when expanding inodes part 2
  ext4: fix xattr shifting when expanding inodes
  ext4: validate that metadata blocks do not overlap superblock
  net: Use ns_capable_noaudit() when determining net sysctl permissions
  kernel: Add noaudit variant of ns_capable()
  KEYS: Fix ASN.1 indefinite length object parsing
  drivers:hv: Lock access to hyperv_mmio resource tree
  cxlflash: Move to exponential back-off when cmd_room is not available
  netfilter: x_tables: check for size overflow
  drm/amdgpu/cz: enable/disable vce dpm even if vce pg is disabled
  cred: Reject inodes with invalid ids in set_create_file_as()
  fs: Check for invalid i_uid in may_follow_link()
  IB/IPoIB: Do not set skb truesize since using one linearskb
  udp: properly support MSG_PEEK with truncated buffers
  crypto: nx-842 - Mask XERS0 bit in return value
  cxlflash: Fix to avoid virtual LUN failover failure
  cxlflash: Fix to escalate LINK_RESET also on port 1
  tipc: fix nl compat regression for link statistics
  tipc: fix an infoleak in tipc_nl_compat_link_dump
  netfilter: x_tables: check for size overflow
  Bluetooth: Add support for Intel Bluetooth device 8265 [8087:0a2b]
  drm/i915: Check VBT for port presence in addition to the strap on VLV/CHV
  drm/i915: Only ignore eDP ports that are connected
  Input: xpad - move pending clear to the correct location
  net: thunderx: Fix link status reporting
  x86/hyperv: Avoid reporting bogus NMI status for Gen2 instances
  crypto: vmx - IV size failing on skcipher API
  tda10071: Fix dependency to REGMAP_I2C
  crypto: vmx - Fix ABI detection
  crypto: vmx - comply with ABIs that specify vrsave as reserved.
  HID: core: prevent out-of-bound readings
  lpfc: Fix DMA faults observed upon plugging loopback connector
  block: fix blk_rq_get_max_sectors for driver private requests
  irqchip/gicv3-its: numa: Enable workaround for Cavium thunderx erratum 23144
  clocksource: Allow unregistering the watchdog
  btrfs: Continue write in case of can_not_nocow
  blk-mq: End unstarted requests on dying queue
  cxlflash: Fix to resolve dead-lock during EEH recovery
  drm/radeon/mst: fix regression in lane/link handling.
  ecryptfs: fix handling of directory opening
  ALSA: hda: add AMD Polaris-10/11 AZ PCI IDs with proper driver caps
  drm: Balance error path for GEM handle allocation
  ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO
  time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow
  Input: xpad - correctly handle concurrent LED and FF requests
  net: thunderx: Fix receive packet stats
  net: thunderx: Fix for multiqset not configured upon interface toggle
  perf/x86/cqm: Fix CQM memory leak and notifier leak
  perf/x86/cqm: Fix CQM handling of grouping events into a cache_group
  s390/crypto: provide correct file mode at device register.
  proc: revert /proc/<pid>/maps [stack:TID] annotation
  intel_idle: Support for Intel Xeon Phi Processor x200 Product Family
  cxlflash: Fix to avoid unnecessary scan with internal LUNs
  Drivers: hv: vmbus: don't manipulate with clocksources on crash
  Drivers: hv: vmbus: avoid scheduling in interrupt context in vmbus_initiate_unload()
  Drivers: hv: vmbus: avoid infinite loop in init_vp_index()
  arcmsr: fixes not release allocated resource
  arcmsr: fixed getting wrong configuration data
  s390/pci_dma: fix DMA table corruption with > 4 TB main memory
  net/mlx5e: Don't modify CQ before it was created
  net/mlx5e: Don't try to modify CQ moderation if it is not supported
  mmc: sdhci: Do not BUG on invalid vdd
  UVC: Add support for R200 depth camera
  sched/numa: Fix use-after-free bug in the task_numa_compare
  ALSA: hda - add codec support for Kabylake display audio codec
  drm/i915: Fix hpd live status bits for g4x
  tipc: fix nullptr crash during subscription cancel
  arm64: Add workaround for Cavium erratum 27456
  net: thunderx: Fix for Qset error due to CQ full
  drm/radeon: fix dp link rate selection (v2)
  drm/amdgpu: fix dp link rate selection (v2)
  qla2xxx: Use ATIO type to send correct tmr response
  mmc: sdhci: 64-bit DMA actually has 4-byte alignment
  drm/atomic: Do not unset crtc when an encoder is stolen
  drm/i915/skl: Add missing SKL ids
  drm/i915/bxt: update list of PCIIDs
  hrtimer: Catch illegal clockids
  i40e/i40evf: Fix RSS rx-flow-hash configuration through ethtool
  mpt3sas: Fix for Asynchronous completion of timedout IO and task abort of timedout IO.
  mpt3sas: A correction in unmap_resources
  net: cavium: liquidio: fix check for in progress flag
  arm64: KVM: Configure TCR_EL2.PS at runtime
  irqchip/gic-v3: Make sure read from ICC_IAR1_EL1 is visible on redestributor
  pwm: lpc32xx: fix and simplify duty cycle and period calculations
  pwm: lpc32xx: correct number of PWM channels from 2 to 1
  pwm: fsl-ftm: Fix clock enable/disable when using PM
  megaraid_sas: Add an i/o barrier
  megaraid_sas: Fix SMAP issue
  megaraid_sas: Do not allow PCI access during OCR
  s390/cio: update measurement characteristics
  s390/cio: ensure consistent measurement state
  s390/cio: fix measurement characteristics memleak
  qeth: initialize net_device with carrier off
  lpfc: Fix external loopback failure.
  lpfc: Fix mbox reuse in PLOGI completion
  lpfc: Fix RDP Speed reporting.
  lpfc: Fix crash in fcp command completion path.
  lpfc: Fix driver crash when module parameter lpfc_fcp_io_channel set to 16
  lpfc: Fix RegLogin failed error seen on Lancer FC during port bounce
  lpfc: Fix the FLOGI discovery logic to comply with T11 standards
  lpfc: Fix FCF Infinite loop in lpfc_sli4_fcf_rr_next_index_get.
  cxl: Enable PCI device ID for future IBM CXL adapter
  cxl: fix build for GCC 4.6.x
  cxlflash: Enable device id for future IBM CXL adapter
  cxlflash: Resolve oops in wait_port_offline
  cxlflash: Fix to resolve cmd leak after host reset
  cxl: Fix DSI misses when the context owning task exits
  cxl: Fix possible idr warning when contexts are released
  Drivers: hv: vmbus: fix rescind-offer handling for device without a driver
  Drivers: hv: vmbus: serialize process_chn_event() and vmbus_close_internal()
  Drivers: hv: vss: run only on supported host versions
  drivers/hv: cleanup synic msrs if vmbus connect failed
  Drivers: hv: util: catch allocation errors
  tools: hv: report ENOSPC errors in hv_fcopy_daemon
  Drivers: hv: utils: run polling callback always in interrupt context
  Drivers: hv: util: Increase the timeout for util services
  lightnvm: fix missing grown bad block type
  lightnvm: fix locking and mempool in rrpc_lun_gc
  lightnvm: unlock rq and free ppa_list on submission fail
  lightnvm: add check after mempool allocation
  lightnvm: fix incorrect nr_free_blocks stat
  lightnvm: fix bio submission issue
  cxlflash: a couple off by one bugs
  fm10k: Cleanup exception handling for mailbox interrupt
  fm10k: Cleanup MSI-X interrupts in case of failure
  fm10k: reinitialize queuing scheme after calling init_hw
  fm10k: always check init_hw for errors
  fm10k: reset max_queues on init_hw_vf failure
  fm10k: Fix handling of NAPI budget when multiple queues are enabled per vector
  fm10k: Correct MTU for jumbo frames
  fm10k: do not assume VF always has 1 queue
  clk: xgene: Fix divider with non-zero shift value
  e1000e: fix division by zero on jumbo MTUs
  e1000: fix data race between tx_ring->next_to_clean
  ixgbe: Fix handling of NAPI budget when multiple queues are enabled per vector
  igb: fix NULL derefs due to skipped SR-IOV enabling
  igb: use the correct i210 register for EEMNGCTL
  igb: don't unmap NULL hw_addr
  i40e: Fix Rx hash reported to the stack by our driver
  i40e: clean whole mac filter list
  i40evf: check rings before freeing resources
  i40e: don't add zero MAC filter
  i40e: properly delete VF MAC filters
  i40e: Fix memory leaks, sideband filter programming
  i40e: fix: do not sleep in netdev_ops
  i40e/i40evf: Fix RS bit update in Tx path and disable force WB workaround
  i40evf: handle many MAC filters correctly
  i40e: Workaround fix for mss < 256 issue
  UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg()
  UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor
  FIXUP: sched/tune: update accouting before CPU capacity
  FIXUP: sched/tune: add fixes missing from a previous patch
  arm: Fix #if/#ifdef typo in topology.c
  arm: Fix build error "conflicting types for 'scale_cpu_capacity'"
  sched/walt: use do_div instead of division operator
  DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems
  sched/walt: include missing header for arm_timer_read_counter()
  cpufreq: Kconfig: Fixup incorrect selection by CPU_FREQ_DEFAULT_GOV_SCHED
  sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats()
  FIXUP: sched: scheduler-driven cpu frequency selection
  sched/rt: Add Kconfig option to enable panicking for RT throttling
  sched/rt: print RT tasks when RT throttling is activated
  UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity()
  sched/fair: Favor higher cpus only for boosted tasks
  vmstat: make vmstat_updater deferrable again and shut down on idle
  sched/fair: call OPP update when going idle after migration
  sched/cpufreq_sched: fix thermal capping events
  sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs
  FIXUP: sched/tune: do initialization as a postcore_initicall
  DEBUG: sched: add tracepoint for RD overutilized
  sched/tune: Introducing a new schedtune attribute prefer_idle
  sched: use util instead of capacity to select busy cpu
  arch_timer: add error handling when the MPM global timer is cleared
  FIXUP: sched: Fix double-release of spinlock in move_queued_task
  FIXUP: sched/fair: Fix hang during suspend in sched_group_energy
  FIXUP: sched: fix SchedFreq integration for both PELT and WALT
  sched: EAS: Avoid causing spikes to max-freq unnecessarily
  FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use
  sched/walt: Accounting for number of irqs pending on each core
  sched: Introduce Window Assisted Load Tracking (WALT)
  sched/tune: fix PB and PC cuts indexes definition
  sched/fair: optimize idle cpu selection for boosted tasks
  FIXUP: sched/tune: fix accounting for runnable tasks
  sched/tune: use a single initialisation function
  sched/{fair,tune}: simplify fair.c code
  FIXUP: sched/tune: fix payoff calculation for boost region
  sched/tune: Add support for negative boost values
  FIX: sched/tune: move schedtune_nornalize_energy into fair.c
  FIX: sched/tune: update usage of boosted task utilisation on CPU selection
  sched/fair: add tunable to set initial task load
  sched/fair: add tunable to force selection at cpu granularity
  sched: EAS: take cstate into account when selecting idle core
  sched/cpufreq_sched: Consolidated update
  FIXUP: sched: fix build for non-SMP target
  DEBUG: sched/tune: add tracepoint on P-E space filtering
  DEBUG: sched/tune: add tracepoint for energy_diff() values
  DEBUG: sched/tune: add tracepoint for task boost signal
  arm: topology: Define TC2 energy and provide it to the scheduler
  CHROMIUM: sched: update the average of nr_running
  DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values
  DEBUG: schedtune: add tracepoint for CPU boost signal
  DEBUG: schedtune: add tracepoint for SchedTune configuration update
  DEBUG: sched: add energy procfs interface
  DEBUG: sched,cpufreq: add cpu_capacity change tracepoint
  DEBUG: sched: add tracepoint for CPU load/util signals
  DEBUG: sched: add tracepoint for task load/util signals
  DEBUG: sched: add tracepoint for cpu/freq scale invariance
  sched/fair: filter energy_diff() based on energy_payoff value
  sched/tune: add support to compute normalized energy
  sched/fair: keep track of energy/capacity variations
  sched/fair: add boosted task utilization
  sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value
  sched/tune: compute and keep track of per CPU boost value
  sched/tune: add initial support for CGroups based boosting
  sched/fair: add boosted CPU usage
  sched/fair: add function to convert boost value into "margin"
  sched/tune: add sysctl interface to define a boost value
  sched/tune: add detailed documentation
  fixup! sched/fair: jump to max OPP when crossing UP threshold
  fixup! sched: scheduler-driven cpu frequency selection
  sched: rt scheduler sets capacity requirement
  sched: deadline: use deadline bandwidth in scale_rt_capacity
  sched: remove call of sched_avg_update from sched_rt_avg_update
  sched/cpufreq_sched: add trace events
  sched/fair: jump to max OPP when crossing UP threshold
  sched/fair: cpufreq_sched triggers for load balancing
  sched/{core,fair}: trigger OPP change request on fork()
  sched/fair: add triggers for OPP change requests
  sched: scheduler-driven cpu frequency selection
  cpufreq: introduce cpufreq_driver_is_slow
  sched: Consider misfit tasks when load-balancing
  sched: Add group_misfit_task load-balance type
  sched: Add per-cpu max capacity to sched_group_capacity
  sched: Do eas idle balance regardless of the rq avg idle value
  arm64: Enable max freq invariant scheduler load-tracking and capacity support
  arm: Enable max freq invariant scheduler load-tracking and capacity support
  sched: Update max cpu capacity in case of max frequency constraints
  cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support
  arm64, topology: Updates to use DT bindings for EAS costing data
  sched: Support for extracting EAS energy costs from DT
  Documentation: DT bindings for energy model cost data required by EAS
  sched: Disable energy-unfriendly nohz kicks
  sched: Consider a not over-utilized energy-aware system as balanced
  sched: Energy-aware wake-up task placement
  sched: Determine the current sched_group idle-state
  sched, cpuidle: Track cpuidle state index in the scheduler
  sched: Add over-utilization/tipping point indicator
  sched: Estimate energy impact of scheduling decisions
  sched: Extend sched_group_energy to test load-balancing decisions
  sched: Calculate energy consumption of sched_group
  sched: Highest energy aware balancing sched_domain level pointer
  sched: Relocated cpu_util() and change return type
  sched: Compute cpu capacity available at current frequency
  arm64: Cpu invariant scheduler load-tracking and capacity support
  arm: Cpu invariant scheduler load-tracking and capacity support
  sched: Introduce SD_SHARE_CAP_STATES sched_domain flag
  sched: Initialize energy data structures
  sched: Introduce energy data structures
  sched: Make energy awareness a sched feature
  sched: Documentation for scheduler energy cost model
  sched: Prevent unnecessary active balance of single task in sched group
  sched: Enable idle balance to pull single task towards cpu with higher capacity
  sched: Consider spare cpu capacity at task wake-up
  sched: Add cpu capacity awareness to wakeup balancing
  sched: Store system-wide maximum cpu capacity in root domain
  arm: Update arch_scale_cpu_capacity() to reflect change to define
  arm64: Enable frequency invariant scheduler load-tracking support
  arm: Enable frequency invariant scheduler load-tracking support
  cpufreq: Frequency invariant scheduler load-tracking support
  sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()
  FROMLIST: pstore: drop pmsg bounce buffer
  UPSTREAM: usercopy: remove page-spanning test for now
  UPSTREAM: usercopy: force check_object_size() inline
  BACKPORT: usercopy: fold builtin_const check into inline function
  UPSTREAM: x86/uaccess: force copy_*_user() to be inlined
  UPSTREAM: HID: core: prevent out-of-bound readings
  Android: Fix build breakages.
  UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields
  UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages
  cpuset: Make cpusets restore on hotplug
  UPSTREAM: mm/slub: support left redzone
  UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator
  Android: MMC/UFS IO Latency Histograms.
  UPSTREAM: usercopy: fix overlap check for kernel text
  UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math
  UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label
  BACKPORT: arm64: mm: fix location of _etext
  BACKPORT: ARM: 8583/1: mm: fix location of _etext
  BACKPORT: Don't show empty tag stats for unprivileged uids
  UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue()
  ANDROID: base-cfg: drop SECCOMP_FILTER config
  UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config()
  UPSTREAM: [media] xc2028: avoid use after free
  ANDROID: base-cfg: enable SECCOMP config
  ANDROID: rcu_sync: Export rcu_sync_lockdep_assert
  RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork
  RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write()
  RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact
  net: ipv6: Fix ping to link-local addresses.
  ipv6: fix endianness error in icmpv6_err
  ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module
  backporting: a brief introduce of backported feautures on 4.4
  Linux 4.4.20
  sysfs: correctly handle read offset on PREALLOC attrs
  hwmon: (iio_hwmon) fix memory leak in name attribute
  ALSA: line6: Fix POD sysfs attributes segfault
  ALSA: line6: Give up on the lock while URBs are released.
  ALSA: line6: Remove double line6_pcm_release() after failed acquire.
  ACPI / SRAT: fix SRAT parsing order with both LAPIC and X2APIC present
  ACPI / sysfs: fix error code in get_status()
  ACPI / drivers: replace acpi_probe_lock spinlock with mutex
  ACPI / drivers: fix typo in ACPI_DECLARE_PROBE_ENTRY macro
  staging: comedi: ni_mio_common: fix wrong insn_write handler
  staging: comedi: ni_mio_common: fix AO inttrig backwards compatibility
  staging: comedi: comedi_test: fix timer race conditions
  staging: comedi: daqboard2000: bug fix board type matching code
  USB: serial: option: add WeTelecom 0x6802 and 0x6803 products
  USB: serial: option: add WeTelecom WM-D200
  USB: serial: mos7840: fix non-atomic allocation in write path
  USB: serial: mos7720: fix non-atomic allocation in write path
  USB: fix typo in wMaxPacketSize validation
  usb: chipidea: udc: don't touch DP when controller is in host mode
  USB: avoid left shift by -1
  dmaengine: usb-dmac: check CHCR.DE bit in usb_dmac_isr_channel()
  crypto: qat - fix aes-xts key sizes
  crypto: nx - off by one bug in nx_of_update_msc()
  Input: i8042 - set up shared ps2_cmd_mutex for AUX ports
  Input: i8042 - break load dependency between atkbd/psmouse and i8042
  Input: tegra-kbc - fix inverted reset logic
  btrfs: properly track when rescan worker is running
  btrfs: waiting on qgroup rescan should not always be interruptible
  fs/seq_file: fix out-of-bounds read
  gpio: Fix OF build problem on UM
  usb: renesas_usbhs: gadget: fix return value check in usbhs_mod_gadget_probe()
  megaraid_sas: Fix probing cards without io port
  mpt3sas: Fix resume on WarpDrive flash cards
  cdc-acm: fix wrong pipe type on rx interrupt xfers
  i2c: cros-ec-tunnel: Fix usage of cros_ec_cmd_xfer()
  mfd: cros_ec: Add cros_ec_cmd_xfer_status() helper
  aacraid: Check size values after double-fetch from user
  ARC: Elide redundant setup of DMA callbacks
  ARC: Call trace_hardirqs_on() before enabling irqs
  ARC: use correct offset in pt_regs for saving/restoring user mode r25
  ARC: build: Better way to detect ISA compatible toolchain
  drm/i915: fix aliasing_ppgtt leak
  drm/amdgpu: record error code when ring test failed
  drm/amd/amdgpu: sdma resume fail during S4 on CI
  drm/amdgpu: skip TV/CV in display parsing
  drm/amdgpu: avoid a possible array overflow
  drm/amdgpu: fix amdgpu_move_blit on 32bit systems
  drm/amdgpu: Change GART offset to 64-bit
  iio: fix sched WARNING "do not call blocking ops when !TASK_RUNNING"
  sched/nohz: Fix affine unpinned timers mess
  sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression
  of: fix reference counting in of_graph_get_endpoint_by_regs
  arm64: dts: rockchip: add reset saradc node for rk3368 SoCs
  mac80211: fix purging multicast PS buffer queue
  s390/dasd: fix hanging device after clear subchannel
  EDAC: Increment correct counter in edac_inc_ue_error()
  pinctrl/amd: Remove the default de-bounce time
  iommu/arm-smmu: Don't BUG() if we find aborting STEs with disable_bypass
  iommu/arm-smmu: Fix CMDQ error handling
  iommu/dma: Don't put uninitialised IOVA domains
  xhci: Make sure xhci handles USB_SPEED_SUPER_PLUS devices.
  USB: serial: ftdi_sio: add PIDs for Ivium Technologies devices
  USB: serial: ftdi_sio: add device ID for WICED USB UART dev board
  USB: serial: option: add support for Telit LE920A4
  USB: serial: option: add D-Link DWM-156/A3
  USB: serial: fix memleak in driver-registration error path
  xhci: don't dereference a xhci member after removing xhci
  usb: xhci: Fix panic if disconnect
  xhci: always handle "Command Ring Stopped" events
  usb/gadget: fix gadgetfs aio support.
  usb: gadget: fsl_qe_udc: off by one in setup_received_handle()
  USB: validate wMaxPacketValue entries in endpoint descriptors
  usb: renesas_usbhs: Use dmac only if the pipe type is bulk
  usb: renesas_usbhs: clear the BRDYSTS in usbhsg_ep_enable()
  USB: hub: change the locking in hub_activate
  USB: hub: fix up early-exit pathway in hub_activate
  usb: hub: Fix unbalanced reference count/memory leak/deadlocks
  usb: define USB_SPEED_SUPER_PLUS speed for SuperSpeedPlus USB3.1 devices
  usb: dwc3: gadget: increment request->actual once
  usb: dwc3: pci: add Intel Kabylake PCI ID
  usb: misc: usbtest: add fix for driver hang
  usb: ehci: change order of register cleanup during shutdown
  crypto: caam - defer aead_set_sh_desc in case of zero authsize
  crypto: caam - fix echainiv(authenc) encrypt shared descriptor
  crypto: caam - fix non-hmac hashes
  genirq/msi: Make sure PCI MSIs are activated early
  genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP
  um: Don't discard .text.exit section
  ACPI / CPPC: Prevent cpc_desc_ptr points to the invalid data
  ACPI: CPPC: Return error if _CPC is invalid on a CPU
  mmc: sdhci-acpi: Reduce Baytrail eMMC/SD/SDIO hangs
  PCI: Limit config space size for Netronome NFP4000
  PCI: Add Netronome NFP4000 PF device ID
  PCI: Limit config space size for Netronome NFP6000 family
  PCI: Add Netronome vendor and device IDs
  PCI: Support PCIe devices with short cfg_size
  NVMe: Don't unmap controller registers on reset
  ALSA: hda - Manage power well properly for resume
  libnvdimm, nd_blk: mask off reserved status bits
  perf intel-pt: Fix occasional decoding errors when tracing system-wide
  vfio/pci: Fix NULL pointer oops in error interrupt setup handling
  virtio: fix memory leak in virtqueue_add()
  parisc: Fix order of EREFUSED define in errno.h
  arm64: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO
  ALSA: usb-audio: Add quirk for ELP HD USB Camera
  ALSA: usb-audio: Add a sample rate quirk for Creative Live! Cam Socialize HD (VF0610)
  powerpc/eeh: eeh_pci_enable(): fix checking of post-request state
  SUNRPC: allow for upcalls for same uid but different gss service
  SUNRPC: Handle EADDRNOTAVAIL on connection failures
  tools/testing/nvdimm: fix SIGTERM vs hotplug crash
  uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions
  x86/mm: Disable preemption during CR3 read+write
  hugetlb: fix nr_pmds accounting with shared page tables
  mm: SLUB hardened usercopy support
  mm: SLAB hardened usercopy support
  s390/uaccess: Enable hardened usercopy
  sparc/uaccess: Enable hardened usercopy
  powerpc/uaccess: Enable hardened usercopy
  ia64/uaccess: Enable hardened usercopy
  arm64/uaccess: Enable hardened usercopy
  ARM: uaccess: Enable hardened usercopy
  x86/uaccess: Enable hardened usercopy
  x86: remove more uaccess_32.h complexity
  x86: remove pointless uaccess_32.h complexity
  x86: fix SMAP in 32-bit environments
  Use the new batched user accesses in generic user string handling
  Add 'unsafe' user access functions for batched accesses
  x86: reorganize SMAP handling in user space accesses
  mm: Hardened usercopy
  mm: Implement stack frame object validation
  mm: Add is_migrate_cma_page
  Linux 4.4.19
  Documentation/module-signing.txt: Note need for version info if reusing a key
  module: Invalidate signatures on force-loaded modules
  dm flakey: error READ bios during the down_interval
  rtc: s3c: Add s3c_rtc_{enable/disable}_clk in s3c_rtc_setfreq()
  lpfc: fix oops in lpfc_sli4_scmd_to_wqidx_distr() from lpfc_send_taskmgmt()
  ACPI / EC: Work around method reentrancy limit in ACPICA for _Qxx
  x86/platform/intel_mid_pci: Rework IRQ0 workaround
  PCI: Mark Atheros AR9485 and QCA9882 to avoid bus reset
  MIPS: hpet: Increase HPET_MIN_PROG_DELTA and decrease HPET_MIN_CYCLES
  MIPS: Don't register r4k sched clock when CPUFREQ enabled
  MIPS: mm: Fix definition of R6 cache instruction
  SUNRPC: Don't allocate a full sockaddr_storage for tracing
  Input: elan_i2c - properly wake up touchpad on ASUS laptops
  target: Fix ordered task CHECK_CONDITION early exception handling
  target: Fix max_unmap_lba_count calc overflow
  target: Fix race between iscsi-target connection shutdown + ABORT_TASK
  target: Fix missing complete during ABORT_TASK + CMD_T_FABRIC_STOP
  target: Fix ordered task target_setup_cmd_from_cdb exception hang
  iscsi-target: Fix panic when adding second TCP connection to iSCSI session
  ubi: Fix race condition between ubi device creation and udev
  ubi: Fix early logging
  ubi: Make volume resize power cut aware
  of: fix memory leak related to safe_name()
  IB/mlx4: Fix memory leak if QP creation failed
  IB/mlx4: Fix error flow when sending mads under SRIOV
  IB/mlx4: Fix the SQ size of an RC QP
  IB/IWPM: Fix a potential skb leak
  IB/IPoIB: Don't update neigh validity for unresolved entries
  IB/SA: Use correct free function
  IB/mlx5: Return PORT_ERR in Active to Initializing tranisition
  IB/mlx5: Fix post send fence logic
  IB/mlx5: Fix entries check in mlx5_ib_resize_cq
  IB/mlx5: Fix returned values of query QP
  IB/mlx5: Fix entries checks in mlx5_ib_create_cq
  IB/mlx5: Fix MODIFY_QP command input structure
  ALSA: hda - Fix headset mic detection problem for two dell machines
  ALSA: hda: add AMD Bonaire AZ PCI ID with proper driver caps
  ALSA: hda/realtek - Can't adjust speaker's volume on a Dell AIO
  ALSA: hda: Fix krealloc() with __GFP_ZERO usage
  mm/hugetlb: avoid soft lockup in set_max_huge_pages()
  mtd: nand: fix bug writing 1 byte less than page size
  block: fix bdi vs gendisk lifetime mismatch
  block: add missing group association in bio-cloning functions
  metag: Fix __cmpxchg_u32 asm constraint for CMP
  ftrace/recordmcount: Work around for addition of metag magic but not relocations
  balloon: check the number of available pages in leak balloon
  drm/i915/dp: Revert "drm/i915/dp: fall back to 18 bpp when sink capability is unknown"
  drm/i915: Never fully mask the the EI up rps interrupt on SNB/IVB
  drm/edid: Add 6 bpc quirk for display AEO model 0.
  drm: Restore double clflush on the last partial cacheline
  drm/nouveau/fbcon: fix font width not divisible by 8
  drm/nouveau/gr/nv3x: fix instobj write offsets in gr setup
  drm/nouveau: check for supported chipset before booting fbdev off the hw
  drm/radeon: support backlight control for UNIPHY3
  drm/radeon: fix firmware info version checks
  drm/radeon: Poll for both connect/disconnect on analog connectors
  drm/radeon: add a delay after ATPX dGPU power off
  drm/amdgpu/gmc7: add missing mullins case
  drm/amdgpu: fix firmware info version checks
  drm/amdgpu: Disable RPM helpers while reprobing connectors on resume
  drm/amdgpu: support backlight control for UNIPHY3
  drm/amdgpu: Poll for both connect/disconnect on analog connectors
  drm/amdgpu: add a delay after ATPX dGPU power off
  w1:omap_hdq: fix regression
  netlabel: add address family checks to netlbl_{sock,req}_delattr()
  ARM: dts: sunxi: Add a startup delay for fixed regulator enabled phys
  audit: fix a double fetch in audit_log_single_execve_arg()
  iommu/amd: Update Alias-DTE in update_device_table()
  iommu/amd: Init unity mappings only for dma_ops domains
  iommu/amd: Handle IOMMU_DOMAIN_DMA in ops->domain_free call-back
  iommu/vt-d: Return error code in domain_context_mapping_one()
  iommu/exynos: Suppress unbinding to prevent system failure
  drm/i915: Don't complain about lack of ACPI video bios
  nfsd: don't return an unhashed lock stateid after taking mutex
  nfsd: Fix race between FREE_STATEID and LOCK
  nfs: don't create zero-length requests
  MIPS: KVM: Propagate kseg0/mapped tlb fault errors
  MIPS: KVM: Fix gfn range check in kseg0 tlb faults
  MIPS: KVM: Add missing gfn range check
  MIPS: KVM: Fix mapped fault broken commpage handling
  random: add interrupt callback to VMBus IRQ handler
  random: print a warning for the first ten uninitialized random users
  random: initialize the non-blocking pool via add_hwgenerator_randomness()
  CIFS: Fix a possible invalid memory access in smb2_query_symlink()
  cifs: fix crash due to race in hmac(md5) handling
  cifs: Check for existing directory when opening file with O_CREAT
  fs/cifs: make share unaccessible at root level mountable
  jbd2: make journal y2038 safe
  ARC: mm: don't loose PTE_SPECIAL in pte_modify()
  remoteproc: Fix potential race condition in rproc_add
  ovl: disallow overlayfs as upperdir
  HID: uhid: fix timeout when probe races with IO
  EDAC: Correct channel count limit
  Bluetooth: Fix l2cap_sock_setsockopt() with optname BT_RCVMTU
  spi: pxa2xx: Clear all RFT bits in reset_sccr1() on Intel Quark
  i2c: efm32: fix a failure path in efm32_i2c_probe()
  s5p-mfc: Add release callback for memory region devs
  s5p-mfc: Set device name for reserved memory region devs
  hp-wmi: Fix wifi cannot be hard-unblocked
  dm: set DMF_SUSPENDED* _before_ clearing DMF_NOFLUSH_SUSPENDING
  sur40: fix occasional oopses on device close
  sur40: lower poll interval to fix occasional FPS drops to ~56 FPS
  Fix RC5 decoding with Fintek CIR chipset
  vb2: core: Skip planes array verification if pb is NULL
  videobuf2-v4l2: Verify planes array in buffer dequeueing
  media: dvb_ringbuffer: Add memory barriers
  media: usbtv: prevent access to free'd resources
  mfd: qcom_rpm: Parametrize also ack selector size
  mfd: qcom_rpm: Fix offset error for msm8660
  intel_pstate: Fix MSR_CONFIG_TDP_x addressing in core_get_max_pstate()
  s390/cio: allow to reset channel measurement block
  KVM: nVMX: Fix memory corruption when using VMCS shadowing
  KVM: VMX: handle PML full VMEXIT that occurs during event delivery
  KVM: MTRR: fix kvm_mtrr_check_gfn_range_consistency page fault
  KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE
  KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures
  arm64: mm: avoid fdt_check_header() before the FDT is fully mapped
  arm64: dts: rockchip: fixes the gic400 2nd region size for rk3368
  pinctrl: cherryview: prevent concurrent access to GPIO controllers
  Bluetooth: hci_intel: Fix null gpio desc pointer dereference
  gpio: intel-mid: Remove potentially harmful code
  gpio: pca953x: Fix NBANK calculation for PCA9536
  tty/serial: atmel: fix RS485 half duplex with DMA
  serial: samsung: Fix ERR pointer dereference on deferred probe
  tty: serial: msm: Don't read off end of tx fifo
  arm64: Fix incorrect per-cpu usage for boot CPU
  arm64: debug: unmask PSTATE.D earlier
  arm64: kernel: Save and restore UAO and addr_limit on exception entry
  USB: usbfs: fix potential infoleak in devio
  usb: renesas_usbhs: fix NULL pointer dereference in xfer_work()
  USB: serial: option: add support for Telit LE910 PID 0x1206
  usb: dwc3: fix for the isoc transfer EP_BUSY flag
  usb: quirks: Add no-lpm quirk for Elan
  usb: renesas_usbhs: protect the CFIFOSEL setting in usbhsg_ep_enable()
  usb: f_fs: off by one bug in _ffs_func_bind()
  usb: gadget: avoid exposing kernel stack
  UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget
  ANDROID: dm-verity: adopt changes made to dm callbacks
  UPSTREAM: ecryptfs: fix handling of directory opening
  ANDROID: net: core: fix UID-based routing
  ANDROID: net: fib: remove duplicate assignment
  FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self
  ANDROID: dm verity fec: pack the fec_header structure
  ANDROID: dm: android-verity: Verify header before fetching table
  ANDROID: dm: allow adb disable-verity only in userdebug
  ANDROID: dm: mount as linear target if eng build
  ANDROID: dm: use default verity public key
  ANDROID: dm: fix signature verification flag
  ANDROID: dm: use name_to_dev_t
  ANDROID: dm: rename dm-linear methods for dm-android-verity
  ANDROID: dm: Minor cleanup
  ANDROID: dm: Mounting root as linear device when verity disabled
  ANDROID: dm-android-verity: Rebase on top of 4.1
  ANDROID: dm: Add android verity target
  ANDROID: dm: fix dm_substitute_devices()
  ANDROID: dm: Rebase on top of 4.1
  CHROMIUM: dm: boot time specification of dm=
  Implement memory_state_time, used by qcom,cpubw
  Revert "panic: Add board ID to panic output"
  usb: gadget: f_accessory: remove duplicate endpoint alloc
  BACKPORT: brcmfmac: defer DPC processing during probe
  FROMLIST: proc: Add LSM hook checks to /proc/<tid>/timerslack_ns
  FROMLIST: proc: Relax /proc/<tid>/timerslack_ns capability requirements
  UPSTREAM: ppp: defer netns reference release for ppp channel
  cpuset: Add allow_attach hook for cpusets on android.
  UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing
  ANDROID: sdcardfs: fix itnull.cocci warnings
  android-recommended.cfg: enable fstack-protector-strong
  Linux 4.4.18
  mm: memcontrol: fix memcg id ref counter on swap charge move
  mm: memcontrol: fix swap counter leak on swapout from offline cgroup
  mm: memcontrol: fix cgroup creation failure after many small jobs
  ext4: fix reference counting bug on block allocation error
  ext4: short-cut orphan cleanup on error
  ext4: validate s_reserved_gdt_blocks on mount
  ext4: don't call ext4_should_journal_data() on the journal inode
  ext4: fix deadlock during page writeback
  ext4: check for extents that wrap around
  crypto: scatterwalk - Fix test in scatterwalk_done
  crypto: gcm - Filter out async ghash if necessary
  fs/dcache.c: avoid soft-lockup in dput()
  fuse: fix wrong assignment of ->flags in fuse_send_init()
  fuse: fuse_flush must check mapping->flags for errors
  fuse: fsync() did not return IO errors
  sysv, ipc: fix security-layer leaking
  block: fix use-after-free in seq file
  x86/syscalls/64: Add compat_sys_keyctl for 32-bit userspace
  drm/i915: Pretend cursor is always on for ILK-style WM calculations (v2)
  x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386
  x86/pat: Document the PAT initialization sequence
  x86/xen, pat: Remove PAT table init code from Xen
  x86/mtrr: Fix PAT init handling when MTRR is disabled
  x86/mtrr: Fix Xorg crashes in Qemu sessions
  x86/mm/pat: Replace cpu_has_pat with boot_cpu_has()
  x86/mm/pat: Add pat_disable() interface
  x86/mm/pat: Add support of non-default PAT MSR setting
  devpts: clean up interface to pty drivers
  random: strengthen input validation for RNDADDTOENTCNT
  apparmor: fix ref count leak when profile sha1 hash is read
  Revert "s390/kdump: Clear subchannel ID to signal non-CCW/SCSI IPL"
  KEYS: 64-bit MIPS needs to use compat_sys_keyctl for 32-bit userspace
  arm: oabi compat: add missing access checks
  cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind
  i2c: i801: Allow ACPI SystemIO OpRegion to conflict with PCI BAR
  x86/mm/32: Enable full randomization on i386 and X86_32
  HID: sony: do not bail out when the sixaxis refuses the output report
  PNP: Add Broadwell to Intel MCH size workaround
  PNP: Add Haswell-ULT to Intel MCH size workaround
  scsi: ignore errors from scsi_dh_add_device()
  ipath: Restrict use of the write() interface
  tcp: consider recv buf for the initial window scale
  qed: Fix setting/clearing bit in completion bitmap
  net/irda: fix NULL pointer dereference on memory allocation failure
  net: bgmac: Fix infinite loop in bgmac_dma_tx_add()
  bonding: set carrier off for devices created through netlink
  ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space
  tcp: enable per-socket rate limiting of all 'challenge acks'
  tcp: make challenge acks less predictable
  arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux
  arm64: vmlinux.lds: make __rela_offset and __dynsym_offset ABSOLUTE
  Linux 4.4.17
  vfs: fix deadlock in file_remove_privs() on overlayfs
  intel_th: Fix a deadlock in modprobing
  intel_th: pci: Add Kaby Lake PCH-H support
  net: mvneta: set real interrupt per packet for tx_done
  libceph: apply new_state before new_up_client on incrementals
  libata: LITE-ON CX1-JB256-HP needs lower max_sectors
  i2c: mux: reg: wrong condition checked for of_address_to_resource return value
  posix_cpu_timer: Exit early when process has been reaped
  media: fix airspy usb probe error path
  ipr: Clear interrupt on croc/crocodile when running with LSI
  SCSI: fix new bug in scsi_dev_info_list string matching
  RDS: fix rds_tcp_init() error path
  can: fix oops caused by wrong rtnl dellink usage
  can: fix handling of unmodifiable configuration options fix
  can: c_can: Update D_CAN TX and RX functions to 32 bit - fix Altera Cyclone access
  can: at91_can: RX queue could get stuck at high bus load
  perf/x86: fix PEBS issues on Intel Atom/Core2
  ovl: handle ATTR_KILL*
  sched/fair: Fix effective_load() to consistently use smoothed load
  mmc: block: fix packed command header endianness
  block: fix use-after-free in sys_ioprio_get()
  qeth: delete napi struct when removing a qeth device
  platform/chrome: cros_ec_dev - double fetch bug in ioctl
  clk: rockchip: initialize flags of clk_init_data in mmc-phase clock
  spi: sun4i: fix FIFO limit
  spi: sunxi: fix transfer timeout
  namespace: update event counter when umounting a deleted dentry
  9p: use file_dentry()
  ext4: verify extent header depth
  ecryptfs: don't allow mmap when the lower fs doesn't support it
  Revert "ecryptfs: forbid opening files without mmap handler"
  locks: use file_inode()
  power_supply: power_supply_read_temp only if use_cnt > 0
  cgroup: set css->id to -1 during init
  pinctrl: imx: Do not treat a PIN without MUX register as an error
  pinctrl: single: Fix missing flush of posted write for a wakeirq
  pvclock: Add CPU barriers to get correct version value
  Input: tsc200x - report proper input_dev name
  Input: xpad - validate USB endpoint count during probe
  Input: wacom_w8001 - w8001_MAX_LENGTH should be 13
  Input: xpad - fix oops when attaching an unknown Xbox One gamepad
  Input: elantech - add more IC body types to the list
  Input: vmmouse - remove port reservation
  ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt
  ALSA: timer: Fix leak in events via snd_timer_user_ccallback
  ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS
  xenbus: don't bail early from xenbus_dev_request_and_reply()
  xenbus: don't BUG() on user mode induced condition
  xen/pciback: Fix conf_space read/write overlap check.
  ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame)
  arc: unwind: warn only once if DW2_UNWIND is disabled
  kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w
  pps: do not crash when failed to register
  vmlinux.lds: account for destructor sections
  mm, meminit: ensure node is online before checking whether pages are uninitialised
  mm, meminit: always return a valid node from early_pfn_to_nid
  mm, compaction: prevent VM_BUG_ON when terminating freeing scanner
  fs/nilfs2: fix potential underflow in call to crc32_le
  mm, compaction: abort free scanner if split fails
  mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask
  dmaengine: at_xdmac: double FIFO flush needed to compute residue
  dmaengine: at_xdmac: fix residue corruption
  dmaengine: at_xdmac: align descriptors on 64 bits
  x86/quirks: Add early quirk to reset Apple AirPort card
  x86/quirks: Reintroduce scanning of secondary buses
  x86/quirks: Apply nvidia_bugs quirk only on root bus
  USB: OHCI: Don't mark EDs as ED_OPER if scheduling fails

Conflicts:
	arch/arm/kernel/topology.c
	arch/arm64/include/asm/arch_gicv3.h
	arch/arm64/kernel/topology.c
	block/bio.c
	drivers/cpufreq/Kconfig
	drivers/md/Makefile
	drivers/media/dvb-core/dvb_ringbuffer.c
	drivers/media/tuners/tuner-xc2028.c
	drivers/misc/Kconfig
	drivers/misc/Makefile
	drivers/mmc/core/host.c
	drivers/scsi/ufs/ufshcd.c
	drivers/scsi/ufs/ufshcd.h
	drivers/usb/dwc3/gadget.c
	drivers/usb/gadget/configfs.c
	fs/ecryptfs/file.c
	include/linux/mmc/core.h
	include/linux/mmc/host.h
	include/linux/mmzone.h
	include/linux/sched.h
	include/linux/sched/sysctl.h
	include/trace/events/power.h
	include/trace/events/sched.h
	init/Kconfig
	kernel/cpuset.c
	kernel/exit.c
	kernel/sched/Makefile
	kernel/sched/core.c
	kernel/sched/cputime.c
	kernel/sched/fair.c
	kernel/sched/features.h
	kernel/sched/rt.c
	kernel/sched/sched.h
	kernel/sched/stop_task.c
	kernel/sched/tune.c
	lib/Kconfig.debug
	mm/Makefile
	mm/vmstat.c

Change-Id: I243a43231ca56a6362076fa6301827e1b0493be5
Signed-off-by: Runmin Wang <runminw@codeaurora.org>
commit efbe378b81
Runmin Wang <runminw@codeaurora.org>  2016-12-12 15:32:39 -08:00
656 changed files with 15023 additions and 3955 deletions

@@ -0,0 +1,8 @@
Memory bandwidth and frequency state tracking
Required properties:
- compatible : should be:
"memory-state-time"
- freq-tbl: Should contain entries with each frequency in Hz.
- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps.
Must match the framework power_profile.xml for the device.
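Example (illustrative only; the frequencies and bucket limits below are
hypothetical and must be replaced with the device's actual values):

	memory-state-time {
		compatible = "memory-state-time";
		/* supported memory frequencies, in Hz */
		freq-tbl = <100000000 200000000 400000000 800000000>;
		/* upper bound of each bandwidth bucket, in Mbps */
		bw-buckets = <381 762 1525 3052>;
	};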

@@ -0,0 +1,360 @@
===========================================================
Energy cost bindings for Energy Aware Scheduling
===========================================================
===========================================================
1 - Introduction
===========================================================
This note specifies bindings required for energy-aware scheduling
(EAS)[1]. Historically, the scheduler's primary objective has been
performance. EAS aims to provide an alternative objective - energy
efficiency. EAS relies on a simple platform energy cost model to
guide scheduling decisions. The model only considers the CPU
subsystem.
This note is aligned with the definition of the layout of physical
CPUs in the system as described in the ARM topology binding
description [2]. The concept is applicable to any system so long as
the cost model data is provided for those processing elements in
that system's topology that EAS is required to service.
Processing elements refer to hardware threads, CPUs and clusters of
related CPUs in increasing order of hierarchy.
EAS requires two key cost metrics - busy costs and idle costs. Busy
costs comprise a list of compute capacities for the processing
element in question and the corresponding power consumption at that
capacity. Idle costs comprise a list of power consumption values
for each idle state [C-state] that the processing element supports.
For a detailed description of these metrics, their derivation and
their use see [3].
These cost metrics are required for processing elements in all
scheduling domain levels that EAS is required to service.
===========================================================
2 - energy-costs node
===========================================================
Energy costs for the processing elements in scheduling domains that
EAS is required to service are defined in the energy-costs node
which acts as a container for the actual per processing element cost
nodes. A single energy-costs node is required for a given system.
- energy-costs node
Usage: Required
Description: The energy-costs node is a container node and
its sub-nodes describe costs for each processing element at
all scheduling domain levels that EAS is required to
service.
Node name must be "energy-costs".
The energy-costs node's parent node must be the cpus node.
The energy-costs node's child nodes can be:
- one or more cost nodes.
Any other configuration is considered invalid.
The energy-costs node can only contain a single type of child node
whose bindings are described in paragraph 4.
===========================================================
3 - energy-costs node child nodes naming convention
===========================================================
energy-costs child nodes must follow a naming convention where the
node name must be "thread-costN", "core-costN", "cluster-costN"
depending on whether the costs in the node are for a thread, core or
cluster. N (where N = {0, 1, ...}) is the node number and has no
bearing on the OS' logical thread, core or cluster index.
===========================================================
4 - cost node bindings
===========================================================
Bindings for cost nodes are defined as follows:
- cluster-cost node
Description: must be declared within an energy-costs node. A
system can contain multiple clusters and each cluster
serviced by EAS must have a corresponding cluster-costs
node.
The cluster-cost node name must be "cluster-costN" as
described in 3 above.
A cluster-cost node must be a leaf node with no children.
Properties for cluster-cost nodes are described in paragraph
5 below.
Any other configuration is considered invalid.
- core-cost node
Description: must be declared within an energy-costs node. A
system can contain multiple cores and each core serviced by
EAS must have a corresponding core-cost node.
The core-cost node name must be "core-costN" as described in
3 above.
A core-cost node must be a leaf node with no children.
Properties for core-cost nodes are described in paragraph
5 below.
Any other configuration is considered invalid.
- thread-cost node
Description: must be declared within an energy-costs node. A
system can contain cores with multiple hardware threads and
each thread serviced by EAS must have a corresponding
thread-cost node.
The thread-cost node name must be "thread-costN" as described in
3 above.
A thread-cost node must be a leaf node with no children.
Properties for thread-cost nodes are described in paragraph
5 below.
Any other configuration is considered invalid.
===========================================================
5 - Cost node properties
==========================================================
All cost node types must have only the following properties:
- busy-cost-data
Usage: required
Value type: An array of 2-item tuples. Each item is of type
u32.
Definition: The first item in the tuple is the capacity
value as described in [3]. The second item in the tuple is
the energy cost value as described in [3].
- idle-cost-data
Usage: required
Value type: An array of 1-item tuples. The item is of type
u32.
Definition: The item in the tuple is the energy cost value
as described in [3].
===========================================================
6 - Extensions to the cpu node
===========================================================
The cpu node is extended with a property that establishes the
connection between the processing element represented by the cpu
node and the cost-nodes associated with this processing element.
The connection is expressed in line with the topological hierarchy
that this processing element belongs to starting with the level in
the hierarchy that this processing element itself belongs to through
to the highest level that EAS is required to service. The
connection cannot be sparse and must be contiguous from the
processing element's level through to the highest desired level. The
highest desired level must be the same for all processing elements.
Example: Given that a cpu node may represent a thread that is a part
of a core, this property may contain multiple elements which
associate the thread with cost nodes describing the costs for the
thread itself, the core the thread belongs to, the cluster the core
belongs to and so on. The elements must be ordered from the lowest
level nodes to the highest desired level that EAS must service. The
highest desired level must be the same for all cpu nodes. The
elements must not be sparse: there must be elements for the current
thread, the next level of hierarchy (core) and so on without any
'holes'.
Example: Given that a cpu node may represent a core that is a part
of a cluster of related cpus this property may contain multiple
elements which associate the core with cost nodes describing the
costs for the core itself, the cluster the core belongs to and so
on. The elements must be ordered from the lowest level nodes to the
highest desired level that EAS must service. The highest desired
level must be the same for all cpu nodes. The elements must not be
sparse: there must be elements for the current core, the next
level of hierarchy (cluster) and so on without any 'holes'.
If the system comprises of hierarchical clusters of clusters, this
property will contain multiple associations with the relevant number
of cluster elements in hierarchical order.
Property added to the cpu node:
- sched-energy-costs
Usage: required
Value type: List of phandles
Definition: a list of phandles to specific cost nodes in the
energy-costs parent node that correspond to the processing
element represented by this cpu node in hierarchical order
of topology.
The order of phandles in the list is significant. The first
phandle is to the current processing element's own cost
node. Subsequent phandles are to higher hierarchical level
cost nodes up until the maximum level that EAS is to
service.
All cpu nodes must have the same highest level cost node.
The phandle list must not be sparsely populated with handles
to non-contiguous hierarchical levels. See commentary above
for clarity.
Any other configuration is invalid.
===========================================================
7 - Example dts
===========================================================
Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):
cpus {
#address-cells = <2>;
#size-cells = <0>;
.
.
.
A57_0: cpu@0 {
compatible = "arm,cortex-a57","arm,armv8";
reg = <0x0 0x0>;
device_type = "cpu";
enable-method = "psci";
next-level-cache = <&A57_L2>;
clocks = <&scpi_dvfs 0>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
};
A57_1: cpu@1 {
compatible = "arm,cortex-a57","arm,armv8";
reg = <0x0 0x1>;
device_type = "cpu";
enable-method = "psci";
next-level-cache = <&A57_L2>;
clocks = <&scpi_dvfs 0>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
};
A53_0: cpu@100 {
compatible = "arm,cortex-a53","arm,armv8";
reg = <0x0 0x100>;
device_type = "cpu";
enable-method = "psci";
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
};
A53_1: cpu@101 {
compatible = "arm,cortex-a53","arm,armv8";
reg = <0x0 0x101>;
device_type = "cpu";
enable-method = "psci";
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
};
A53_2: cpu@102 {
compatible = "arm,cortex-a53","arm,armv8";
reg = <0x0 0x102>;
device_type = "cpu";
enable-method = "psci";
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
};
A53_3: cpu@103 {
compatible = "arm,cortex-a53","arm,armv8";
reg = <0x0 0x103>;
device_type = "cpu";
enable-method = "psci";
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
};
energy-costs {
CPU_COST_0: core-cost0 {
busy-cost-data = <
417 168
579 251
744 359
883 479
1024 616
>;
idle-cost-data = <
15
0
>;
};
CPU_COST_1: core-cost1 {
busy-cost-data = <
235 33
302 46
368 61
406 76
447 93
>;
idle-cost-data = <
6
0
>;
};
CLUSTER_COST_0: cluster-cost0 {
busy-cost-data = <
417 24
579 32
744 43
883 49
1024 64
>;
idle-cost-data = <
65
24
>;
};
CLUSTER_COST_1: cluster-cost1 {
busy-cost-data = <
235 26
303 30
368 39
406 47
447 57
>;
idle-cost-data = <
56
17
>;
};
};
};
===============================================================================
[1] https://lkml.org/lkml/2015/5/12/728
[2] Documentation/devicetree/bindings/topology.txt
[3] Documentation/scheduler/sched-energy.txt

@@ -348,7 +348,7 @@ address perms offset dev inode pathname
 a7cb1000-a7cb2000 ---p 00000000 00:00 0
 a7cb2000-a7eb2000 rw-p 00000000 00:00 0
 a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0
 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
 a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
 a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
@@ -380,7 +380,6 @@ is not associated with a file:
 [heap] = the heap of the program
 [stack] = the stack of the main process
-[stack:1001] = the stack of the thread with tid 1001
 [vdso] = the "virtual dynamic shared object",
 the kernel system call handler
 [anon:<name>] = an anonymous mapping that has been
@@ -390,10 +389,8 @@ is not associated with a file:
 The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
 of the individual tasks of a process. In this file you will see a mapping marked
-as [stack] if that task sees it as a stack. This is a key difference from the
-content of /proc/PID/maps, where you will see all mappings that are being used
-as stack by all of those tasks. Hence, for the example above, the task-level
-map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+as [stack] if that task sees it as a stack. Hence, for the example above, the
+task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
 08048000-08049000 r-xp 00000000 03:00 8312 /opt/test
 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test

View file

@ -923,6 +923,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
dm= [DM] Allows early creation of a device-mapper device.
See Documentation/device-mapper/boot.txt.
dmasound= [HW,OSS] Sound subsystem buff
dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
this option disables the debugging code at boot.

View file

@ -271,3 +271,9 @@ Since the private key is used to sign modules, viruses and malware could use
the private key to sign modules and compromise the operating system. The
private key must be either destroyed or moved to a secure location and not kept
in the root node of the kernel source tree.
If you use the same private key to sign modules for multiple kernel
configurations, you must ensure that the module version information is
sufficient to prevent loading a module into a different kernel. Either
set CONFIG_MODVERSIONS=y or ensure that each configuration has a different
kernel release string by changing EXTRAVERSION or CONFIG_LOCALVERSION.

View file

@ -0,0 +1,362 @@
Energy cost model for energy-aware scheduling (EXPERIMENTAL)
Introduction
=============
The basic energy model uses platform energy data stored in sched_group_energy
data structures attached to the sched_groups in the sched_domain hierarchy. The
energy cost model offers two functions that can be used to guide scheduling
decisions:
1. static unsigned int sched_group_energy(struct energy_env *eenv)
2. static int energy_diff(struct energy_env *eenv)
sched_group_energy() estimates the energy consumed by all cpus in a specific
sched_group including any shared resources owned exclusively by this group of
cpus. Resources shared with other cpus are excluded (e.g. later level caches).
energy_diff() estimates the total energy impact of a utilization change. That
is, adding, removing, or migrating utilization (tasks).
Both functions use a struct energy_env to specify the scenario to be evaluated:
struct energy_env {
struct sched_group *sg_top;
struct sched_group *sg_cap;
int cap_idx;
int util_delta;
int src_cpu;
int dst_cpu;
int energy;
};
sg_top: sched_group to be evaluated. Not used by energy_diff().
sg_cap: sched_group covering the cpus in the same frequency domain. Set by
sched_group_energy().
cap_idx: Capacity state to be used for energy calculations. Set by
find_new_capacity().
util_delta: Amount of utilization to be added, removed, or migrated.
src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
-1 if no source (e.g. task wake-up).
dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
if utilization is removed (e.g. terminating tasks).
energy: Result of sched_group_energy().
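As an illustration of how these fields fit together (hypothetical values only;
sg_cap, cap_idx and energy are filled in by the model itself, and 'sg' stands
for whatever sched_group is being evaluated), migrating 256 units of
utilization from cpu 2 to cpu 1 could be described roughly like this:
    struct energy_env eenv = {
        .sg_top     = sg,    /* sched_group covering the affected cpus */
        .util_delta = 256,   /* utilization being migrated */
        .src_cpu    = 2,     /* utilization removed from cpu 2 */
        .dst_cpu    = 1,     /* utilization added to cpu 1 */
    };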
The metric used to represent utilization is the actual per-entity running time
averaged over time using a geometric series. Very similar to the existing
per-entity load-tracking, but _not_ scaled by task priority and capped by the
capacity of the cpu. The latter property does mean that utilization may
underestimate the compute requirements for tasks on fully/over-utilized cpus.
The greatest potential for energy savings without affecting performance too much
is in scenarios where the system isn't fully utilized. If the system is deemed
fully utilized, load-balancing should be done with task load (which includes task
priority) instead, in the interest of fairness and performance.
Background and Terminology
===========================
To make it clear from the start:
energy = [joule] (resource like a battery on powered devices)
power = energy/time = [joule/second] = [watt]
The goal of energy-aware scheduling is to minimize energy, while still getting
the job done. That is, we want to maximize:
performance [inst/s]
--------------------
power [W]
which is equivalent to minimizing:
energy [J]
-----------
instruction
while still getting 'good' performance. It is essentially an alternative
optimization objective to the current performance-only objective for the
scheduler. This alternative considers two objectives: energy-efficiency and
performance. Hence, there needs to be a user controllable knob to switch the
objective. Since it is early days, this is currently a sched_feature
(ENERGY_AWARE).
The idea behind introducing an energy cost model is to allow the scheduler to
evaluate the implications of its decisions rather than applying energy-saving
techniques blindly that may only have positive effects on some platforms. At
the same time, the energy cost model must be as simple as possible to minimize
the scheduler latency impact.
Platform topology
------------------
The system topology (cpus, caches, and NUMA information, not peripherals) is
represented in the scheduler by the sched_domain hierarchy which has
sched_groups attached at each level, each covering one or more cpus (see
sched-domains.txt for more details). To add energy awareness to the scheduler
we need to consider power and frequency domains.
Power domain:
A power domain is a part of the system that can be powered on/off
independently. Power domains are typically organized in a hierarchy where you
may be able to power down just a cpu or a group of cpus along with any
associated resources (e.g. shared caches). Powering up a cpu means that all
power domains it is a part of in the hierarchy must be powered up. Hence, it is
more expensive to power up the first cpu that belongs to a higher level power
domain than powering up additional cpus in the same high level domain. Two
level power domain hierarchy example:
Power source
+-------------------------------+----...
per group PD G G
| +----------+ |
+--------+-------| Shared | (other groups)
per-cpu PD G G | resource |
| | +----------+
+-------+ +-------+
| CPU 0 | | CPU 1 |
+-------+ +-------+
Frequency domain:
Frequency domains (P-states) typically cover the same group of cpus as one of
the power domain levels. That is, there might be several smaller power domains
sharing the same frequency (P-state) or there might be a power domain spanning
multiple frequency domains.
From a scheduling point of view there is no need to know the actual frequencies
[Hz]. All the scheduler cares about is the compute capacity available at the
current state (P-state) the cpu is in and any other available states. For that
reason, and to also factor in any cpu micro-architecture differences, compute
capacity scaling states are called 'capacity states' in this document. For SMP
systems this is equivalent to P-states. For mixed micro-architecture systems
(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture
performance relative to the other cpus in the system.
Energy modelling:
------------------
Due to the hierarchical nature of the power domains, the most obvious way to
model energy costs is to associate power and energy costs with
domains (groups of cpus). Energy costs of shared resources are associated with
the group of cpus that share the resources, only the cost of powering the
cpu itself and any private resources (e.g. private L1 caches) is associated
with the per-cpu groups (lowest level).
For example, for an SMP system with per-cpu power domains and a cluster level
(group of cpus) power domain we get the overall energy costs to be:
energy = energy_cluster + n * energy_cpu
where 'n' is the number of cpus powered up and energy_cluster is the cost paid
as soon as any cpu in the cluster is powered up.
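For example, with made-up numbers energy_cpu = 100 and energy_cluster = 300,
one busy cpu costs 400 while two busy cpus cost 500 rather than 800, because
the cluster cost is only paid once.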
The power and frequency domains can naturally be mapped onto the existing
sched_domain hierarchy and sched_groups by adding the necessary data to the
existing data structures.
The energy model considers energy consumption from two contributors (shown in
the illustration below):
1. Busy energy: Energy consumed while a cpu and the higher level groups that it
belongs to are busy running tasks. Busy energy is associated with the state of
the cpu, not an event. The time the cpu spends in this state varies. Thus, the
most obvious platform parameter for this contribution is busy power
(energy/time).
2. Idle energy: Energy consumed while a cpu and higher level groups that it
belongs to are idle (in a C-state). Like busy energy, idle energy is associated
with the state of the cpu. Thus, the platform parameter for this contribution
is idle power (energy/time).
Energy consumed during transitions from an idle-state (C-state) to a busy state
(P-state) or going the other way is ignored by the model to simplify the energy
model calculations.
Power
^
| busy->idle idle->busy
| transition transition
|
| _ __
| / \ / \__________________
|______________/ \ /
| \ /
| Busy \ Idle / Busy
| low P-state \____________/ high P-state
|
+------------------------------------------------------------> time
Busy |--------------| |-----------------|
Wakeup |------| |------|
Idle |------------|
The basic algorithm
====================
The basic idea is to determine the total energy impact when utilization is
added or removed by estimating the impact at each level in the sched_domain
hierarchy starting from the bottom (sched_group contains just a single cpu).
The energy cost comes from busy time (sched_group is awake because one or more
cpus are busy) and idle time (in an idle-state). Energy model numbers account
for energy costs associated with all cpus in the sched_group as a group.
for_each_domain(cpu, sd) {
sg = sched_group_of(cpu)
energy_before = curr_util(sg) * busy_power(sg)
+ (1-curr_util(sg)) * idle_power(sg)
energy_after = new_util(sg) * busy_power(sg)
+ (1-new_util(sg)) * idle_power(sg)
energy_diff += energy_before - energy_after
}
return energy_diff
{curr, new}_util: The cpu utilization at the lowest level and the overall
non-idle time for the entire group for higher levels. Utilization is in the
range 0.0 to 1.0 in the pseudo-code.
busy_power: The power consumption of the sched_group.
idle_power: The power consumption of the sched_group when idle.
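As a worked example with made-up numbers: for a group with busy_power = 100 and
idle_power = 10 (bogo-watts), curr_util = 0.25 and new_util = 0.50, one
iteration of the loop gives:
    energy_before = 0.25 * 100 + 0.75 * 10 = 32.5
    energy_after  = 0.50 * 100 + 0.50 * 10 = 55.0
    energy_diff  += 32.5 - 55.0            = -22.5
i.e. adding utilization makes this group more expensive, which shows up as a
negative contribution with the sign convention used in the pseudo-code above.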
Note: It is a fundamental assumption that the utilization is (roughly) scale
invariant. Task utilization tracking factors in any frequency scaling and
performance scaling differences due to different cpu micro-architectures such
that task utilization can be used across the entire system.
Platform energy data
=====================
struct sched_group_energy can be attached to sched_groups in the sched_domain
hierarchy and has the following members:
cap_states:
List of struct capacity_state representing the supported capacity states
(P-states). struct capacity_state has two members: cap and power, which
represent the compute capacity and the busy_power of the state. The
list must be ordered by capacity low->high.
nr_cap_states:
Number of capacity states in cap_states list.
idle_states:
List of struct idle_state containing the idle-state power cost for each
idle-state supported by the system, ordered by shallowest state first.
All states must be included at all levels in the hierarchy, i.e. a
sched_group spanning just a single cpu must also include coupled
idle-states (cluster states). In addition to the cpuidle idle-states,
the list must also contain an entry for idling using the arch
default idle (arch_idle_cpu()). Although this state may not be a true
hardware idle-state, it is considered the shallowest idle-state in the
energy model and must be the first entry. cpus may enter this state
(possibly 'active idling') if cpuidle decides not to enter a cpuidle
idle-state. Default idle may not be used when cpuidle is enabled.
In this case, it should just be a copy of the first cpuidle idle-state.
nr_idle_states:
Number of idle states in idle_states list.
There are no unit requirements for the energy cost data. Data can be normalized
with any reference, however, the normalization must be consistent across all
energy cost data. That is, one bogo-joule/watt must be the same quantity for
all data, but we don't care what it is.
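To illustrate the layout (made-up numbers; the full-size ARM TC2 tables added
later in this merge show the real thing), a minimal set of tables for a group
with two capacity states and two idle-states could look like:
    static struct capacity_state cap_states_example[] = {
        /* capacity, busy power -- made-up values */
        { .cap =  512, .power = 140, },
        { .cap = 1024, .power = 400, },
    };

    static struct idle_state idle_states_example[] = {
        { .power = 5 },  /* arch default idle ('active idle'), shallowest */
        { .power = 0 },  /* deepest idle-state */
    };

    static struct sched_group_energy energy_example = {
        .nr_cap_states  = ARRAY_SIZE(cap_states_example),
        .cap_states     = cap_states_example,
        .nr_idle_states = ARRAY_SIZE(idle_states_example),
        .idle_states    = idle_states_example,
    };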
A recipe for platform characterization
=======================================
Obtaining the actual model data for a particular platform requires some way of
measuring power/energy. There isn't a tool to help with this (yet). This
section provides a recipe for use as reference. It covers the steps used to
characterize the ARM TC2 development platform. This sort of measurement is
expected to be done anyway when tuning cpuidle and cpufreq for a given
platform.
The energy model needs two types of data (struct sched_group_energy holds
these) for each sched_group where energy costs should be taken into account:
1. Capacity state information
A list containing the compute capacity and power consumption when fully
utilized attributed to the group as a whole for each available capacity state.
At the lowest level (group contains just a single cpu) this is the power of the
cpu alone without including power consumed by resources shared with other cpus.
It basically needs to fit the basic modelling approach described in "Background
and Terminology" section:
energy_system = energy_shared + n * energy_cpu
for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at
the lowest level. 'energy_shared' is included at the next level which
represents the group of cpus among which the resources are shared.
This model is, of course, a simplification of reality. Thus, power/energy
attributions might not always exactly represent how the hardware is designed.
Also, busy power is likely to depend on the workload. It is therefore
recommended to use a representative mix of workloads when characterizing the
capacity states.
If the group has no capacity scaling support, the list will contain a single
state where power is the busy power attributed to the group. The capacity
should be set to a default value (1024).
When frequency domains include multiple power domains, the group representing
the frequency domain and all child groups share capacity states. This must be
indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
all levels that share the capacity state must have the list of capacity states
with the power set to the contribution of the individual group.
2. Idle power information
Stored in the idle_states list. The power number is the group idle power
consumption in each idle state, as well as when the group is idle but has not
entered an idle-state ('active idle' as mentioned earlier). Due to the way the
energy model is defined, the idle power of the deepest group idle state can
alternatively be accounted for in the parent group busy power. In that case the
group idle state power values are offset such that the idle power of the
deepest state is zero. It is less intuitive, but it is easier to measure as
idle power consumed by the group and the busy/idle power of the parent group
cannot be distinguished without per group measurement points.
Measuring capacity states and idle power:
The capacity states' capacity and power can be estimated by running a benchmark
workload at each available capacity state. By restricting the benchmark to run
on subsets of cpus it is possible to extrapolate the power consumption of
shared resources.
ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a
shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
benchmark workload on just one cpu in a cluster means that power is consumed in
the cluster (higher level group) and a single cpu (lowest level group). Adding
another benchmark task to another cpu increases the power consumption by the
amount consumed by the additional cpu. Hence, it is possible to extrapolate the
cluster busy power.
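For example (made-up numbers): if the cluster energy counter reports 500
bogo-watts with one cpu busy and 600 bogo-watts with two cpus busy, the per-cpu
busy power is roughly 100 and the remaining 400 can be attributed to the
cluster level.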
For platforms that don't have energy counters or equivalent instrumentation
built-in, it may be possible to use an external DAQ to acquire similar data.
If the benchmark includes some performance score (for example sysbench cpu
benchmark), this can be used to record the compute capacity.
Measuring idle power requires insight into the idle state implementation on the
particular platform, in particular whether it has coupled idle-states (or
package states). To measure non-coupled per-cpu idle-states it is necessary to
keep one cpu busy to keep any shared resources alive to isolate the idle power
of the cpu from idle/busy power of the shared resources. The cpu can be tricked
into different per-cpu idle states by disabling the other states. Based on
various combinations of measurements with specific cpus busy and disabling
idle-states it is possible to extrapolate the idle-state power.

View file

@ -0,0 +1,366 @@
Central, scheduler-driven, power-performance control
(EXPERIMENTAL)
Abstract
========
The topic of a single simple power-performance tunable, that is wholly
scheduler centric, and has well defined and predictable properties has come up
on several occasions in the past [1,2]. With techniques such as a scheduler
driven DVFS [3], we now have a good framework for implementing such a tunable.
This document describes the overall ideas behind its design and implementation.
Table of Contents
=================
1. Motivation
2. Introduction
3. Signal Boosting Strategy
4. OPP selection using boosted CPU utilization
5. Per task group boosting
6. Question and Answers
- What about "auto" mode?
- What about boosting on a congested system?
- How are CPUs boosted when we have tasks with multiple boost values?
7. References
1. Motivation
=============
Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
scheduler to select the optimal DVFS operating point (OPP) for running a task
allocated to a CPU. The introduction of sched-DVFS enables running workloads at
the most energy efficient OPPs.
However, sometimes it may be desired to intentionally boost the performance of
a workload even if that could imply a reasonable increase in energy
consumption. For example, in order to reduce the response time of a task, we
may want to run the task at a higher OPP than the one that is actually required
by its CPU bandwidth demand.
This last requirement is especially important if we consider that one of the
main goals of the sched-DVFS component is to replace all currently available
CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
driven governors we currently have, it is already more responsive at selecting
the optimal OPP to run tasks allocated to a CPU. However, just tracking the
actual task load demand may not be enough from a performance standpoint. For
example, it is not possible to get behaviors similar to those provided by the
"performance" and "interactive" CPUFreq governors.
This document describes an implementation of a tunable, stacked on top of the
sched-DVFS which extends its functionality to support task performance
boosting.
By "performance boosting" we mean the reduction of the time required to
complete a task activation, i.e. the time elapsed from a task wakeup to its
next deactivation (e.g. because it goes back to sleep or it terminates). For
example, if we consider a simple periodic task which executes the same workload
for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
that task must complete each of its activations in less than 5[s].
A previous attempt [5] to introduce such a boosting feature has not been
successful mainly because of the complexity of the proposed solution. The
approach described in this document exposes a single simple interface to
user-space. This single tunable knob allows the tuning of system wide
scheduler behaviours ranging from energy efficiency at one end through to
incremental performance boosting at the other end. This first tunable affects
all tasks. However, a more advanced extension of the concept is also provided
which uses CGroups to boost the performance of only selected tasks while using
the energy efficient default for all others.
The rest of this document introduces in more details the proposed solution
which has been named SchedTune.
2. Introduction
===============
SchedTune exposes a simple user-space interface with a single power-performance
tunable:
/proc/sys/kernel/sched_cfs_boost
This permits expressing a boost value as an integer in the range [0..100].
A value of 0 (default) configures the CFS scheduler for maximum energy
efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
required to satisfy their workload demand.
A value of 100 configures the scheduler for maximum performance, which translates
to the selection of the maximum OPP on that CPU.
Values between 0 and 100 can be used to suit other scenarios, for example to
favour interactive response or to adapt to other system events (battery level,
etc.).
A CGroup based extension is also provided, which permits further user-space
defined task classification to tune the scheduler for different goals depending
on the specific nature of the task, e.g. background vs interactive vs
low-priority.
The overall design of the SchedTune module is built on top of "Per-Entity Load
Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating
Performance Point (OPP) selection.
Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
the operating frequency of that CPU to better match the workload demand. The
selection of the actual OPP being activated is influenced by the global boost
value, or the boost value for the task CGroup when in use.
This simple biasing approach leverages existing frameworks, which means minimal
modifications to the scheduler, and yet it allows a range of different
behaviours to be achieved, all from a single simple tunable knob.
The only new concept introduced is that of signal boosting.
3. Signal Boosting Strategy
===========================
The whole PELT machinery works based on the value of a few load tracking signals
which basically track the CPU bandwidth requirements for tasks and the capacity
of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
some of these load tracking signals to make a task or RQ appear more demanding
than it actually is.
Which signals have to be inflated depends on the specific "consumer". However,
independently of the specific (signal, consumer) pair, it is important to
define a simple and possibly consistent strategy for the concept of boosting a
signal.
A boosting strategy defines how the "abstract" user-space defined
sched_cfs_boost value is translated into an internal "margin" value to be added
to a signal to get its inflated value:
margin := boosting_strategy(sched_cfs_boost, signal)
boosted_signal := signal + margin
Different boosting strategies were identified and analyzed before selecting the
one found to be most effective.
Signal Proportional Compensation (SPC)
--------------------------------------
In this boosting strategy the sched_cfs_boost value is used to compute a
margin which is proportional to the complement of the original signal.
When a signal has a maximum possible value, its complement is defined as
the delta between the actual value and its possible maximum.
Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
the maximum possible value, the margin becomes:
margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
Using this boosting strategy:
- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
- each value in the range of sched_cfs_boost effectively inflates the signal in
question by a quantity which is proportional to the maximum value.
For example, by applying the SPC boosting strategy to the selection of the OPP
to run a task it is possible to achieve these behaviors:
- 0% boosting: run the task at the minimum OPP required by its workload
- 100% boosting: run the task at the maximum OPP available for the CPU
- 50% boosting: run at the half-way OPP between minimum and maximum
This means that, at 50% boosting, a task will be scheduled to run at half of
the maximum theoretically achievable performance on the specific target
platform.
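A minimal sketch of the SPC computation in C, together with a worked example
(illustrative only, not the actual kernel helper; SCHED_LOAD_SCALE is assumed
to be 1024 and the boost value is taken as a percentage):
    /* boost is sched_cfs_boost in percent [0..100],
     * signal is in the range [0..SCHED_LOAD_SCALE]. */
    static unsigned long spc_boosted_signal(unsigned long signal, int boost)
    {
        unsigned long margin = (SCHED_LOAD_SCALE - signal) * boost / 100;

        return signal + margin;
    }

    /* e.g. signal = 200, boost = 50:
     *   margin  = (1024 - 200) * 50 / 100 = 412
     *   boosted = 200 + 412               = 612, i.e. midway to the maximum. */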
A graphical representation of an SPC boosted signal is represented in the
following figure where:
a) "-" represents the original signal
b) "b" represents a 50% boosted signal
c) "p" represents a 100% boosted signal
^
| SCHED_LOAD_SCALE
+-----------------------------------------------------------------+
|pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
|
| boosted_signal
| bbbbbbbbbbbbbbbbbbbbbbbb
|
| original signal
| bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
| |
|bbbbbbbbbbbbbbbbbb |
| |
| |
| |
| +-----------------------+
| |
| |
| |
|------------------+
|
|
+----------------------------------------------------------------------->
The plot above shows a ramped load signal (titled 'original signal') and its
boosted equivalent. For each step of the original signal the boosted signal
corresponding to a 50% boost is midway between the original signal and the upper
bound. Boosting by 100% generates a boosted signal which is always saturated to
the upper bound.
4. OPP selection using boosted CPU utilization
==============================================
It is worth calling out that the implementation does not introduce any new load
signals. Instead, it provides an API to tune existing signals. This tuning is
done on demand and only in scheduler code paths where it is sensible to do so.
The new API calls are defined to return either the default signal or a boosted
one, depending on the value of sched_cfs_boost. This is a clean and non-invasive
modification of the existing code paths.
The signal representing a CPU's utilization is boosted according to the
previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
(i.e. CFS run-queue) to appear more utilized than it actually is.
Thus, with the sched_cfs_boost enabled we have the following main functions to
get the current utilization of a CPU:
cpu_util()
boosted_cpu_util()
The new boosted_cpu_util() is similar to the first but returns a boosted
utilization signal which is a function of the sched_cfs_boost value.
This function is used in the CFS scheduler code paths where sched-DVFS needs to
decide the OPP to run a CPU at.
For example, this allows selecting the highest OPP for a CPU which has
the boost value set to 100%.
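Conceptually (again only a sketch, reusing the illustrative spc_boosted_signal()
helper from the example in section 3; sched_cfs_boost here stands for the
current value of the tunable, not the actual variable name):
    static unsigned long boosted_cpu_util(int cpu)
    {
        return spc_boosted_signal(cpu_util(cpu), sched_cfs_boost);
    }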
5. Per task group boosting
==========================
The availability of a single knob which is used to boost all tasks in the
system is certainly a simple solution but it quite likely doesn't fit many
utilization scenarios, especially in the mobile device space.
For example, on battery powered devices there usually are many background
services which are long running and need energy efficient scheduling. On the
other hand, some applications are more performance sensitive and require an
interactive response and/or maximum performance, regardless of the energy cost.
To better service such scenarios, the SchedTune implementation has an extension
that provides a more fine grained boosting interface.
A new CGroup controller, namely "schedtune", could be enabled which allows to
defined and configure task groups with different boosting values.
Tasks that require special performance can be put into separate CGroups.
The value of the boost associated with the tasks in this group can be specified
using a single knob exposed by the CGroup controller:
schedtune.boost
This knob allows the definition of a boost value that is to be used for
SPC boosting of all tasks attached to this group.
The current schedtune controller implementation is really simple and has these
main characteristics:
1) It is only possible to create 1 level depth hierarchies
The root control group defines the system-wide boost value to be applied
by default to all tasks. Its direct subgroups are named "boost groups" and
they define the boost value for a specific set of tasks.
Further nested subgroups are not allowed since they do not have a sensible
meaning from a user-space standpoint.
2) It is possible to define only a limited number of "boost groups"
This number is defined at compile time and by default configured to 16.
This is a design decision motivated by two main reasons:
a) In a real system we do not expect utilization scenarios with more than a few
boost groups. For example, a reasonable collection of groups could be
just "background", "interactive" and "performance".
b) It simplifies the implementation considerably, especially for the code
which has to compute the per CPU boosting once there are multiple
RUNNABLE tasks with different boost values.
Such a simple design should allow servicing the main utilization scenarios identified
so far. It provides a simple interface which can be used to manage the
power-performance of all tasks or only selected tasks.
Moreover, this interface can be easily integrated by user-space run-times (e.g.
Android, ChromeOS) to implement a QoS solution for task boosting based on task
classification, which has been a long-standing requirement.
Setup and usage
---------------
0. Use a kernel with CGROUP_SCHEDTUNE support enabled
1. Check that the "schedtune" CGroup controller is available:
root@linaro-nano:~# cat /proc/cgroups
#subsys_name hierarchy num_cgroups enabled
cpuset 0 1 1
cpu 0 1 1
schedtune 0 1 1
2. Mount a tmpfs to create the CGroups mount point (Optional)
root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
3. Mount the "schedtune" controller
root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
4. Setup the system-wide boost value (Optional)
If not configured, the root control group has a 0% boost value, which
basically disables boosting for all tasks in the system, thus running them in
an energy-efficient mode.
root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost
5. Create task groups and configure their specific boost value (Optional)
For example here we create a "performance" boost group configure to boost
all its tasks to 100%
root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
6. Move tasks into the boost group
For example, the following moves the tasks with PID $TASKPID (and all its
threads) into the "performance" boost group.
root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
This simple configuration allows only the threads of the $TASKPID task to run,
when needed, at the highest OPP in the most capable CPU of the system.
6. Question and Answers
=======================
What about "auto" mode?
-----------------------
The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
with some suitable user-space element. This element could use the exposed
system-wide or cgroup based interface.
How are multiple groups of tasks with different boost values managed?
---------------------------------------------------------------------
The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization
is boosted with a value which is the maximum of the boost values of the
currently RUNNABLE tasks in its RQ.
This allows sched-DVFS to boost a CPU only while there are boosted tasks ready
to run and switch back to the energy efficient mode as soon as the last boosted
task is dequeued.
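A sketch of that aggregation (data structures and names are illustrative only,
not the actual implementation):
    /* Effective boost of a CPU: the maximum boost among the boost groups
     * that currently have RUNNABLE tasks on this CPU. */
    static int cpu_effective_boost(const int *group_boost,
                                   const int *group_nr_tasks, int nr_groups)
    {
        int boost = 0, idx;

        for (idx = 0; idx < nr_groups; idx++)
            if (group_nr_tasks[idx] > 0 && group_boost[idx] > boost)
                boost = group_boost[idx];

        return boost;
    }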
7. References
=============
[1] http://lwn.net/Articles/552889
[2] http://lkml.org/lkml/2012/5/18/91
[3] http://lkml.org/lkml/2015/6/26/620

View file

@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with
"debugpat" boot parameter. With this parameter, various debug messages are
printed to dmesg log.
PAT Initialization
------------------
The following table describes how PAT is initialized under various
configurations. The PAT MSR must be updated by Linux in order to support WC
and WT attributes. Otherwise, the PAT MSR has the value programmed in it
by the firmware. Note, Xen enables WC attribute in the PAT MSR for guests.
MTRR PAT   Call Sequence               PAT State  PAT MSR
=========================================================
E    E     MTRR -> PAT init            Enabled    OS
E    D     MTRR -> PAT init            Disabled   -
D    E     MTRR -> PAT disable         Disabled   BIOS
D    D     MTRR -> PAT disable         Disabled   -
-    np/E  PAT  -> PAT disable         Disabled   BIOS
-    np/D  PAT  -> PAT disable         Disabled   -
E    !P/E  MTRR -> PAT init            Disabled   BIOS
D    !P/E  MTRR -> PAT disable         Disabled   BIOS
!M   !P/E  MTRR stub -> PAT disable    Disabled   BIOS
Legend
------------------------------------------------
E         Feature enabled in CPU
D         Feature disabled/unsupported in CPU
np        "nopat" boot option specified
!P        CONFIG_X86_PAT option unset
!M        CONFIG_MTRR option unset
Enabled   PAT state set to enabled
Disabled  PAT state set to disabled
OS        PAT initializes PAT MSR with OS setting
BIOS      PAT keeps PAT MSR with BIOS setting

View file

@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 4
SUBLEVEL = 16
SUBLEVEL = 21
EXTRAVERSION =
NAME = Blurry Fish Butt

View file

@ -141,6 +141,7 @@ CONFIG_PROFILING=y
CONFIG_QUOTA=y
CONFIG_RTC_CLASS=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_SECCOMP=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y

View file

@ -11,6 +11,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_CC_STACKPROTECTOR_STRONG=y
CONFIG_COMPACTION=y
CONFIG_DEBUG_RODATA=y
CONFIG_DM_UEVENT=y
@ -118,6 +119,7 @@ CONFIG_TIMER_STATS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_UHID=y
CONFIG_MEMORY_STATE_TIME=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_HIDDEV=y

View file

@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG
endchoice
config HAVE_ARCH_WITHIN_STACK_FRAMES
bool
help
An architecture should select this if it can walk the kernel stack
frames to determine if an object is part of either the arguments
or local variables (i.e. that it excludes saved return addresses,
and similar) by implementing an inline arch_within_stack_frames(),
which is used by CONFIG_HARDENED_USERCOPY.
config HAVE_CONTEXT_TRACKING
bool
help

View file

@ -18,6 +18,20 @@ cflags-y += -fno-common -pipe -fno-builtin -D__linux__
cflags-$(CONFIG_ISA_ARCOMPACT) += -mA7
cflags-$(CONFIG_ISA_ARCV2) += -mcpu=archs
is_700 = $(shell $(CC) -dM -E - < /dev/null | grep -q "ARC700" && echo 1 || echo 0)
ifdef CONFIG_ISA_ARCOMPACT
ifeq ($(is_700), 0)
$(error Toolchain not configured for ARCompact builds)
endif
endif
ifdef CONFIG_ISA_ARCV2
ifeq ($(is_700), 1)
$(error Toolchain not configured for ARCv2 builds)
endif
endif
ifdef CONFIG_ARC_CURR_IN_REG
# For a global register defintion, make sure it gets passed to every file
# We had a customer reported bug where some code built in kernel was NOT using
@ -48,8 +62,6 @@ endif
endif
cflags-$(CONFIG_ARC_DW2_UNWIND) += -fasynchronous-unwind-tables
# By default gcc 4.8 generates dwarf4 which kernel unwinder can't grok
ifeq ($(atleast_gcc48),y)
cflags-$(CONFIG_ARC_DW2_UNWIND) += -gdwarf-2

View file

@ -374,12 +374,6 @@ static inline int is_isa_arcompact(void)
return IS_ENABLED(CONFIG_ISA_ARCOMPACT);
}
#if defined(CONFIG_ISA_ARCOMPACT) && !defined(_CPU_DEFAULT_A7)
#error "Toolchain not configured for ARCompact builds"
#elif defined(CONFIG_ISA_ARCV2) && !defined(_CPU_DEFAULT_HS)
#error "Toolchain not configured for ARCv2 builds"
#endif
#endif /* __ASEMBLY__ */
#endif /* _ASM_ARC_ARCREGS_H */

View file

@ -142,7 +142,7 @@
#ifdef CONFIG_ARC_CURR_IN_REG
; Retrieve orig r25 and save it with rest of callee_regs
ld.as r12, [r12, PT_user_r25]
ld r12, [r12, PT_user_r25]
PUSH r12
#else
PUSH r25
@ -198,7 +198,7 @@
; SP is back to start of pt_regs
#ifdef CONFIG_ARC_CURR_IN_REG
st.as r12, [sp, PT_user_r25]
st r12, [sp, PT_user_r25]
#endif
.endm

View file

@ -188,10 +188,10 @@ static inline int arch_irqs_disabled(void)
.endm
.macro IRQ_ENABLE scratch
TRACE_ASM_IRQ_ENABLE
lr \scratch, [status32]
or \scratch, \scratch, (STATUS_E1_MASK | STATUS_E2_MASK)
flag \scratch
TRACE_ASM_IRQ_ENABLE
.endm
#endif /* __ASSEMBLY__ */

View file

@ -110,7 +110,7 @@
#define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE)
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL)
/* More Abbrevaited helpers */
#define PAGE_U_NONE __pgprot(___DEF)

View file

@ -142,7 +142,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
* prelogue is setup (callee regs saved and then fp set and not other
* way around
*/
pr_warn("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
pr_warn_once("CONFIG_ARC_DW2_UNWIND needs to be enabled\n");
return 0;
#endif

View file

@ -914,6 +914,15 @@ void arc_cache_init(void)
printk(arc_cache_mumbojumbo(0, str, sizeof(str)));
/*
* Only master CPU needs to execute rest of function:
* - Assume SMP so all cores will have same cache config so
* any geomtry checks will be same for all
* - IOC setup / dma callbacks only need to be setup once
*/
if (cpu)
return;
if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE)) {
struct cpuinfo_arc_cache *ic = &cpuinfo_arc700[cpu].icache;

View file

@ -36,6 +36,7 @@ config ARM
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_TRACEHOOK
select HAVE_BPF_JIT

View file

@ -84,6 +84,7 @@
regulator-name = "emac-3v3";
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
startup-delay-us = <20000>;
enable-active-high;
gpio = <&pio 7 15 GPIO_ACTIVE_HIGH>;
};

View file

@ -66,6 +66,7 @@
regulator-name = "emac-3v3";
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
startup-delay-us = <20000>;
enable-active-high;
gpio = <&pio 7 19 GPIO_ACTIVE_HIGH>;
};

View file

@ -80,6 +80,7 @@
regulator-name = "emac-3v3";
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
startup-delay-us = <20000>;
enable-active-high;
gpio = <&pio 7 19 GPIO_ACTIVE_HIGH>; /* PH19 */
};

View file

@ -79,6 +79,7 @@
regulator-name = "emac-3v3";
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
startup-delay-us = <20000>;
enable-active-high;
gpio = <&pio 0 2 GPIO_ACTIVE_HIGH>;
};

View file

@ -3,6 +3,7 @@
#ifdef CONFIG_ARM_CPU_TOPOLOGY
#include <linux/cpufreq.h>
#include <linux/cpumask.h>
struct cputopo_arm {
@ -24,6 +25,12 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
#else
static inline void init_cpu_topology(void) { }

View file

@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
static inline unsigned long __must_check
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
unsigned int __ua_flags = uaccess_save_and_enable();
unsigned int __ua_flags;
check_object_size(to, n, false);
__ua_flags = uaccess_save_and_enable();
n = arm_copy_from_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
@ -511,11 +514,15 @@ static inline unsigned long __must_check
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
#ifndef CONFIG_UACCESS_WITH_MEMCPY
unsigned int __ua_flags = uaccess_save_and_enable();
unsigned int __ua_flags;
check_object_size(from, n, true);
__ua_flags = uaccess_save_and_enable();
n = arm_copy_to_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
#else
check_object_size(from, n, true);
return arm_copy_to_user(to, from, n);
#endif
}

View file

@ -781,7 +781,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
struct resource *res;
kernel_code.start = virt_to_phys(_text);
kernel_code.end = virt_to_phys(_etext - 1);
kernel_code.end = virt_to_phys(__init_begin - 1);
kernel_data.start = virt_to_phys(_sdata);
kernel_data.end = virt_to_phys(_end - 1);

View file

@ -279,8 +279,12 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
mm_segment_t fs;
long ret, err, i;
if (maxevents <= 0 || maxevents > (INT_MAX/sizeof(struct epoll_event)))
if (maxevents <= 0 ||
maxevents > (INT_MAX/sizeof(*kbuf)) ||
maxevents > (INT_MAX/sizeof(*events)))
return -EINVAL;
if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents))
return -EFAULT;
kbuf = kmalloc(sizeof(*kbuf) * maxevents, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
@ -317,6 +321,8 @@ asmlinkage long sys_oabi_semtimedop(int semid,
if (nsops < 1 || nsops > SEMOPM)
return -EINVAL;
if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops))
return -EFAULT;
sops = kmalloc(sizeof(*sops) * nsops, GFP_KERNEL);
if (!sops)
return -ENOMEM;

View file

@ -42,9 +42,15 @@
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale);
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
#ifdef CONFIG_CPU_FREQ
unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
#else
return per_cpu(cpu_scale, cpu);
#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
@ -343,6 +349,8 @@ out:
return ret;
}
static const struct sched_group_energy * const cpu_core_energy(int cpu);
/*
* Look for a customed capacity of a CPU in the cpu_capacity table during the
* boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
@ -350,10 +358,14 @@ out:
*/
static void update_cpu_capacity(unsigned int cpu)
{
if (!cpu_capacity(cpu))
return;
unsigned long capacity = SCHED_CAPACITY_SCALE;
set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
if (cpu_core_energy(cpu)) {
int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
}
set_capacity_scale(cpu, capacity);
pr_info("CPU%u: update cpu_capacity %lu\n",
cpu, arch_scale_cpu_capacity(NULL, cpu));
@ -464,17 +476,138 @@ topology_populated:
update_cpu_capacity(cpuid);
}
/*
* ARM TC2 specific energy cost model data. There are no unit requirements for
* the data. Data can be normalized to any reference point, but the
* normalization must be consistent. That is, one bogo-joule/watt must be the
* same quantity for all data, but we don't care what it is.
*/
static struct idle_state idle_states_cluster_a7[] = {
{ .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */
{ .power = 25 }, /* WFI */
{ .power = 10 }, /* cluster-sleep-l */
};
static struct idle_state idle_states_cluster_a15[] = {
{ .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */
{ .power = 70 }, /* WFI */
{ .power = 25 }, /* cluster-sleep-b */
};
static struct capacity_state cap_states_cluster_a7[] = {
/* Cluster only power */
{ .cap = 150, .power = 2967, }, /* 350 MHz */
{ .cap = 172, .power = 2792, }, /* 400 MHz */
{ .cap = 215, .power = 2810, }, /* 500 MHz */
{ .cap = 258, .power = 2815, }, /* 600 MHz */
{ .cap = 301, .power = 2919, }, /* 700 MHz */
{ .cap = 344, .power = 2847, }, /* 800 MHz */
{ .cap = 387, .power = 3917, }, /* 900 MHz */
{ .cap = 430, .power = 4905, }, /* 1000 MHz */
};
static struct capacity_state cap_states_cluster_a15[] = {
/* Cluster only power */
{ .cap = 426, .power = 7920, }, /* 500 MHz */
{ .cap = 512, .power = 8165, }, /* 600 MHz */
{ .cap = 597, .power = 8172, }, /* 700 MHz */
{ .cap = 682, .power = 8195, }, /* 800 MHz */
{ .cap = 768, .power = 8265, }, /* 900 MHz */
{ .cap = 853, .power = 8446, }, /* 1000 MHz */
{ .cap = 938, .power = 11426, }, /* 1100 MHz */
{ .cap = 1024, .power = 15200, }, /* 1200 MHz */
};
static struct sched_group_energy energy_cluster_a7 = {
.nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7),
.idle_states = idle_states_cluster_a7,
.nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7),
.cap_states = cap_states_cluster_a7,
};
static struct sched_group_energy energy_cluster_a15 = {
.nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15),
.idle_states = idle_states_cluster_a15,
.nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15),
.cap_states = cap_states_cluster_a15,
};
static struct idle_state idle_states_core_a7[] = {
{ .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
{ .power = 0 }, /* WFI */
{ .power = 0 }, /* cluster-sleep-l */
};
static struct idle_state idle_states_core_a15[] = {
{ .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
{ .power = 0 }, /* WFI */
{ .power = 0 }, /* cluster-sleep-b */
};
static struct capacity_state cap_states_core_a7[] = {
/* Power per cpu */
{ .cap = 150, .power = 187, }, /* 350 MHz */
{ .cap = 172, .power = 275, }, /* 400 MHz */
{ .cap = 215, .power = 334, }, /* 500 MHz */
{ .cap = 258, .power = 407, }, /* 600 MHz */
{ .cap = 301, .power = 447, }, /* 700 MHz */
{ .cap = 344, .power = 549, }, /* 800 MHz */
{ .cap = 387, .power = 761, }, /* 900 MHz */
{ .cap = 430, .power = 1024, }, /* 1000 MHz */
};
static struct capacity_state cap_states_core_a15[] = {
/* Power per cpu */
{ .cap = 426, .power = 2021, }, /* 500 MHz */
{ .cap = 512, .power = 2312, }, /* 600 MHz */
{ .cap = 597, .power = 2756, }, /* 700 MHz */
{ .cap = 682, .power = 3125, }, /* 800 MHz */
{ .cap = 768, .power = 3524, }, /* 900 MHz */
{ .cap = 853, .power = 3846, }, /* 1000 MHz */
{ .cap = 938, .power = 5177, }, /* 1100 MHz */
{ .cap = 1024, .power = 6997, }, /* 1200 MHz */
};
static struct sched_group_energy energy_core_a7 = {
.nr_idle_states = ARRAY_SIZE(idle_states_core_a7),
.idle_states = idle_states_core_a7,
.nr_cap_states = ARRAY_SIZE(cap_states_core_a7),
.cap_states = cap_states_core_a7,
};
static struct sched_group_energy energy_core_a15 = {
.nr_idle_states = ARRAY_SIZE(idle_states_core_a15),
.idle_states = idle_states_core_a15,
.nr_cap_states = ARRAY_SIZE(cap_states_core_a15),
.cap_states = cap_states_core_a15,
};
/* sd energy functions */
static inline
const struct sched_group_energy * const cpu_cluster_energy(int cpu)
{
return cpu_topology[cpu].cluster_id ? &energy_cluster_a7 :
&energy_cluster_a15;
}
static inline
const struct sched_group_energy * const cpu_core_energy(int cpu)
{
return cpu_topology[cpu].cluster_id ? &energy_core_a7 :
&energy_core_a15;
}
static inline int cpu_corepower_flags(void)
{
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
SD_SHARE_CAP_STATES;
}
static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
{ NULL, },
};

View file

@ -120,6 +120,8 @@ SECTIONS
#ifdef CONFIG_DEBUG_RODATA
. = ALIGN(1<<SECTION_SHIFT);
#endif
_etext = .; /* End of text section */
RO_DATA(PAGE_SIZE)
. = ALIGN(4);
@ -150,8 +152,6 @@ SECTIONS
NOTES
_etext = .; /* End of text and rodata section */
#ifndef CONFIG_XIP_KERNEL
# ifdef CONFIG_ARM_KERNMEM_PERMS
. = ALIGN(1<<SECTION_SHIFT);

View file

@ -572,7 +572,7 @@ static void __init build_mem_type_table(void)
* in the Short-descriptor translation table format descriptors.
*/
if (cpu_arch == CPU_ARCH_ARMv7 &&
(read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) {
(read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) {
user_pmd_table |= PMD_PXNTABLE;
}
#endif

View file

@ -51,6 +51,7 @@ config ARM64
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_BITREVERSE
select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
@ -455,6 +456,15 @@ config CAVIUM_ERRATUM_22375
If unsure, say Y.
config CAVIUM_ERRATUM_23144
bool "Cavium erratum 23144: ITS SYNC hang on dual socket system"
depends on NUMA
default y
help
ITS SYNC command hang for cross node io and collections/cpu mapping.
If unsure, say Y.
config CAVIUM_ERRATUM_23154
bool "Cavium erratum 23154: Access to ICC_IAR1_EL1 is not sync'ed"
default y
@ -465,6 +475,17 @@ config CAVIUM_ERRATUM_23154
If unsure, say Y.
config CAVIUM_ERRATUM_27456
bool "Cavium erratum 27456: Broadcast TLBI instructions may cause icache corruption"
default y
help
On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI
instructions may cause the icache to become corrupted if it
contains data for a non-current ASID. The fix is to
invalidate the icache when changing the mm context.
If unsure, say Y.
endmenu

View file

@ -16,7 +16,7 @@ OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
GZFLAGS :=-9
ifneq ($(CONFIG_RELOCATABLE),)
LDFLAGS_vmlinux += -pie
LDFLAGS_vmlinux += -pie -Bsymbolic
endif
KBUILD_DEFCONFIG := defconfig

View file

@ -262,6 +262,8 @@
#io-channel-cells = <1>;
clocks = <&cru SCLK_SARADC>, <&cru PCLK_SARADC>;
clock-names = "saradc", "apb_pclk";
resets = <&cru SRST_SARADC>;
reset-names = "saradc-apb";
status = "disabled";
};
@ -517,7 +519,7 @@
#address-cells = <0>;
reg = <0x0 0xffb71000 0x0 0x1000>,
<0x0 0xffb72000 0x0 0x1000>,
<0x0 0xffb72000 0x0 0x2000>,
<0x0 0xffb74000 0x0 0x2000>,
<0x0 0xffb76000 0x0 0x2000>;
interrupts = <GIC_PPI 9

View file

@ -247,6 +247,8 @@ CONFIG_SCSI_UFSHCD_PLATFORM=y
CONFIG_SCSI_UFS_QCOM=y
CONFIG_SCSI_UFS_QCOM_ICE=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_MD_LINEAR=y
CONFIG_BLK_DEV_DM=y
CONFIG_DM_CRYPT=y
CONFIG_DM_REQ_CRYPT=y

View file

@ -249,6 +249,8 @@ CONFIG_SCSI_UFSHCD_PLATFORM=y
CONFIG_SCSI_UFS_QCOM=y
CONFIG_SCSI_UFS_QCOM_ICE=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_MD_LINEAR=y
CONFIG_BLK_DEV_DM=y
CONFIG_DM_CRYPT=y
CONFIG_DM_REQ_CRYPT=y

View file

@ -35,6 +35,8 @@
#define ARM64_ALT_PAN_NOT_UAO 10
#define ARM64_NCAPS 11
#define ARM64_WORKAROUND_CAVIUM_27456 12
#ifndef __ASSEMBLY__

View file

@ -140,6 +140,7 @@ typedef struct user_fpsimd_state elf_fpregset_t;
#define SET_PERSONALITY(ex) clear_thread_flag(TIF_32BIT);
/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
#define ARCH_DLINFO \
do { \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \

View file

@ -107,8 +107,6 @@
#define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \
TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
#define TCR_EL2_FLAGS (TCR_EL2_RES1 | TCR_EL2_PS_40B)
/* VTCR_EL2 Registers bits */
#define VTCR_EL2_RES1 (1 << 31)
#define VTCR_EL2_PS_MASK (7 << 16)

View file

@ -117,6 +117,8 @@ struct pt_regs {
};
u64 orig_x0;
u64 syscallno;
u64 orig_addr_limit;
u64 unused; // maintain 16 byte alignment
};
#define arch_has_single_step() (1)

View file

@ -23,6 +23,15 @@ void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
unsigned long arch_get_cpu_efficiency(int cpu);
struct sched_domain;
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
#include <asm-generic/topology.h>
#endif /* _ASM_ARM_TOPOLOGY_H */

View file

@ -269,24 +269,39 @@ do { \
-EFAULT; \
})
extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
{
check_object_size(to, n, false);
return __arch_copy_from_user(to, from, n);
}
static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
{
check_object_size(from, n, true);
return __arch_copy_to_user(to, from, n);
}
static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
n = __copy_from_user(to, from, n);
else /* security hole - plug it */
if (access_ok(VERIFY_READ, from, n)) {
check_object_size(to, n, false);
n = __arch_copy_from_user(to, from, n);
} else /* security hole - plug it */
memset(to, 0, n);
return n;
}
static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (access_ok(VERIFY_WRITE, to, n))
n = __copy_to_user(to, from, n);
if (access_ok(VERIFY_WRITE, to, n)) {
check_object_size(from, n, true);
n = __arch_copy_to_user(to, from, n);
}
return n;
}

View file

@ -19,4 +19,6 @@
/* vDSO location */
#define AT_SYSINFO_EHDR 33
#define AT_VECTOR_SIZE_ARCH 1 /* entries in ARCH_DLINFO */
#endif

View file

@ -34,8 +34,8 @@ EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
/* user mem (segment) */
EXPORT_SYMBOL(__copy_from_user);
EXPORT_SYMBOL(__copy_to_user);
EXPORT_SYMBOL(__arch_copy_from_user);
EXPORT_SYMBOL(__arch_copy_to_user);
EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(__copy_in_user);

View file

@ -58,6 +58,7 @@ int main(void)
DEFINE(S_PC, offsetof(struct pt_regs, pc));
DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0));
DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno));
DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit));
DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs));
BLANK();
DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter));

View file

@ -93,6 +93,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.capability = ARM64_WORKAROUND_CAVIUM_23154,
MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01),
},
#endif
#ifdef CONFIG_CAVIUM_ERRATUM_27456
{
/* Cavium ThunderX, T88 pass 1.x - 2.1 */
.desc = "Cavium erratum 27456",
.capability = ARM64_WORKAROUND_CAVIUM_27456,
MIDR_RANGE(MIDR_THUNDERX, 0x00,
(1 << MIDR_VARIANT_SHIFT) | 1),
},
#endif
{
}

View file

@ -152,7 +152,6 @@ static int debug_monitors_init(void)
/* Clear the OS lock. */
on_each_cpu(clear_os_lock, NULL, 1);
isb();
local_dbg_enable();
/* Register hotplug handler. */
__register_cpu_notifier(&os_lock_nb);

View file

@ -28,6 +28,7 @@
#include <asm/errno.h>
#include <asm/esr.h>
#include <asm/irq.h>
#include <asm/memory.h>
#include <asm/thread_info.h>
#include <asm/unistd.h>
@ -97,7 +98,13 @@
mov x29, xzr // fp pointed to user-space
.else
add x21, sp, #S_FRAME_SIZE
.endif
get_thread_info tsk
/* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
ldr x20, [tsk, #TI_ADDR_LIMIT]
str x20, [sp, #S_ORIG_ADDR_LIMIT]
mov x20, #TASK_SIZE_64
str x20, [tsk, #TI_ADDR_LIMIT]
.endif /* \el == 0 */
mrs x22, elr_el1
mrs x23, spsr_el1
stp lr, x21, [sp, #S_LR]
@ -128,6 +135,12 @@
.endm
.macro kernel_exit, el
.if \el != 0
/* Restore the task's original addr_limit. */
ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
str x20, [tsk, #TI_ADDR_LIMIT]
.endif
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
.if \el == 0
ct_user_enter

View file

@ -717,40 +717,25 @@ __primary_switch:
* Iterate over each entry in the relocation table, and apply the
* relocations in place.
*/
ldr w8, =__dynsym_offset // offset to symbol table
ldr w9, =__rela_offset // offset to reloc table
ldr w10, =__rela_size // size of reloc table
mov_q x11, KIMAGE_VADDR // default virtual offset
add x11, x11, x23 // actual virtual offset
add x8, x8, x11 // __va(.dynsym)
add x9, x9, x11 // __va(.rela)
add x10, x9, x10 // __va(.rela) + sizeof(.rela)
0: cmp x9, x10
b.hs 2f
b.hs 1f
ldp x11, x12, [x9], #24
ldr x13, [x9, #-8]
cmp w12, #R_AARCH64_RELATIVE
b.ne 1f
b.ne 0b
add x13, x13, x23 // relocate
str x13, [x11, x23]
b 0b
1: cmp w12, #R_AARCH64_ABS64
b.ne 0b
add x12, x12, x12, lsl #1 // symtab offset: 24x top word
add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word
ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx
ldr x15, [x12, #8] // Elf64_Sym::st_value
cmp w14, #-0xf // SHN_ABS (0xfff1) ?
add x14, x15, x23 // relocate
csel x15, x14, x15, ne
add x15, x13, x15
str x15, [x11, x23]
b 0b
2:
1:
#endif
ldr x8, =__primary_switched
br x8

View file

@ -214,7 +214,7 @@ static void __init request_standard_resources(void)
struct resource *res;
kernel_code.start = virt_to_phys(_text);
kernel_code.end = virt_to_phys(_etext - 1);
kernel_code.end = virt_to_phys(__init_begin - 1);
kernel_data.start = virt_to_phys(_sdata);
kernel_data.end = virt_to_phys(_end - 1);


@ -191,7 +191,6 @@ asmlinkage void secondary_start_kernel(void)
set_cpu_online(cpu, true);
complete(&cpu_running);
local_dbg_enable();
local_irq_enable();
local_async_enable();
@ -346,8 +345,8 @@ void __init smp_cpus_done(unsigned int max_cpus)
void __init smp_prepare_boot_cpu(void)
{
cpuinfo_store_boot_cpu();
set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
cpuinfo_store_boot_cpu();
}
static u64 __init of_get_cpu_mpidr(struct device_node *dn)


@ -20,6 +20,8 @@
#include <linux/of.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched_energy.h>
#include <asm/cputype.h>
#include <asm/topology.h>
@ -35,7 +37,7 @@
* rebalance_domains for all idle cores and the cpu_power can be updated
* during this sequence.
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale);
static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
@ -47,6 +49,22 @@ static void set_power_scale(unsigned int cpu, unsigned long power)
per_cpu(cpu_scale, cpu) = power;
}
unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
#ifdef CONFIG_CPU_FREQ
unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
#else
return per_cpu(cpu_scale, cpu);
#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
{
per_cpu(cpu_scale, cpu) = capacity;
}
static int __init get_cpu_for_node(struct device_node *node)
{
struct device_node *cpu_node;
@ -371,11 +389,67 @@ static void update_cpu_power(unsigned int cpu)
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
/* sd energy functions */
static inline
const struct sched_group_energy * const cpu_cluster_energy(int cpu)
{
struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];
if (!sge) {
pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
return NULL;
}
return sge;
}
static inline
const struct sched_group_energy * const cpu_core_energy(int cpu)
{
struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0];
if (!sge) {
pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
return NULL;
}
return sge;
}
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
}
static inline int cpu_corepower_flags(void)
{
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
SD_SHARE_CAP_STATES;
}
static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
{ NULL, },
};
static void update_cpu_capacity(unsigned int cpu)
{
unsigned long capacity = SCHED_CAPACITY_SCALE;
if (sched_energy_aware && cpu_core_energy(cpu)) {
int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
}
set_capacity_scale(cpu, capacity);
pr_info("CPU%d: update cpu_capacity %lu\n",
cpu, arch_scale_cpu_capacity(NULL, cpu));
}
static void update_siblings_masks(unsigned int cpuid)
{
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@ -438,6 +512,7 @@ void store_cpu_topology(unsigned int cpuid)
topology_populated:
update_siblings_masks(cpuid);
update_cpu_power(cpuid);
update_cpu_capacity(cpuid);
}
static void __init reset_cpu_topology(void)
@ -479,10 +554,12 @@ void __init init_cpu_topology(void)
if (of_have_populated_dt() && parse_dt_topology()) {
reset_cpu_topology();
} else {
set_sched_topology(arm64_topology);
for_each_possible_cpu(cpu)
update_siblings_masks(cpu);
}
reset_cpu_power();
parse_dt_cpu_power();
init_sched_energy_costs();
}
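
The capacity values above are fixed-point numbers in SCHED_CAPACITY_SCALE units (1 << SCHED_CAPACITY_SHIFT == 1024), so scale_cpu_capacity() is a single multiply-and-shift. A worked example with made-up figures:

    /* Hypothetical values, purely to illustrate the arithmetic. */
    unsigned long cpu_scale      = 1024;   /* big core at full microarchitectural capacity */
    unsigned long max_freq_scale = 512;    /* cpufreq currently caps the core at fmax/2    */

    /* scale_cpu_capacity(): (1024 * 512) >> 10 == 512 */
    unsigned long cap = cpu_scale * max_freq_scale >> SCHED_CAPACITY_SHIFT;

update_cpu_capacity() seeds cpu_scale from the highest capacity state in the energy model, so the product reflects both the core type and the current frequency cap.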


@ -94,6 +94,7 @@ SECTIONS
*(.discard)
*(.discard.*)
*(.interp .dynamic)
*(.dynsym .dynstr .hash)
}
. = KIMAGE_VADDR + TEXT_OFFSET;
@ -120,12 +121,13 @@ SECTIONS
}
. = ALIGN(SEGMENT_ALIGN);
RO_DATA(PAGE_SIZE) /* everything from this point to */
EXCEPTION_TABLE(8) /* _etext will be marked RO NX */
_etext = .; /* End of text section */
RO_DATA(PAGE_SIZE) /* everything from this point to */
EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */
NOTES
. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text and rodata section */
__init_begin = .;
INIT_TEXT_SECTION(8)
@ -159,19 +161,9 @@ SECTIONS
.rela : ALIGN(8) {
*(.rela .rela*)
}
.dynsym : ALIGN(8) {
*(.dynsym)
}
.dynstr : {
*(.dynstr)
}
.hash : {
*(.hash)
}
__rela_offset = ADDR(.rela) - KIMAGE_VADDR;
__rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR);
__rela_size = SIZEOF(.rela);
__dynsym_offset = ADDR(.dynsym) - KIMAGE_VADDR;
. = ALIGN(SEGMENT_ALIGN);
__init_end = .;


@ -64,7 +64,7 @@ __do_hyp_init:
mrs x4, tcr_el1
ldr x5, =TCR_EL2_MASK
and x4, x4, x5
ldr x5, =TCR_EL2_FLAGS
mov x5, #TCR_EL2_RES1
orr x4, x4, x5
#ifndef CONFIG_ARM64_VA_BITS_48
@ -85,15 +85,18 @@ __do_hyp_init:
ldr_l x5, idmap_t0sz
bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
#endif
msr tcr_el2, x4
ldr x4, =VTCR_EL2_FLAGS
/*
* Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
* VTCR_EL2.
* TCR_EL2 and VTCR_EL2.
*/
mrs x5, ID_AA64MMFR0_EL1
bfi x4, x5, #16, #3
msr tcr_el2, x4
ldr x4, =VTCR_EL2_FLAGS
bfi x4, x5, #16, #3
msr vtcr_el2, x4
mrs x4, mair_el1


@ -66,7 +66,7 @@
.endm
end .req x5
ENTRY(__copy_from_user)
ENTRY(__arch_copy_from_user)
ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
add end, x0, x2
@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
mov x0, #0 // Nothing to copy
ret
ENDPROC(__copy_from_user)
ENDPROC(__arch_copy_from_user)
.section .fixup,"ax"
.align 2


@ -65,7 +65,7 @@
.endm
end .req x5
ENTRY(__copy_to_user)
ENTRY(__arch_copy_to_user)
ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
add end, x0, x2
@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
mov x0, #0
ret
ENDPROC(__copy_to_user)
ENDPROC(__arch_copy_to_user)
.section .fixup,"ax"
.align 2


@ -387,8 +387,8 @@ void __init mem_init(void)
MLM(MODULES_VADDR, MODULES_END),
MLG(VMALLOC_START, VMALLOC_END),
MLK_ROUNDUP(__init_begin, __init_end),
MLK_ROUNDUP(_text, __start_rodata),
MLK_ROUNDUP(__start_rodata, _etext),
MLK_ROUNDUP(_text, _etext),
MLK_ROUNDUP(__start_rodata, __init_begin),
MLK_ROUNDUP(_sdata, _edata),
#ifdef CONFIG_SPARSEMEM_VMEMMAP
MLG(VMEMMAP_START,


@ -392,14 +392,14 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt,
static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end)
{
unsigned long kernel_start = __pa(_text);
unsigned long kernel_end = __pa(_etext);
unsigned long kernel_end = __pa(__init_begin);
/*
* Take care not to create a writable alias for the
* read-only text and rodata sections of the kernel image.
*/
/* No overlap with the kernel text */
/* No overlap with the kernel text/rodata */
if (end < kernel_start || start >= kernel_end) {
__create_pgd_mapping(pgd, start, __phys_to_virt(start),
end - start, PAGE_KERNEL,
@ -408,7 +408,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
}
/*
* This block overlaps the kernel text mapping.
* This block overlaps the kernel text/rodata mappings.
* Map the portion(s) which don't overlap.
*/
if (start < kernel_start)
@ -423,7 +423,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
early_pgtable_alloc);
/*
* Map the linear alias of the [_text, _etext) interval as
* Map the linear alias of the [_text, __init_begin) interval as
* read-only/non-executable. This makes the contents of the
* region accessible to subsystems such as hibernate, but
* protects it from inadvertent modification or execution.
@ -453,14 +453,14 @@ void mark_rodata_ro(void)
{
unsigned long section_size;
section_size = (unsigned long)__start_rodata - (unsigned long)_text;
section_size = (unsigned long)_etext - (unsigned long)_text;
create_mapping_late(__pa(_text), (unsigned long)_text,
section_size, PAGE_KERNEL_ROX);
/*
* mark .rodata as read only. Use _etext rather than __end_rodata to
* cover NOTES and EXCEPTION_TABLE.
* mark .rodata as read only. Use __init_begin rather than __end_rodata
* to cover NOTES and EXCEPTION_TABLE.
*/
section_size = (unsigned long)_etext - (unsigned long)__start_rodata;
section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
}
@ -503,8 +503,8 @@ static void __init map_kernel(pgd_t *pgd)
{
static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;
map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text);
map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata);
map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text);
map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata);
map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC,
&vmlinux_init);
map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
@ -785,9 +785,9 @@ void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
/*
* Check whether the physical FDT address is set and meets the minimum
* alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
* at least 8 bytes so that we can always access the size field of the
* FDT header after mapping the first chunk, double check here if that
* is indeed the case.
* at least 8 bytes so that we can always access the magic and size
* fields of the FDT header after mapping the first chunk, double check
* here if that is indeed the case.
*/
BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
@ -815,7 +815,7 @@ void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
if (fdt_check_header(dt_virt) != 0)
if (fdt_magic(dt_virt) != FDT_MAGIC)
return NULL;
*size = fdt_totalsize(dt_virt);


@ -25,6 +25,8 @@
#include <asm/hwcap.h>
#include <asm/pgtable-hwdef.h>
#include <asm/pgtable.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
#include "proc-macros.S"
@ -183,7 +185,17 @@ ENTRY(cpu_do_switch_mm)
bfi x0, x1, #48, #16 // set the ASID
msr ttbr0_el1, x0 // set TTBR0
isb
alternative_if_not ARM64_WORKAROUND_CAVIUM_27456
ret
nop
nop
nop
alternative_else
ic iallu
dsb nsh
isb
ret
alternative_endif
ENDPROC(cpu_do_switch_mm)
.pushsection ".idmap.text", "ax"
@ -228,6 +240,8 @@ ENTRY(__cpu_setup)
msr cpacr_el1, x0 // Enable FP/ASIMD
mov x0, #1 << 12 // Reset mdscr_el1 and disable
msr mdscr_el1, x0 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
reset_pmuserenr_el0 x0 // Disable PMU access from EL0
/*
* Memory region attributes for LPAE:


@ -53,6 +53,7 @@ config IA64
select MODULES_USE_ELF_RELA
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HARDENED_USERCOPY
default y
help
The Itanium Processor Family is Intel's 64-bit successor to


@ -241,12 +241,18 @@ extern unsigned long __must_check __copy_user (void __user *to, const void __use
static inline unsigned long
__copy_to_user (void __user *to, const void *from, unsigned long count)
{
if (!__builtin_constant_p(count))
check_object_size(from, count, true);
return __copy_user(to, (__force void __user *) from, count);
}
static inline unsigned long
__copy_from_user (void *to, const void __user *from, unsigned long count)
{
if (!__builtin_constant_p(count))
check_object_size(to, count, false);
return __copy_user((__force void __user *) to, from, count);
}
@ -258,8 +264,11 @@ __copy_from_user (void *to, const void __user *from, unsigned long count)
const void *__cu_from = (from); \
long __cu_len = (n); \
\
if (__access_ok(__cu_to, __cu_len, get_fs())) \
__cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \
if (__access_ok(__cu_to, __cu_len, get_fs())) { \
if (!__builtin_constant_p(n)) \
check_object_size(__cu_from, __cu_len, true); \
__cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \
} \
__cu_len; \
})
@ -270,8 +279,11 @@ __copy_from_user (void *to, const void __user *from, unsigned long count)
long __cu_len = (n); \
\
__chk_user_ptr(__cu_from); \
if (__access_ok(__cu_from, __cu_len, get_fs())) \
if (__access_ok(__cu_from, __cu_len, get_fs())) { \
if (!__builtin_constant_p(n)) \
check_object_size(__cu_to, __cu_len, false); \
__cu_len = __copy_user((__force void __user *) __cu_to, __cu_from, __cu_len); \
} \
__cu_len; \
})
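
Every architecture that gains HAVE_ARCH_HARDENED_USERCOPY in this series adds the same hook: for sizes that are not compile-time constants, hand the kernel-side object to check_object_size() before invoking the raw copy primitive. A generic sketch of the pattern (copy_user_raw() is a placeholder, not a real function in any of these trees):

    static inline unsigned long
    copy_to_user_checked(void __user *to, const void *from, unsigned long n)
    {
            if (!__builtin_constant_p(n))             /* constant sizes are checked at compile time */
                    check_object_size(from, n, true); /* true = kernel memory copied out to user    */
            return copy_user_raw(to, from, n);        /* stand-in for the arch copy primitive       */
    }

check_object_size() is the hardened-usercopy entry point that rejects copies spanning slab objects, overlapping kernel text, or crossing stack frames.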


@ -61,7 +61,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \
" CMPT %0, #HI(0x02000000)\n" \
" BNZ 1b\n" \
: "=&d" (temp), "=&da" (result) \
: "da" (&v->counter), "bd" (i) \
: "da" (&v->counter), "br" (i) \
: "cc"); \
\
smp_mb(); \


@ -73,7 +73,7 @@ static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
" DCACHE [%2], %0\n"
#endif
"2:\n"
: "=&d" (temp), "=&da" (retval)
: "=&d" (temp), "=&d" (retval)
: "da" (m), "bd" (old), "da" (new)
: "cc"
);


@ -23,7 +23,7 @@ static struct clocksource clocksource_mips = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static u64 notrace r4k_read_sched_clock(void)
static u64 __maybe_unused notrace r4k_read_sched_clock(void)
{
return read_c0_count();
}
@ -82,7 +82,9 @@ int __init init_r4k_clocksource(void)
clocksource_register_hz(&clocksource_mips, mips_hpt_frequency);
#ifndef CONFIG_CPU_FREQ
sched_clock_register(r4k_read_sched_clock, 32, mips_hpt_frequency);
#endif
return 0;
}


@ -344,7 +344,7 @@ EXPORT(sysn32_call_table)
PTR sys_ni_syscall /* available, was setaltroot */
PTR sys_add_key
PTR sys_request_key
PTR sys_keyctl /* 6245 */
PTR compat_sys_keyctl /* 6245 */
PTR sys_set_thread_area
PTR sys_inotify_init
PTR sys_inotify_add_watch


@ -500,7 +500,7 @@ EXPORT(sys32_call_table)
PTR sys_ni_syscall /* available, was setaltroot */
PTR sys_add_key /* 4280 */
PTR sys_request_key
PTR sys_keyctl
PTR compat_sys_keyctl
PTR sys_set_thread_area
PTR sys_inotify_init
PTR sys_inotify_add_watch /* 4285 */


@ -1629,8 +1629,14 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
preempt_disable();
if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
if (kvm_mips_host_tlb_lookup(vcpu, va) < 0)
kvm_mips_handle_kseg0_tlb_fault(va, vcpu);
if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 &&
kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) {
kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n",
__func__, va, vcpu, read_c0_entryhi());
er = EMULATE_FAIL;
preempt_enable();
goto done;
}
} else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) ||
KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
int index;
@ -1665,14 +1671,19 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
run, vcpu);
preempt_enable();
goto dont_update_pc;
} else {
/*
* We fault an entry from the guest tlb to the
* shadow host TLB
*/
kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
NULL,
NULL);
}
/*
* We fault an entry from the guest tlb to the
* shadow host TLB
*/
if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
NULL, NULL)) {
kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
__func__, va, index, vcpu,
read_c0_entryhi());
er = EMULATE_FAIL;
preempt_enable();
goto done;
}
}
} else {
@ -2633,8 +2644,13 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
* OK we have a Guest TLB entry, now inject it into the
* shadow host TLB
*/
kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
NULL);
if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
NULL, NULL)) {
kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
__func__, va, index, vcpu,
read_c0_entryhi());
er = EMULATE_FAIL;
}
}
}


@ -276,7 +276,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
}
gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
if (gfn >= kvm->arch.guest_pmap_npages) {
if ((gfn | 1) >= kvm->arch.guest_pmap_npages) {
kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
gfn, badvaddr);
kvm_mips_dump_host_tlbs();
@ -361,25 +361,39 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
struct kvm *kvm = vcpu->kvm;
pfn_t pfn0, pfn1;
gfn_t gfn0, gfn1;
long tlb_lo[2];
if ((tlb->tlb_hi & VPN2_MASK) == 0) {
pfn0 = 0;
pfn1 = 0;
} else {
if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
>> PAGE_SHIFT) < 0)
return -1;
tlb_lo[0] = tlb->tlb_lo0;
tlb_lo[1] = tlb->tlb_lo1;
if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
>> PAGE_SHIFT) < 0)
return -1;
/*
* The commpage address must not be mapped to anything else if the guest
* TLB contains entries nearby, or commpage accesses will break.
*/
if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) &
VPN2_MASK & (PAGE_MASK << 1)))
tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
>> PAGE_SHIFT];
pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
>> PAGE_SHIFT];
gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT;
gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT;
if (gfn0 >= kvm->arch.guest_pmap_npages ||
gfn1 >= kvm->arch.guest_pmap_npages) {
kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n",
__func__, gfn0, gfn1, tlb->tlb_hi);
kvm_mips_dump_guest_tlbs(vcpu);
return -1;
}
if (kvm_mips_map_page(kvm, gfn0) < 0)
return -1;
if (kvm_mips_map_page(kvm, gfn1) < 0)
return -1;
pfn0 = kvm->arch.guest_pmap[gfn0];
pfn1 = kvm->arch.guest_pmap[gfn1];
if (hpa0)
*hpa0 = pfn0 << PAGE_SHIFT;
@ -391,9 +405,9 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
kvm_mips_get_kernel_asid(vcpu) :
kvm_mips_get_user_asid(vcpu));
entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
(tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
(tlb_lo[0] & MIPS3_PG_D) | (tlb_lo[0] & MIPS3_PG_V);
entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
(tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
(tlb_lo[1] & MIPS3_PG_D) | (tlb_lo[1] & MIPS3_PG_V);
kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
tlb->tlb_lo0, tlb->tlb_lo1);
@ -794,10 +808,16 @@ uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu)
local_irq_restore(flags);
return KVM_INVALID_INST;
}
kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
&vcpu->arch.
guest_tlb[index],
NULL, NULL);
if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
&vcpu->arch.guest_tlb[index],
NULL, NULL)) {
kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n",
__func__, opc, index, vcpu,
read_c0_entryhi());
kvm_mips_dump_guest_tlbs(vcpu);
local_irq_restore(flags);
return KVM_INVALID_INST;
}
inst = *(opc);
}
local_irq_restore(flags);
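
The recurring fix in this file is that a gfn derived from a guest TLB entry must be validated against guest_pmap_npages before it indexes guest_pmap[], and every caller now has to handle the fault helper failing. A condensed sketch of that check (the helper name is made up for illustration):

    static int guest_gfn_to_pfn(struct kvm *kvm, gfn_t gfn, pfn_t *pfn)
    {
            if (gfn >= kvm->arch.guest_pmap_npages)  /* TLB entry points outside guest RAM */
                    return -1;
            if (kvm_mips_map_page(kvm, gfn) < 0)     /* demand-map the backing page        */
                    return -1;
            *pfn = kvm->arch.guest_pmap[gfn];
            return 0;
    }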


@ -13,8 +13,8 @@
#define SMBUS_PCI_REG64 0x64
#define SMBUS_PCI_REGB4 0xb4
#define HPET_MIN_CYCLES 64
#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
#define HPET_MIN_CYCLES 16
#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES * 12)
static DEFINE_SPINLOCK(hpet_lock);
DEFINE_PER_CPU(struct clock_event_device, hpet_clockevent_device);
@ -157,14 +157,14 @@ static int hpet_tick_resume(struct clock_event_device *evt)
static int hpet_next_event(unsigned long delta,
struct clock_event_device *evt)
{
unsigned int cnt;
int res;
u32 cnt;
s32 res;
cnt = hpet_read(HPET_COUNTER);
cnt += delta;
cnt += (u32) delta;
hpet_write(HPET_T0_CMP, cnt);
res = (int)(cnt - hpet_read(HPET_COUNTER));
res = (s32)(cnt - hpet_read(HPET_COUNTER));
return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
@ -230,7 +230,7 @@ void __init setup_hpet_timer(void)
cd = &per_cpu(hpet_clockevent_device, cpu);
cd->name = "hpet";
cd->rating = 320;
cd->rating = 100;
cd->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
cd->set_state_shutdown = hpet_set_state_shutdown;
cd->set_state_periodic = hpet_set_state_periodic;
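
The constant change is easiest to read as arithmetic: the -ETIME threshold drops from 64 to 16 cycles, while the minimum delta advertised to the clockevents core grows from 96 to 192 cycles, and the counter math becomes explicitly 32-bit so the comparison survives wrap-around. Restating hpet_next_event() from the hunk with the widths spelled out:

    /* old: HPET_MIN_CYCLES = 64  ->  HPET_MIN_PROG_DELTA = 64 + 64/2 =  96 cycles */
    /* new: HPET_MIN_CYCLES = 16  ->  HPET_MIN_PROG_DELTA = 16 * 12   = 192 cycles */

    u32 cnt = hpet_read(HPET_COUNTER) + (u32)delta;  /* target compare value            */
    hpet_write(HPET_T0_CMP, cnt);
    s32 res = (s32)(cnt - hpet_read(HPET_COUNTER));  /* signed difference handles wrap  */
    return res < HPET_MIN_CYCLES ? -ETIME : 0;       /* target already passed? let the  */
                                                     /* clockevents core retry          */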


@ -65,7 +65,7 @@ static struct insn insn_table[] = {
#ifndef CONFIG_CPU_MIPSR6
{ insn_cache, M(cache_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
#else
{ insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 },
{ insn_cache, M6(spec3_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 },
#endif
{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },


@ -97,10 +97,10 @@
#define ENOTCONN 235 /* Transport endpoint is not connected */
#define ESHUTDOWN 236 /* Cannot send after transport endpoint shutdown */
#define ETOOMANYREFS 237 /* Too many references: cannot splice */
#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */
#define ETIMEDOUT 238 /* Connection timed out */
#define ECONNREFUSED 239 /* Connection refused */
#define EREMOTERELEASE 240 /* Remote peer released connection */
#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */
#define EREMOTERELEASE 240 /* Remote peer released connection */
#define EHOSTDOWN 241 /* Host is down */
#define EHOSTUNREACH 242 /* No route to host */


@ -160,6 +160,7 @@ config PPC
select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_HARDENED_USERCOPY
config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN


@ -164,6 +164,7 @@ struct coprocessor_request_block {
#define ICSWX_INITIATED (0x8)
#define ICSWX_BUSY (0x4)
#define ICSWX_REJECTED (0x2)
#define ICSWX_XERS0 (0x1) /* undefined or set from XERSO. */
static inline int icswx(__be32 ccw, struct coprocessor_request_block *crb)
{


@ -325,10 +325,15 @@ static inline unsigned long copy_from_user(void *to,
{
unsigned long over;
if (access_ok(VERIFY_READ, from, n))
if (access_ok(VERIFY_READ, from, n)) {
if (!__builtin_constant_p(n))
check_object_size(to, n, false);
return __copy_tofrom_user((__force void __user *)to, from, n);
}
if ((unsigned long)from < TASK_SIZE) {
over = (unsigned long)from + n - TASK_SIZE;
if (!__builtin_constant_p(n - over))
check_object_size(to, n - over, false);
return __copy_tofrom_user((__force void __user *)to, from,
n - over) + over;
}
@ -340,10 +345,15 @@ static inline unsigned long copy_to_user(void __user *to,
{
unsigned long over;
if (access_ok(VERIFY_WRITE, to, n))
if (access_ok(VERIFY_WRITE, to, n)) {
if (!__builtin_constant_p(n))
check_object_size(from, n, true);
return __copy_tofrom_user(to, (__force void __user *)from, n);
}
if ((unsigned long)to < TASK_SIZE) {
over = (unsigned long)to + n - TASK_SIZE;
if (!__builtin_constant_p(n))
check_object_size(from, n - over, true);
return __copy_tofrom_user(to, (__force void __user *)from,
n - over) + over;
}
@ -387,6 +397,10 @@ static inline unsigned long __copy_from_user_inatomic(void *to,
if (ret == 0)
return 0;
}
if (!__builtin_constant_p(n))
check_object_size(to, n, false);
return __copy_tofrom_user((__force void __user *)to, from, n);
}
@ -413,6 +427,9 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
if (ret == 0)
return 0;
}
if (!__builtin_constant_p(n))
check_object_size(from, n, true);
return __copy_tofrom_user(to, (__force const void __user *)from, n);
}


@ -677,7 +677,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
/* Check if the request is finished successfully */
if (active_flag) {
rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
if (rc <= 0)
if (rc < 0)
return rc;
if (rc & active_flag)


@ -110,17 +110,11 @@ _GLOBAL(tm_reclaim)
std r3, STK_PARAM(R3)(r1)
SAVE_NVGPRS(r1)
/* We need to setup MSR for VSX register save instructions. Here we
* also clear the MSR RI since when we do the treclaim, we won't have a
* valid kernel pointer for a while. We clear RI here as it avoids
* adding another mtmsr closer to the treclaim. This makes the region
marked as non-recoverable wider than it needs to be, but it saves on
* inserting another mtmsrd later.
*/
/* We need to setup MSR for VSX register save instructions. */
mfmsr r14
mr r15, r14
ori r15, r15, MSR_FP
li r16, MSR_RI
li r16, 0
ori r16, r16, MSR_EE /* IRQs hard off */
andc r15, r15, r16
oris r15, r15, MSR_VEC@h
@ -176,7 +170,17 @@ dont_backup_fp:
1: tdeqi r6, 0
EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
/* The moment we treclaim, ALL of our GPRs will switch
/* Clear MSR RI since we are about to change r1, EE is already off. */
li r4, 0
mtmsrd r4, 1
/*
* BE CAREFUL HERE:
* At this point we can't take an SLB miss since we have MSR_RI
* off. Load only to/from the stack/paca which are in SLB bolted regions
* until we turn MSR RI back on.
*
* The moment we treclaim, ALL of our GPRs will switch
* to user register state. (FPRs, CCR etc. also!)
* Use an sprg and a tm_scratch in the PACA to shuffle.
*/
@ -197,6 +201,11 @@ dont_backup_fp:
/* Store the PPR in r11 and reset to decent value */
std r11, GPR11(r1) /* Temporary stash */
/* Reset MSR RI so we can take SLB faults again */
li r11, MSR_RI
mtmsrd r11, 1
mfspr r11, SPRN_PPR
HMT_MEDIUM
@ -397,11 +406,6 @@ restore_gprs:
ld r5, THREAD_TM_DSCR(r3)
ld r6, THREAD_TM_PPR(r3)
/* Clear the MSR RI since we are about to change R1. EE is already off
*/
li r4, 0
mtmsrd r4, 1
REST_GPR(0, r7) /* GPR0 */
REST_2GPRS(2, r7) /* GPR2-3 */
REST_GPR(4, r7) /* GPR4 */
@ -439,10 +443,33 @@ restore_gprs:
ld r6, _CCR(r7)
mtcr r6
REST_GPR(1, r7) /* GPR1 */
REST_GPR(5, r7) /* GPR5-7 */
REST_GPR(6, r7)
ld r7, GPR7(r7)
/*
* Store r1 and r5 on the stack so that we can access them
* after we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
ld r5, GPR1(r7)
std r5, -16(r1)
REST_GPR(7, r7)
/* Clear MSR RI since we are about to change r1. EE is already off */
li r5, 0
mtmsrd r5, 1
/*
* BE CAREFUL HERE:
* At this point we can't take an SLB miss since we have MSR_RI
* off. Load only to/from the stack/paca which are in SLB bolted regions
* until we turn MSR RI back on.
*/
ld r5, -8(r1)
ld r1, -16(r1)
/* Commit register state as checkpointed state: */
TRECHKPT


@ -655,112 +655,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
b skip_tm
END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/* Turn on TM/FP/VSX/VMX so we can restore them. */
mfmsr r5
li r6, MSR_TM >> 32
sldi r6, r6, 32
or r5, r5, r6
ori r5, r5, MSR_FP
oris r5, r5, (MSR_VEC | MSR_VSX)@h
mtmsrd r5
/*
* The user may change these outside of a transaction, so they must
* always be context switched.
*/
ld r5, VCPU_TFHAR(r4)
ld r6, VCPU_TFIAR(r4)
ld r7, VCPU_TEXASR(r4)
mtspr SPRN_TFHAR, r5
mtspr SPRN_TFIAR, r6
mtspr SPRN_TEXASR, r7
ld r5, VCPU_MSR(r4)
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
beq skip_tm /* TM not active in guest */
/* Make sure the failure summary is set, otherwise we'll program check
* when we trechkpt. It's possible that this might have been not set
* on a kvmppc_set_one_reg() call but we shouldn't let this crash the
* host.
*/
oris r7, r7, (TEXASR_FS)@h
mtspr SPRN_TEXASR, r7
/*
* We need to load up the checkpointed state for the guest.
* We need to do this early as it will blow away any GPRs, VSRs and
* some SPRs.
*/
mr r31, r4
addi r3, r31, VCPU_FPRS_TM
bl load_fp_state
addi r3, r31, VCPU_VRS_TM
bl load_vr_state
mr r4, r31
lwz r7, VCPU_VRSAVE_TM(r4)
mtspr SPRN_VRSAVE, r7
ld r5, VCPU_LR_TM(r4)
lwz r6, VCPU_CR_TM(r4)
ld r7, VCPU_CTR_TM(r4)
ld r8, VCPU_AMR_TM(r4)
ld r9, VCPU_TAR_TM(r4)
mtlr r5
mtcr r6
mtctr r7
mtspr SPRN_AMR, r8
mtspr SPRN_TAR, r9
/*
* Load up PPR and DSCR values but don't put them in the actual SPRs
* till the last moment to avoid running with userspace PPR and DSCR for
* too long.
*/
ld r29, VCPU_DSCR_TM(r4)
ld r30, VCPU_PPR_TM(r4)
std r2, PACATMSCRATCH(r13) /* Save TOC */
/* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0
mtmsrd r5, 1
/* Load GPRs r0-r28 */
reg = 0
.rept 29
ld reg, VCPU_GPRS_TM(reg)(r31)
reg = reg + 1
.endr
mtspr SPRN_DSCR, r29
mtspr SPRN_PPR, r30
/* Load final GPRs */
ld 29, VCPU_GPRS_TM(29)(r31)
ld 30, VCPU_GPRS_TM(30)(r31)
ld 31, VCPU_GPRS_TM(31)(r31)
/* TM checkpointed state is now setup. All GPRs are now volatile. */
TRECHKPT
/* Now let's get back the state we need. */
HMT_MEDIUM
GET_PACA(r13)
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
ld r4, HSTATE_KVM_VCPU(r13)
ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATMSCRATCH(r13)
/* Set the MSR RI since we have our registers back. */
li r5, MSR_RI
mtmsrd r5, 1
skip_tm:
bl kvmppc_restore_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* Load guest PMU registers */
@ -841,12 +737,6 @@ BEGIN_FTR_SECTION
/* Skip next section on POWER7 */
b 8f
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
mfmsr r8
li r0, 1
rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
mtmsrd r8
/* Load up POWER8-specific registers */
ld r5, VCPU_IAMR(r4)
lwz r6, VCPU_PSPB(r4)
@ -1436,106 +1326,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
b 2f
END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/* Turn on TM. */
mfmsr r8
li r0, 1
rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
mtmsrd r8
ld r5, VCPU_MSR(r9)
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
beq 1f /* TM not active in guest. */
li r3, TM_CAUSE_KVM_RESCHED
/* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0
mtmsrd r5, 1
/* All GPRs are volatile at this point. */
TRECLAIM(R3)
/* Temporarily store r13 and r9 so we have some regs to play with */
SET_SCRATCH0(r13)
GET_PACA(r13)
std r9, PACATMSCRATCH(r13)
ld r9, HSTATE_KVM_VCPU(r13)
/* Get a few more GPRs free. */
std r29, VCPU_GPRS_TM(29)(r9)
std r30, VCPU_GPRS_TM(30)(r9)
std r31, VCPU_GPRS_TM(31)(r9)
/* Save away PPR and DSCR soon so don't run with user values. */
mfspr r31, SPRN_PPR
HMT_MEDIUM
mfspr r30, SPRN_DSCR
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
/* Save all but r9, r13 & r29-r31 */
reg = 0
.rept 29
.if (reg != 9) && (reg != 13)
std reg, VCPU_GPRS_TM(reg)(r9)
.endif
reg = reg + 1
.endr
/* ... now save r13 */
GET_SCRATCH0(r4)
std r4, VCPU_GPRS_TM(13)(r9)
/* ... and save r9 */
ld r4, PACATMSCRATCH(r13)
std r4, VCPU_GPRS_TM(9)(r9)
/* Reload stack pointer and TOC. */
ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATOC(r13)
/* Set MSR RI now we have r1 and r13 back. */
li r5, MSR_RI
mtmsrd r5, 1
/* Save away checkpointed SPRs. */
std r31, VCPU_PPR_TM(r9)
std r30, VCPU_DSCR_TM(r9)
mflr r5
mfcr r6
mfctr r7
mfspr r8, SPRN_AMR
mfspr r10, SPRN_TAR
std r5, VCPU_LR_TM(r9)
stw r6, VCPU_CR_TM(r9)
std r7, VCPU_CTR_TM(r9)
std r8, VCPU_AMR_TM(r9)
std r10, VCPU_TAR_TM(r9)
/* Restore r12 as trap number. */
lwz r12, VCPU_TRAP(r9)
/* Save FP/VSX. */
addi r3, r9, VCPU_FPRS_TM
bl store_fp_state
addi r3, r9, VCPU_VRS_TM
bl store_vr_state
mfspr r6, SPRN_VRSAVE
stw r6, VCPU_VRSAVE_TM(r9)
1:
/*
* We need to save these SPRs after the treclaim so that the software
* error code is recorded correctly in the TEXASR. Also the user may
* change these outside of a transaction, so they must always be
* context switched.
*/
mfspr r5, SPRN_TFHAR
mfspr r6, SPRN_TFIAR
mfspr r7, SPRN_TEXASR
std r5, VCPU_TFHAR(r9)
std r6, VCPU_TFIAR(r9)
std r7, VCPU_TEXASR(r9)
2:
bl kvmppc_save_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* Increment yield count if they have a VPA */
@ -2245,6 +2037,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
/* save FP state */
bl kvmppc_save_fp
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
ld r9, HSTATE_KVM_VCPU(r13)
bl kvmppc_save_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/*
* Set DEC to the smaller of DEC and HDEC, so that we wake
* no later than the end of our timeslice (HDEC interrupts
@ -2321,6 +2120,12 @@ kvm_end_cede:
bl kvmhv_accumulate_time
#endif
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
bl kvmppc_restore_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* load up FP state */
bl kvmppc_load_fp
@ -2629,6 +2434,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
mr r4,r31
blr
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Save transactional state and TM-related registers.
* Called with r9 pointing to the vcpu struct.
* This can modify all checkpointed registers, but
* restores r1, r2 and r9 (vcpu pointer) before exit.
*/
kvmppc_save_tm:
mflr r0
std r0, PPC_LR_STKOFF(r1)
/* Turn on TM. */
mfmsr r8
li r0, 1
rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
mtmsrd r8
ld r5, VCPU_MSR(r9)
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
beq 1f /* TM not active in guest. */
std r1, HSTATE_HOST_R1(r13)
li r3, TM_CAUSE_KVM_RESCHED
/* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0
mtmsrd r5, 1
/* All GPRs are volatile at this point. */
TRECLAIM(R3)
/* Temporarily store r13 and r9 so we have some regs to play with */
SET_SCRATCH0(r13)
GET_PACA(r13)
std r9, PACATMSCRATCH(r13)
ld r9, HSTATE_KVM_VCPU(r13)
/* Get a few more GPRs free. */
std r29, VCPU_GPRS_TM(29)(r9)
std r30, VCPU_GPRS_TM(30)(r9)
std r31, VCPU_GPRS_TM(31)(r9)
/* Save away PPR and DSCR soon so don't run with user values. */
mfspr r31, SPRN_PPR
HMT_MEDIUM
mfspr r30, SPRN_DSCR
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
/* Save all but r9, r13 & r29-r31 */
reg = 0
.rept 29
.if (reg != 9) && (reg != 13)
std reg, VCPU_GPRS_TM(reg)(r9)
.endif
reg = reg + 1
.endr
/* ... now save r13 */
GET_SCRATCH0(r4)
std r4, VCPU_GPRS_TM(13)(r9)
/* ... and save r9 */
ld r4, PACATMSCRATCH(r13)
std r4, VCPU_GPRS_TM(9)(r9)
/* Reload stack pointer and TOC. */
ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATOC(r13)
/* Set MSR RI now we have r1 and r13 back. */
li r5, MSR_RI
mtmsrd r5, 1
/* Save away checkpointed SPRs. */
std r31, VCPU_PPR_TM(r9)
std r30, VCPU_DSCR_TM(r9)
mflr r5
mfcr r6
mfctr r7
mfspr r8, SPRN_AMR
mfspr r10, SPRN_TAR
std r5, VCPU_LR_TM(r9)
stw r6, VCPU_CR_TM(r9)
std r7, VCPU_CTR_TM(r9)
std r8, VCPU_AMR_TM(r9)
std r10, VCPU_TAR_TM(r9)
/* Restore r12 as trap number. */
lwz r12, VCPU_TRAP(r9)
/* Save FP/VSX. */
addi r3, r9, VCPU_FPRS_TM
bl store_fp_state
addi r3, r9, VCPU_VRS_TM
bl store_vr_state
mfspr r6, SPRN_VRSAVE
stw r6, VCPU_VRSAVE_TM(r9)
1:
/*
* We need to save these SPRs after the treclaim so that the software
* error code is recorded correctly in the TEXASR. Also the user may
* change these outside of a transaction, so they must always be
* context switched.
*/
mfspr r5, SPRN_TFHAR
mfspr r6, SPRN_TFIAR
mfspr r7, SPRN_TEXASR
std r5, VCPU_TFHAR(r9)
std r6, VCPU_TFIAR(r9)
std r7, VCPU_TEXASR(r9)
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
/*
* Restore transactional state and TM-related registers.
* Called with r4 pointing to the vcpu struct.
* This potentially modifies all checkpointed registers.
* It restores r1, r2, r4 from the PACA.
*/
kvmppc_restore_tm:
mflr r0
std r0, PPC_LR_STKOFF(r1)
/* Turn on TM/FP/VSX/VMX so we can restore them. */
mfmsr r5
li r6, MSR_TM >> 32
sldi r6, r6, 32
or r5, r5, r6
ori r5, r5, MSR_FP
oris r5, r5, (MSR_VEC | MSR_VSX)@h
mtmsrd r5
/*
* The user may change these outside of a transaction, so they must
* always be context switched.
*/
ld r5, VCPU_TFHAR(r4)
ld r6, VCPU_TFIAR(r4)
ld r7, VCPU_TEXASR(r4)
mtspr SPRN_TFHAR, r5
mtspr SPRN_TFIAR, r6
mtspr SPRN_TEXASR, r7
ld r5, VCPU_MSR(r4)
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
beqlr /* TM not active in guest */
std r1, HSTATE_HOST_R1(r13)
/* Make sure the failure summary is set, otherwise we'll program check
* when we trechkpt. It's possible that this might have been not set
* on a kvmppc_set_one_reg() call but we shouldn't let this crash the
* host.
*/
oris r7, r7, (TEXASR_FS)@h
mtspr SPRN_TEXASR, r7
/*
* We need to load up the checkpointed state for the guest.
* We need to do this early as it will blow away any GPRs, VSRs and
* some SPRs.
*/
mr r31, r4
addi r3, r31, VCPU_FPRS_TM
bl load_fp_state
addi r3, r31, VCPU_VRS_TM
bl load_vr_state
mr r4, r31
lwz r7, VCPU_VRSAVE_TM(r4)
mtspr SPRN_VRSAVE, r7
ld r5, VCPU_LR_TM(r4)
lwz r6, VCPU_CR_TM(r4)
ld r7, VCPU_CTR_TM(r4)
ld r8, VCPU_AMR_TM(r4)
ld r9, VCPU_TAR_TM(r4)
mtlr r5
mtcr r6
mtctr r7
mtspr SPRN_AMR, r8
mtspr SPRN_TAR, r9
/*
* Load up PPR and DSCR values but don't put them in the actual SPRs
* till the last moment to avoid running with userspace PPR and DSCR for
* too long.
*/
ld r29, VCPU_DSCR_TM(r4)
ld r30, VCPU_PPR_TM(r4)
std r2, PACATMSCRATCH(r13) /* Save TOC */
/* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0
mtmsrd r5, 1
/* Load GPRs r0-r28 */
reg = 0
.rept 29
ld reg, VCPU_GPRS_TM(reg)(r31)
reg = reg + 1
.endr
mtspr SPRN_DSCR, r29
mtspr SPRN_PPR, r30
/* Load final GPRs */
ld 29, VCPU_GPRS_TM(29)(r31)
ld 30, VCPU_GPRS_TM(30)(r31)
ld 31, VCPU_GPRS_TM(31)(r31)
/* TM checkpointed state is now setup. All GPRs are now volatile. */
TRECHKPT
/* Now let's get back the state we need. */
HMT_MEDIUM
GET_PACA(r13)
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
ld r4, HSTATE_KVM_VCPU(r13)
ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATMSCRATCH(r13)
/* Set the MSR RI since we have our registers back. */
li r5, MSR_RI
mtmsrd r5, 1
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
#endif
/*
* We come here if we get any exception or interrupt while we are
* executing host real mode code while in guest MMU context.


@ -117,6 +117,7 @@ config S390
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_EARLY_PFN_TO_NID
select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY


@ -669,11 +669,13 @@ static const struct file_operations prng_tdes_fops = {
static struct miscdevice prng_sha512_dev = {
.name = "prandom",
.minor = MISC_DYNAMIC_MINOR,
.mode = 0644,
.fops = &prng_sha512_fops,
};
static struct miscdevice prng_tdes_dev = {
.name = "prandom",
.minor = MISC_DYNAMIC_MINOR,
.mode = 0644,
.fops = &prng_tdes_fops,
};


@ -23,6 +23,8 @@ enum zpci_ioat_dtype {
#define ZPCI_IOTA_FS_2G 2
#define ZPCI_KEY (PAGE_DEFAULT_KEY << 5)
#define ZPCI_TABLE_SIZE_RT (1UL << 42)
#define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST)
#define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT)
#define ZPCI_IOTA_RSTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RS)


@ -2070,13 +2070,6 @@ void s390_reset_system(void (*fn_pre)(void),
S390_lowcore.program_new_psw.addr =
PSW_ADDR_AMODE | (unsigned long) s390_base_pgm_handler;
/*
* Clear subchannel ID and number to signal new kernel that no CCW or
* SCSI IPL has been done (for kexec and kdump)
*/
S390_lowcore.subchannel_id = 0;
S390_lowcore.subchannel_nr = 0;
/* Store status at absolute zero */
store_status();


@ -104,6 +104,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
{
check_object_size(to, n, false);
if (static_branch_likely(&have_mvcos))
return copy_from_user_mvcos(to, from, n);
return copy_from_user_mvcp(to, from, n);
@ -177,6 +178,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
{
check_object_size(from, n, true);
if (static_branch_likely(&have_mvcos))
return copy_to_user_mvcos(to, from, n);
return copy_to_user_mvcs(to, from, n);


@ -701,8 +701,7 @@ static int zpci_restore(struct device *dev)
goto out;
zpci_map_resources(pdev);
zpci_register_ioat(zdev, 0, zdev->start_dma + PAGE_OFFSET,
zdev->start_dma + zdev->iommu_size - 1,
zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
(u64) zdev->dma_table);
out:


@ -458,7 +458,19 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
goto out_clean;
}
zdev->iommu_size = (unsigned long) high_memory - PAGE_OFFSET;
/*
* Restrict the iommu bitmap size to the minimum of the following:
* - main memory size
* - 3-level pagetable address limit minus start_dma offset
* - DMA address range allowed by the hardware (clp query pci fn)
*
* Also set zdev->end_dma to the actual end address of the usable
* range, instead of the theoretical maximum as reported by hardware.
*/
zdev->iommu_size = min3((u64) high_memory,
ZPCI_TABLE_SIZE_RT - zdev->start_dma,
zdev->end_dma - zdev->start_dma + 1);
zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1;
zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT;
zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8);
if (!zdev->iommu_bitmap) {
@ -466,10 +478,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
goto out_reg;
}
rc = zpci_register_ioat(zdev,
0,
zdev->start_dma + PAGE_OFFSET,
zdev->start_dma + zdev->iommu_size - 1,
rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
(u64) zdev->dma_table);
if (rc)
goto out_reg;
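
A worked example of the new sizing, with made-up hardware values: 8 GiB of memory, start_dma at 4 GiB, and a CLP-reported end_dma of 2^44 - 1:

    /* Hypothetical numbers, only to illustrate the min3() above. */
    u64 start_dma = 4ULL << 30;
    u64 memory    = 8ULL << 30;                          /* (u64) high_memory            */
    u64 table_lim = ZPCI_TABLE_SIZE_RT - start_dma;      /* 3-level table: 2^42 - 4 GiB  */
    u64 hw_range  = ((1ULL << 44) - 1) - start_dma + 1;  /* end_dma - start_dma + 1      */

    u64 iommu_size = min3(memory, table_lim, hw_range);  /* 8 GiB is the smallest here   */
    u64 end_dma    = start_dma + iommu_size - 1;         /* usable end of the DMA range  */

Registering [start_dma, end_dma] directly, instead of the old PAGE_OFFSET-based bounds, keeps the IOAT window in line with the range the bitmap above was actually sized for.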


@ -43,6 +43,7 @@ config SPARC
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
select ARCH_HAS_SG_CHAIN
select HAVE_ARCH_HARDENED_USERCOPY
config SPARC32
def_bool !64BIT


@ -313,22 +313,28 @@ unsigned long __copy_user(void __user *to, const void __user *from, unsigned lon
static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (n && __access_ok((unsigned long) to, n))
if (n && __access_ok((unsigned long) to, n)) {
if (!__builtin_constant_p(n))
check_object_size(from, n, true);
return __copy_user(to, (__force void __user *) from, n);
else
} else
return n;
}
static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (!__builtin_constant_p(n))
check_object_size(from, n, true);
return __copy_user(to, (__force void __user *) from, n);
}
static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (n && __access_ok((unsigned long) from, n))
if (n && __access_ok((unsigned long) from, n)) {
if (!__builtin_constant_p(n))
check_object_size(to, n, false);
return __copy_user((__force void __user *) to, from, n);
else
} else
return n;
}


@ -250,8 +250,12 @@ unsigned long copy_from_user_fixup(void *to, const void __user *from,
static inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long size)
{
unsigned long ret = ___copy_from_user(to, from, size);
unsigned long ret;
if (!__builtin_constant_p(size))
check_object_size(to, size, false);
ret = ___copy_from_user(to, from, size);
if (unlikely(ret))
ret = copy_from_user_fixup(to, from, size);
@ -267,8 +271,11 @@ unsigned long copy_to_user_fixup(void __user *to, const void *from,
static inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long size)
{
unsigned long ret = ___copy_to_user(to, from, size);
unsigned long ret;
if (!__builtin_constant_p(size))
check_object_size(from, size, true);
ret = ___copy_to_user(to, from, size);
if (unlikely(ret))
ret = copy_to_user_fixup(to, from, size);
return ret;


@ -81,7 +81,7 @@
.altinstr_replacement : { *(.altinstr_replacement) }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : { *(.exit.text) }
.exit.text : { EXIT_TEXT }
.exit.data : { *(.exit.data) }
.preinit_array : {


@ -80,6 +80,7 @@ config X86
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_AOUT if X86_32
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
@ -89,7 +90,7 @@ config X86
select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL


@ -294,7 +294,7 @@
# 285 sys_setaltroot
286 i386 add_key sys_add_key
287 i386 request_key sys_request_key
288 i386 keyctl sys_keyctl
288 i386 keyctl sys_keyctl compat_sys_keyctl
289 i386 ioprio_set sys_ioprio_set
290 i386 ioprio_get sys_ioprio_get
291 i386 inotify_init sys_inotify_init


@ -24,6 +24,7 @@
#define _ASM_X86_MTRR_H
#include <uapi/asm/mtrr.h>
#include <asm/pat.h>
/*
@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
static inline void mtrr_bp_init(void)
{
pat_disable("MTRRs disabled, skipping PAT initialization too.");
}
#define mtrr_ap_init() do {} while (0)
#define mtrr_bp_init() do {} while (0)
#define set_mtrr_aps_delayed_init() do {} while (0)
#define mtrr_aps_init() do {} while (0)
#define mtrr_bp_restore() do {} while (0)


@ -5,8 +5,8 @@
#include <asm/pgtable_types.h>
bool pat_enabled(void);
void pat_disable(const char *reason);
extern void pat_init(void);
void pat_init_cache_modes(u64);
extern int reserve_memtype(u64 start, u64 end,
enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);


@ -76,6 +76,8 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
u8 ret_flags;
version = src->version;
/* Make the latest version visible */
smp_rmb();
offset = pvclock_get_nsec_offset(src);
ret = src->system_time + offset;
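
The added smp_rmb() pairs with the host's version bump on the writer side; the reader is the usual seqcount-style retry loop, which the surrounding (unchanged) callers complete roughly like this (simplified sketch, the real loop lives in pvclock_clocksource_read()):

    do {
            version = src->version;
            smp_rmb();                 /* read the fields only after version */
            offset = pvclock_get_nsec_offset(src);
            ret = src->system_time + offset;
            flags = src->flags;
    } while ((src->version & 1) || version != src->version);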


@ -177,6 +177,50 @@ static inline unsigned long current_stack_pointer(void)
return sp;
}
/*
* Walks up the stack frames to make sure that the specified object is
* entirely contained by a single stack frame.
*
* Returns:
* 1 if within a frame
* -1 if placed across a frame boundary (or outside stack)
* 0 unable to determine (no frame pointers, etc)
*/
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
const void *obj, unsigned long len)
{
#if defined(CONFIG_FRAME_POINTER)
const void *frame = NULL;
const void *oldframe;
oldframe = __builtin_frame_address(1);
if (oldframe)
frame = __builtin_frame_address(2);
/*
* low ----------------------------------------------> high
* [saved bp][saved ip][args][local vars][saved bp][saved ip]
* ^----------------^
* allow copies only within here
*/
while (stack <= frame && frame < stackend) {
/*
* If obj + len extends past the last frame, this
* check won't pass and the next frame will be 0,
* causing us to bail out and correctly report
* the copy as invalid.
*/
if (obj + len <= frame)
return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1;
oldframe = frame;
frame = *(const void * const *)frame;
}
return -1;
#else
return 0;
#endif
}
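
For context, the expected caller of this helper is the generic hardened-usercopy stack check: it passes the current task's stack bounds plus the object being copied and rejects the copy on -1. A simplified sketch of that call site (error handling reduced to a placeholder):

    static void check_stack_object(const void *obj, unsigned long len)
    {
            const void *stack    = task_stack_page(current);
            const void *stackend = stack + THREAD_SIZE;

            switch (arch_within_stack_frames(stack, stackend, obj, len)) {
            case 1:                     /* fully contained in one frame: fine       */
                    return;
            case -1:                    /* spans a frame boundary: reject the copy  */
                    usercopy_abort();   /* placeholder for the real error path      */
                    return;
            default:                    /* 0: no frame pointers, cannot tell        */
                    return;
            }
    }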
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64


@ -86,7 +86,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
static inline void __native_flush_tlb(void)
{
/*
* If current->mm == NULL then we borrow a mm which may change during a
* task switch and therefore we must not be preempted while we write CR3
* back:
*/
preempt_disable();
native_write_cr3(native_read_cr3());
preempt_enable();
}
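
A minimal sketch of why the guard matters (illustrative, not the patched function itself): a kernel thread that borrowed mm A could read CR3, be preempted and rescheduled with borrowed mm B, then write A's stale root back into CR3. Pinning preemption across the read-modify-write closes that window:

    static inline void reload_cr3_safely(void)
    {
            preempt_disable();                    /* no mm switch between read and write */
            native_write_cr3(native_read_cr3());
            preempt_enable();
    }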
static inline void __native_flush_tlb_global_irq_disabled(void)


@ -134,6 +134,9 @@ extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
#define __uaccess_begin() stac()
#define __uaccess_end() clac()
/*
* This is a type: either unsigned long, if the argument fits into
* that type, or otherwise unsigned long long.
@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
#ifdef CONFIG_X86_32
#define __put_user_asm_u64(x, addr, err, errret) \
asm volatile(ASM_STAC "\n" \
asm volatile("\n" \
"1: movl %%eax,0(%2)\n" \
"2: movl %%edx,4(%2)\n" \
"3: " ASM_CLAC "\n" \
"3:" \
".section .fixup,\"ax\"\n" \
"4: movl %3,%0\n" \
" jmp 3b\n" \
@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
: "A" (x), "r" (addr), "i" (errret), "0" (err))
#define __put_user_asm_ex_u64(x, addr) \
asm volatile(ASM_STAC "\n" \
asm volatile("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
"3: " ASM_CLAC "\n" \
"3:" \
_ASM_EXTABLE_EX(1b, 2b) \
_ASM_EXTABLE_EX(2b, 3b) \
: : "A" (x), "r" (addr))
@ -304,6 +307,10 @@ do { \
} \
} while (0)
/*
* This doesn't do __uaccess_begin/end - the exception handling
* around it must do that.
*/
#define __put_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@ -358,9 +365,9 @@ do { \
} while (0)
#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
asm volatile(ASM_STAC "\n" \
asm volatile("\n" \
"1: mov"itype" %2,%"rtype"1\n" \
"2: " ASM_CLAC "\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" xor"itype" %"rtype"1,%"rtype"1\n" \
@ -370,6 +377,10 @@ do { \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
/*
* This doesn't do __uaccess_begin/end - the exception handling
* around it must do that.
*/
#define __get_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@ -400,7 +411,9 @@ do { \
#define __put_user_nocheck(x, ptr, size) \
({ \
int __pu_err; \
__uaccess_begin(); \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
__uaccess_end(); \
__builtin_expect(__pu_err, 0); \
})
@ -408,7 +421,9 @@ do { \
({ \
int __gu_err; \
unsigned long __gu_val; \
__uaccess_begin(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
__uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
__builtin_expect(__gu_err, 0); \
})
@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; };
* aliasing issues.
*/
#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
asm volatile(ASM_STAC "\n" \
asm volatile("\n" \
"1: mov"itype" %"rtype"1,%2\n" \
"2: " ASM_CLAC "\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" jmp 2b\n" \
@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; };
*/
#define uaccess_try do { \
current_thread_info()->uaccess_err = 0; \
stac(); \
__uaccess_begin(); \
barrier();
#define uaccess_catch(err) \
clac(); \
__uaccess_end(); \
(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
} while (0)
@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void)
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
__uaccess_begin(); \
switch (size) { \
case 1: \
{ \
asm volatile("\t" ASM_STAC "\n" \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
"2:\t" ASM_CLAC "\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 2: \
{ \
asm volatile("\t" ASM_STAC "\n" \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
"2:\t" ASM_CLAC "\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 4: \
{ \
asm volatile("\t" ASM_STAC "\n" \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
"2:\t" ASM_CLAC "\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void)
if (!IS_ENABLED(CONFIG_X86_64)) \
__cmpxchg_wrong_size(); \
\
asm volatile("\t" ASM_STAC "\n" \
asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
"2:\t" ASM_CLAC "\n" \
"2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void)
default: \
__cmpxchg_wrong_size(); \
} \
__uaccess_end(); \
*__uval = __old; \
__ret; \
})
@ -689,7 +706,7 @@ __copy_from_user_overflow(int size, unsigned long count)
#endif
static inline unsigned long __must_check
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
int sz = __compiletime_object_size(to);
@ -714,9 +731,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
* case, and do only runtime checking for non-constant sizes.
*/
if (likely(sz < 0 || sz >= n))
if (likely(sz < 0 || sz >= n)) {
check_object_size(to, n, false);
n = _copy_from_user(to, from, n);
else if(__builtin_constant_p(n))
} else if (__builtin_constant_p(n))
copy_from_user_overflow();
else
__copy_from_user_overflow(sz, n);
@ -724,7 +742,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
return n;
}
static inline unsigned long __must_check
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
int sz = __compiletime_object_size(from);
@ -732,9 +750,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
might_fault();
/* See the comment in copy_from_user() above. */
if (likely(sz < 0 || sz >= n))
if (likely(sz < 0 || sz >= n)) {
check_object_size(from, n, true);
n = _copy_to_user(to, from, n);
else if(__builtin_constant_p(n))
} else if (__builtin_constant_p(n))
copy_to_user_overflow();
else
__copy_to_user_overflow(sz, n);
@ -745,5 +764,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
#undef __copy_from_user_overflow
#undef __copy_to_user_overflow
/*
* The "unsafe" user accesses aren't really "unsafe", but the naming
* is a big fat warning: you have to not only do the access_ok()
* checking before using them, but you have to surround them with the
* user_access_begin/end() pair.
*/
#define user_access_begin() __uaccess_begin()
#define user_access_end() __uaccess_end()
#define unsafe_put_user(x, ptr, err_label) \
do { \
int __pu_err; \
__put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
if (unlikely(__pu_err)) goto err_label; \
} while (0)
#define unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
unsigned long __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
if (unlikely(__gu_err)) goto err_label; \
} while (0)
#endif /* _ASM_X86_UACCESS_H */
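
The unsafe_*_user() helpers are meant for tight loops where access_ok() and the STAC/CLAC toggling are hoisted outside the loop body; with the error-target-label interface from this merge, a typical caller looks like this (illustrative function, not taken from the tree):

    long fill_user_array(unsigned int __user *uarray, unsigned int n)
    {
            unsigned int i;

            if (!access_ok(VERIFY_WRITE, uarray, n * sizeof(*uarray)))
                    return -EFAULT;

            user_access_begin();                  /* one STAC for the whole loop */
            for (i = 0; i < n; i++)
                    unsafe_put_user(i, &uarray[i], efault);
            user_access_end();                    /* matching CLAC               */
            return 0;

    efault:
            user_access_end();
            return -EFAULT;
    }

Note that the error label must still call user_access_end(), since the goto leaves the access window open.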


@ -33,38 +33,11 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
* the specified block with access_ok() before calling this function.
* The caller should also make sure he pins the user space address
* so that we don't result in page fault and sleep.
*
* Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
* we return the initial request size (1, 2 or 4), as copy_*_user should do.
* If a store crosses a page boundary and gets a fault, the x86 will not write
* anything, so this is accurate.
*/
static __always_inline unsigned long __must_check
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
if (__builtin_constant_p(n)) {
unsigned long ret;
switch (n) {
case 1:
__put_user_size(*(u8 *)from, (u8 __user *)to,
1, ret, 1);
return ret;
case 2:
__put_user_size(*(u16 *)from, (u16 __user *)to,
2, ret, 2);
return ret;
case 4:
__put_user_size(*(u32 *)from, (u32 __user *)to,
4, ret, 4);
return ret;
case 8:
__put_user_size(*(u64 *)from, (u64 __user *)to,
8, ret, 8);
return ret;
}
}
check_object_size(from, n, true);
return __copy_to_user_ll(to, from, n);
}
@ -93,26 +66,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
static __always_inline unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
/* Avoid zeroing the tail if the copy fails..
* If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
* but as the zeroing behaviour is only significant when n is not
* constant, that shouldn't be a problem.
*/
if (__builtin_constant_p(n)) {
unsigned long ret;
switch (n) {
case 1:
__get_user_size(*(u8 *)to, from, 1, ret, 1);
return ret;
case 2:
__get_user_size(*(u16 *)to, from, 2, ret, 2);
return ret;
case 4:
__get_user_size(*(u32 *)to, from, 4, ret, 4);
return ret;
}
}
return __copy_from_user_ll_nozero(to, from, n);
}
@ -143,18 +96,25 @@ static __always_inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
might_fault();
check_object_size(to, n, false);
if (__builtin_constant_p(n)) {
unsigned long ret;
switch (n) {
case 1:
__uaccess_begin();
__get_user_size(*(u8 *)to, from, 1, ret, 1);
__uaccess_end();
return ret;
case 2:
__uaccess_begin();
__get_user_size(*(u16 *)to, from, 2, ret, 2);
__uaccess_end();
return ret;
case 4:
__uaccess_begin();
__get_user_size(*(u32 *)to, from, 4, ret, 4);
__uaccess_end();
return ret;
}
}
@ -170,13 +130,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
switch (n) {
case 1:
__uaccess_begin();
__get_user_size(*(u8 *)to, from, 1, ret, 1);
__uaccess_end();
return ret;
case 2:
__uaccess_begin();
__get_user_size(*(u16 *)to, from, 2, ret, 2);
__uaccess_end();
return ret;
case 4:
__uaccess_begin();
__get_user_size(*(u32 *)to, from, 4, ret, 4);
__uaccess_end();
return ret;
}
}


@ -53,38 +53,53 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
{
int ret = 0;
check_object_size(dst, size, false);
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
case 1:
__uaccess_begin();
__get_user_asm(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
case 2:
__uaccess_begin();
__get_user_asm(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
case 4:
__uaccess_begin();
__get_user_asm(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
__uaccess_end();
return ret;
case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
case 8:
__uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
case 10:
__uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
if (unlikely(ret))
return ret;
__get_user_asm(*(u16 *)(8 + (char *)dst),
(u16 __user *)(8 + (char __user *)src),
ret, "w", "w", "=r", 2);
if (likely(!ret))
__get_user_asm(*(u16 *)(8 + (char *)dst),
(u16 __user *)(8 + (char __user *)src),
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 16:
__uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
if (unlikely(ret))
return ret;
__get_user_asm(*(u64 *)(8 + (char *)dst),
(u64 __user *)(8 + (char __user *)src),
ret, "q", "", "=r", 8);
if (likely(!ret))
__get_user_asm(*(u64 *)(8 + (char *)dst),
(u64 __user *)(8 + (char __user *)src),
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
default:
return copy_user_generic(dst, (__force void *)src, size);
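In the 10- and 16-byte cases the old early return after the first __get_user_asm() had to go: once __uaccess_begin() has run, every path must reach __uaccess_end() before returning, so the second access is now guarded by if (likely(!ret)) instead. The general shape of the pattern, with first_word()/second_word() as placeholders for the two accesses:

static int copy_two_words(void)
{
	int ret = 0;

	__uaccess_begin();
	first_word(&ret);		/* placeholder for the first __get_user_asm() */
	if (likely(!ret))		/* on failure: skip, do not return early */
		second_word(&ret);	/* placeholder for the second access */
	__uaccess_end();		/* always reached, AC is never left enabled */
	return ret;
}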
@@ -103,38 +118,55 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
{
int ret = 0;
check_object_size(src, size, true);
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
case 1:
__uaccess_begin();
__put_user_asm(*(u8 *)src, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
__uaccess_end();
return ret;
case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
case 2:
__uaccess_begin();
__put_user_asm(*(u16 *)src, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
__uaccess_end();
return ret;
case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
case 4:
__uaccess_begin();
__put_user_asm(*(u32 *)src, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
__uaccess_end();
return ret;
case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
case 8:
__uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 8);
__uaccess_end();
return ret;
case 10:
__uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 10);
if (unlikely(ret))
return ret;
asm("":::"memory");
__put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
ret, "w", "w", "ir", 2);
if (likely(!ret)) {
asm("":::"memory");
__put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
ret, "w", "w", "ir", 2);
}
__uaccess_end();
return ret;
case 16:
__uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 16);
if (unlikely(ret))
return ret;
asm("":::"memory");
__put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
ret, "q", "", "er", 8);
if (likely(!ret)) {
asm("":::"memory");
__put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
ret, "q", "", "er", 8);
}
__uaccess_end();
return ret;
default:
return copy_user_generic((__force void *)dst, src, size);
@@ -160,39 +192,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
switch (size) {
case 1: {
u8 tmp;
__uaccess_begin();
__get_user_asm(tmp, (u8 __user *)src,
ret, "b", "b", "=q", 1);
if (likely(!ret))
__put_user_asm(tmp, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
__uaccess_end();
return ret;
}
case 2: {
u16 tmp;
__uaccess_begin();
__get_user_asm(tmp, (u16 __user *)src,
ret, "w", "w", "=r", 2);
if (likely(!ret))
__put_user_asm(tmp, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
__uaccess_end();
return ret;
}
case 4: {
u32 tmp;
__uaccess_begin();
__get_user_asm(tmp, (u32 __user *)src,
ret, "l", "k", "=r", 4);
if (likely(!ret))
__put_user_asm(tmp, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
__uaccess_end();
return ret;
}
case 8: {
u64 tmp;
__uaccess_begin();
__get_user_asm(tmp, (u64 __user *)src,
ret, "q", "", "=r", 8);
if (likely(!ret))
__put_user_asm(tmp, (u64 __user *)dst,
ret, "q", "", "er", 8);
__uaccess_end();
return ret;
}
default:


@@ -1587,6 +1587,9 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
int ret, ir_stat;
if (skip_ioapic_setup)
return;
ir_stat = irq_remapping_prepare();
if (ir_stat < 0 && !x2apic_supported())
return;


@@ -152,6 +152,11 @@ static struct clocksource hyperv_cs = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static unsigned char hv_get_nmi_reason(void)
{
return 0;
}
static void __init ms_hyperv_init_platform(void)
{
/*
@@ -191,6 +196,13 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
mark_tsc_unstable("running on Hyper-V");
/*
* Generation 2 instances don't support reading the NMI status from
* 0x61 port.
*/
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
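The override above exists because the stock x86 handler reads the NMI reason byte from legacy I/O port 0x61, which Generation 2 (EFI-booted) Hyper-V guests do not emulate; returning 0 sidesteps that read. Roughly, the default being bypassed looks like the sketch below (assumed to mirror default_get_nmi_reason() from asm/mach_traps.h):

#define NMI_REASON_PORT	0x61

static unsigned char default_get_nmi_reason(void)
{
	return inb(NMI_REASON_PORT);	/* legacy NMI status/control register */
}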

Some files were not shown because too many files have changed in this diff.