android_kernel_oneplus_msm8998/mm/page-writeback.c
Runmin Wang 4b7c952db6 Merge tag 'lsk-v4.4-16.12-android' into branch 'msm-4.4'
* remotes/origin/tmp-2f0de51:
  Linux 4.4.38
  esp6: Fix integrity verification when ESN are used
  esp4: Fix integrity verification when ESN are used
  ipv4: Set skb->protocol properly for local output
  ipv6: Set skb->protocol properly for local output
  Don't feed anything but regular iovec's to blk_rq_map_user_iov
  constify iov_iter_count() and iter_is_iovec()
  sparc64: fix compile warning section mismatch in find_node()
  sparc64: Fix find_node warning if numa node cannot be found
  sparc32: Fix inverted invalid_frame_pointer checks on sigreturns
  net: ping: check minimum size on ICMP header length
  net: avoid signed overflows for SO_{SND|RCV}BUFFORCE
  geneve: avoid use-after-free of skb->data
  sh_eth: remove unchecked interrupts for RZ/A1
  net: bcmgenet: Utilize correct struct device for all DMA operations
  packet: fix race condition in packet_set_ring
  net/dccp: fix use-after-free in dccp_invalid_packet
  netlink: Do not schedule work from sk_destruct
  netlink: Call cb->done from a worker thread
  net/sched: pedit: make sure that offset is valid
  net, sched: respect rcu grace period on cls destruction
  net: dsa: bcm_sf2: Ensure we re-negotiate EEE during after link change
  l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind()
  rtnetlink: fix FDB size computation
  af_unix: conditionally use freezable blocking calls in read
  net: sky2: Fix shutdown crash
  ip6_tunnel: disable caching when the traffic class is inherited
  net: check dead netns for peernet2id_alloc()
  virtio-net: add a missing synchronize_net()
  Linux 4.4.37
  arm64: suspend: Reconfigure PSTATE after resume from idle
  arm64: mm: Set PSTATE.PAN from the cpu_enable_pan() call
  arm64: cpufeature: Schedule enable() calls instead of calling them via IPI
  pwm: Fix device reference leak
  mwifiex: printk() overflow with 32-byte SSIDs
  PCI: Set Read Completion Boundary to 128 iff Root Port supports it (_HPX)
  PCI: Export pcie_find_root_port
  rcu: Fix soft lockup for rcu_nocb_kthread
  ALSA: pcm : Call kill_fasync() in stream lock
  x86/traps: Ignore high word of regs->cs in early_fixup_exception()
  kasan: update kasan_global for gcc 7
  zram: fix unbalanced idr management at hot removal
  ARC: Don't use "+l" inline asm constraint
  Linux 4.4.36
  scsi: mpt3sas: Unblock device after controller reset
  flow_dissect: call init_default_flow_dissectors() earlier
  mei: fix return value on disconnection
  mei: me: fix place for kaby point device ids.
  mei: me: disable driver on SPT SPS firmware
  drm/radeon: Ensure vblank interrupt is enabled on DPMS transition to on
  mpi: Fix NULL ptr dereference in mpi_powm() [ver #3]
  parisc: Also flush data TLB in flush_icache_page_asm
  parisc: Fix race in pci-dma.c
  parisc: Fix races in parisc_setup_cache_timing()
  NFSv4.x: hide array-bounds warning
  apparmor: fix change_hat not finding hat after policy replacement
  cfg80211: limit scan results cache size
  tile: avoid using clocksource_cyc2ns with absolute cycle count
  scsi: mpt3sas: Fix secure erase premature termination
  Fix USB CB/CBI storage devices with CONFIG_VMAP_STACK=y
  USB: serial: ftdi_sio: add support for TI CC3200 LaunchPad
  USB: serial: cp210x: add ID for the Zone DPMX
  usb: chipidea: move the lock initialization to core file
  KVM: x86: check for pic and ioapic presence before use
  KVM: x86: drop error recovery in em_jmp_far and em_ret_far
  iommu/vt-d: Fix IOMMU lookup for SR-IOV Virtual Functions
  iommu/vt-d: Fix PASID table allocation
  sched: tune: Fix lacking spinlock initialization
  UPSTREAM: trace: Update documentation for mono, mono_raw and boot clock
  UPSTREAM: trace: Add an option for boot clock as trace clock
  UPSTREAM: timekeeping: Add a fast and NMI safe boot clock
  ANDROID: goldfish_pipe: fix allmodconfig build
  ANDROID: goldfish: goldfish_pipe: fix locking errors
  ANDROID: video: goldfishfb: fix platform_no_drv_owner.cocci warnings
  ANDROID: goldfish_pipe: fix call_kern.cocci warnings
  arm64: rename ranchu defconfig to ranchu64
  ANDROID: arch: x86: disable pic for Android toolchain
  ANDROID: goldfish_pipe: An implementation of more parallel pipe
  ANDROID: goldfish_pipe: bugfixes and performance improvements.
  ANDROID: goldfish: Add goldfish sync driver
  ANDROID: goldfish: add ranchu defconfigs
  ANDROID: goldfish_audio: Clear audio read buffer status after each read
  ANDROID: goldfish_events: no extra EV_SYN; register goldfish
  ANDROID: goldfish_fb: Set pixclock = 0
  ANDROID: goldfish: Enable ACPI-based enumeration for goldfish audio
  ANDROID: goldfish: Enable ACPI-based enumeration for goldfish framebuffer
  ANDROID: video: goldfishfb: add devicetree bindings
  BACKPORT: staging: goldfish: audio: fix compiliation on arm
  BACKPORT: Input: goldfish_events - enable ACPI-based enumeration for goldfish events
  BACKPORT: goldfish: Enable ACPI-based enumeration for goldfish battery
  BACKPORT: drivers: tty: goldfish: Add device tree bindings
  BACKPORT: tty: goldfish: support platform_device with id -1
  BACKPORT: Input: goldfish_events - add devicetree bindings
  BACKPORT: power: goldfish_battery: add devicetree bindings
  BACKPORT: staging: goldfish: audio: add devicetree bindings
  ANDROID: usb: gadget: function: cleanup: Add blank line after declaration
  cpufreq: sched: Fix kernel crash on accessing sysfs file
  usb: gadget: f_mtp: simplify ptp NULL pointer check
  cgroup: replace unified-hierarchy.txt with a proper cgroup v2 documentation
  cgroup: rename Documentation/cgroups/ to Documentation/cgroup-legacy/
  cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type
  writeback: initialize inode members that track writeback history
  mm: page_alloc: generalize the dirty balance reserve
  block: fix module reference leak on put_disk() call for cgroups throttle
  Linux 4.4.35
  netfilter: nft_dynset: fix element timeout for HZ != 1000
  IB/cm: Mark stale CM id's whenever the mad agent was unregistered
  IB/uverbs: Fix leak of XRC target QPs
  IB/core: Avoid unsigned int overflow in sg_alloc_table
  IB/mlx5: Fix fatal error dispatching
  IB/mlx5: Use cache line size to select CQE stride
  IB/mlx4: Fix create CQ error flow
  IB/mlx4: Check gid_index return value
  PM / sleep: don't suspend parent when async child suspend_{noirq, late} fails
  PM / sleep: fix device reference leak in test_suspend
  uwb: fix device reference leaks
  mfd: core: Fix device reference leak in mfd_clone_cell
  iwlwifi: pcie: fix SPLC structure parsing
  rtc: omap: Fix selecting external osc
  clk: mmp: mmp2: fix return value check in mmp2_clk_init()
  clk: mmp: pxa168: fix return value check in pxa168_clk_init()
  clk: mmp: pxa910: fix return value check in pxa910_clk_init()
  drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)
  crypto: caam - do not register AES-XTS mode on LP units
  ext4: sanity check the block and cluster size at mount time
  kbuild: Steal gcc's pie from the very beginning
  x86/kexec: add -fno-PIE
  scripts/has-stack-protector: add -fno-PIE
  kbuild: add -fno-PIE
  i2c: mux: fix up dependencies
  can: bcm: fix warning in bcm_connect/proc_register
  mfd: intel-lpss: Do not put device in reset state on suspend
  fuse: fix fuse_write_end() if zero bytes were copied
  KVM: Disable irq while unregistering user notifier
  KVM: x86: fix missed SRCU usage in kvm_lapic_set_vapic_addr
  x86/cpu/AMD: Fix cpu_llc_id for AMD Fam17h systems
  Linux 4.4.34
  sparc64: Delete now unused user copy fixup functions.
  sparc64: Delete now unused user copy assembler helpers.
  sparc64: Convert U3copy_{from,to}_user to accurate exception reporting.
  sparc64: Convert NG2copy_{from,to}_user to accurate exception reporting.
  sparc64: Convert NGcopy_{from,to}_user to accurate exception reporting.
  sparc64: Convert NG4copy_{from,to}_user to accurate exception reporting.
  sparc64: Convert U1copy_{from,to}_user to accurate exception reporting.
  sparc64: Convert GENcopy_{from,to}_user to accurate exception reporting.
  sparc64: Convert copy_in_user to accurate exception reporting.
  sparc64: Prepare to move to more saner user copy exception handling.
  sparc64: Delete __ret_efault.
  sparc64: Handle extremely large kernel TLB range flushes more gracefully.
  sparc64: Fix illegal relative branches in hypervisor patched TLB cross-call code.
  sparc64: Fix instruction count in comment for __hypervisor_flush_tlb_pending.
  sparc64: Fix illegal relative branches in hypervisor patched TLB code.
  sparc64: Handle extremely large kernel TSB range flushes sanely.
  sparc: Handle negative offsets in arch_jump_label_transform
  sparc64 mm: Fix base TSB sizing when hugetlb pages are used
  sparc: serial: sunhv: fix a double lock bug
  sparc: Don't leak context bits into thread->fault_address
  tty: Prevent ldisc drivers from re-using stale tty fields
  tcp: take care of truncations done by sk_filter()
  ipv4: use new_gw for redirect neigh lookup
  net: __skb_flow_dissect() must cap its return value
  sock: fix sendmmsg for partial sendmsg
  fib_trie: Correct /proc/net/route off by one error
  sctp: assign assoc_id earlier in __sctp_connect
  ipv6: dccp: add missing bind_conflict to dccp_ipv6_mapped
  ipv6: dccp: fix out of bound access in dccp_v6_err()
  dccp: fix out of bound access in dccp_v4_err()
  dccp: do not send reset to already closed sockets
  tcp: fix potential memory corruption
  ip6_tunnel: Clear IP6CB in ip6tunnel_xmit()
  bgmac: stop clearing DMA receive control register right after it is set
  net: mangle zero checksum in skb_checksum_help()
  net: clear sk_err_soft in sk_clone_lock()
  dctcp: avoid bogus doubling of cwnd after loss
  ARM: 8485/1: cpuidle: remove cpu parameter from the cpuidle_ops suspend hook
  Linux 4.4.33
  netfilter: fix namespace handling in nf_log_proc_dostring
  btrfs: qgroup: Prevent qgroup->reserved from going subzero
  mmc: mxs: Initialize the spinlock prior to using it
  ASoC: sun4i-codec: return error code instead of NULL when create_card fails
  ACPI / APEI: Fix incorrect return value of ghes_proc()
  i40e: fix call of ndo_dflt_bridge_getlink()
  hwrng: core - Don't use a stack buffer in add_early_randomness()
  lib/genalloc.c: start search from start of chunk
  mei: bus: fix received data size check in NFC fixup
  iommu/vt-d: Fix dead-locks in disable_dmar_iommu() path
  iommu/amd: Free domain id when free a domain of struct dma_ops_domain
  tty/serial: at91: fix hardware handshake on Atmel platforms
  dmaengine: at_xdmac: fix spurious flag status for mem2mem transfers
  drm/i915: Respect alternate_ddc_pin for all DDI ports
  KVM: MIPS: Precalculate MMIO load resume PC
  scsi: mpt3sas: Fix for block device of raid exists even after deleting raid disk
  scsi: qla2xxx: Fix scsi scan hang triggered if adapter fails during init
  iio: orientation: hid-sensor-rotation: Add PM function (fix non working driver)
  iio: hid-sensors: Increase the precision of scale to fix wrong reading interpretation.
  clk: qoriq: Don't allow CPU clocks higher than starting value
  toshiba-wmi: Fix loading the driver on non Toshiba laptops
  drbd: Fix kernel_sendmsg() usage - potential NULL deref
  usb: gadget: u_ether: remove interrupt throttling
  USB: cdc-acm: fix TIOCMIWAIT
  staging: nvec: remove managed resource from PS2 driver
  Revert "staging: nvec: ps2: change serio type to passthrough"
  drivers: staging: nvec: remove bogus reset command for PS/2 interface
  staging: iio: ad5933: avoid uninitialized variable in error case
  pinctrl: cherryview: Prevent possible interrupt storm on resume
  pinctrl: cherryview: Serialize register access in suspend/resume
  ARC: timer: rtc: implement read loop in "C" vs. inline asm
  s390/hypfs: Use get_free_page() instead of kmalloc to ensure page alignment
  coredump: fix unfreezable coredumping task
  swapfile: fix memory corruption via malformed swapfile
  dib0700: fix nec repeat handling
  ASoC: cs4270: fix DAPM stream name mismatch
  ALSA: info: Limit the proc text input size
  ALSA: info: Return error for invalid read/write
  arm64: Enable KPROBES/HIBERNATION/CORESIGHT in defconfig
  arm64: kvm: allows kvm cpu hotplug
  arm64: KVM: Register CPU notifiers when the kernel runs at HYP
  arm64: KVM: Skip HYP setup when already running in HYP
  arm64: hyp/kvm: Make hyp-stub reject kvm_call_hyp()
  arm64: hyp/kvm: Make hyp-stub extensible
  arm64: kvm: Move lr save/restore from do_el2_call into EL1
  arm64: kvm: deal with kernel symbols outside of linear mapping
  arm64: introduce KIMAGE_VADDR as the virtual base of the kernel region
  ANDROID: video: adf: Avoid directly referencing user pointers
  ANDROID: usb: gadget: audio_source: fix comparison of distinct pointer types
  android: binder: support for file-descriptor arrays.
  android: binder: support for scatter-gather.
  android: binder: add extra size to allocator.
  android: binder: refactor binder_transact()
  android: binder: support multiple /dev instances.
  android: binder: deal with contexts in debugfs.
  android: binder: support multiple context managers.
  android: binder: split flat_binder_object.
  disable aio support in recommended configuration
  Linux 4.4.32
  scsi: megaraid_sas: fix macro MEGASAS_IS_LOGICAL to avoid regression
  drm/radeon: fix DP mode validation
  drm/radeon/dp: add back special handling for NUTMEG
  drm/amdgpu: fix DP mode validation
  drm/amdgpu/dp: add back special handling for NUTMEG
  KVM: MIPS: Drop other CPU ASIDs on guest MMU changes
  Revert KVM: MIPS: Drop other CPU ASIDs on guest MMU changes
  of: silence warnings due to max() usage
  packet: on direct_xmit, limit tso and csum to supported devices
  sctp: validate chunk len before actually using it
  net sched filters: fix notification of filter delete with proper handle
  udp: fix IP_CHECKSUM handling
  net: sctp, forbid negative length
  ipv4: use the right lock for ping_group_range
  ipv4: disable BH in set_ping_group_range()
  net: add recursion limit to GRO
  rtnetlink: Add rtnexthop offload flag to compare mask
  bridge: multicast: restore perm router ports on multicast enable
  net: pktgen: remove rcu locking in pktgen_change_name()
  ipv6: correctly add local routes when lo goes up
  ip6_tunnel: fix ip6_tnl_lookup
  ipv6: tcp: restore IP6CB for pktoptions skbs
  netlink: do not enter direct reclaim from netlink_dump()
  packet: call fanout_release, while UNREGISTERING a netdev
  net: Add netdev all_adj_list refcnt propagation to fix panic
  net/sched: act_vlan: Push skb->data to mac_header prior calling skb_vlan_*() functions
  net: pktgen: fix pkt_size
  net: fec: set mac address unconditionally
  tg3: Avoid NULL pointer dereference in tg3_io_error_detected()
  ipmr, ip6mr: fix scheduling while atomic and a deadlock with ipmr_get_route
  ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()
  tcp: fix a compile error in DBGUNDO()
  tcp: fix wrong checksum calculation on MTU probing
  net: avoid sk_forward_alloc overflows
  tcp: fix overflow in __tcp_retransmit_skb()
  arm64/kvm: fix build issue on kvm debug
  arm64: ptdump: Indicate whether memory should be faulting
  arm64: Add support for ARCH_SUPPORTS_DEBUG_PAGEALLOC
  arm64: Drop alloc function from create_mapping
  arm64: allow vmalloc regions to be set with set_memory_*
  arm64: kernel: implement ACPI parking protocol
  arm64: mm: create new fine-grained mappings at boot
  arm64: ensure _stext and _etext are page-aligned
  arm64: mm: allow passing a pgdir to alloc_init_*
  arm64: mm: allocate pagetables anywhere
  arm64: mm: use fixmap when creating page tables
  arm64: mm: add functions to walk tables in fixmap
  arm64: mm: add __{pud,pgd}_populate
  arm64: mm: avoid redundant __pa(__va(x))
  Linux 4.4.31
  HID: usbhid: add ATEN CS962 to list of quirky devices
  ubi: fastmap: Fix add_vol() return value test in ubi_attach_fastmap()
  kvm: x86: Check memopp before dereference (CVE-2016-8630)
  tty: vt, fix bogus division in csi_J
  usb: dwc3: Fix size used in dma_free_coherent()
  pwm: Unexport children before chip removal
  UBI: fastmap: scrub PEB when bitflips are detected in a free PEB EC header
  Disable "frame-address" warning
  smc91x: avoid self-comparison warning
  cgroup: avoid false positive gcc-6 warning
  drm/exynos: fix error handling in exynos_drm_subdrv_open
  mm/cma: silence warnings due to max() usage
  ARM: 8584/1: floppy: avoid gcc-6 warning
  powerpc/ptrace: Fix out of bounds array access warning
  x86/xen: fix upper bound of pmd loop in xen_cleanhighmap()
  perf build: Fix traceevent plugins build race
  drm/dp/mst: Check peer device type before attempting EDID read
  drm/radeon: drop register readback in cayman_cp_int_cntl_setup
  drm/radeon/si_dpm: workaround for SI kickers
  drm/radeon/si_dpm: Limit clocks on HD86xx part
  Revert "drm/radeon: fix DP link training issue with second 4K monitor"
  mmc: dw_mmc-pltfm: fix the potential NULL pointer dereference
  scsi: arcmsr: Send SYNCHRONIZE_CACHE command to firmware
  scsi: scsi_debug: Fix memory leak if LBP enabled and module is unloaded
  scsi: megaraid_sas: Fix data integrity failure for JBOD (passthrough) devices
  mac80211: discard multicast and 4-addr A-MSDUs
  firewire: net: fix fragmented datagram_size off-by-one
  firewire: net: guard against rx buffer overflows
  Input: i8042 - add XMG C504 to keyboard reset table
  dm mirror: fix read error on recovery after default leg failure
  virtio: console: Unlock vqs while freeing buffers
  virtio_ring: Make interrupt suppression spec compliant
  parisc: Ensure consistent state when switching to kernel stack at syscall entry
  ovl: fsync after copy-up
  KVM: MIPS: Make ERET handle ERL before EXL
  KVM: x86: fix wbinvd_dirty_mask use-after-free
  dm: free io_barrier after blk_cleanup_queue call
  USB: serial: cp210x: fix tiocmget error handling
  tty: limit terminal size to 4M chars
  xhci: add restart quirk for Intel Wildcatpoint PCH
  hv: do not lose pending heartbeat vmbus packets
  vt: clear selection before resizing
  Fix potential infoleak in older kernels
  GenWQE: Fix bad page access during abort of resource allocation
  usb: increase ohci watchdog delay to 275 msec
  xhci: use default USB_RESUME_TIMEOUT when resuming ports.
  USB: serial: ftdi_sio: add support for Infineon TriBoard TC2X7
  USB: serial: fix potential NULL-dereference at probe
  usb: gadget: function: u_ether: don't starve tx request queue
  mei: txe: don't clean an unprocessed interrupt cause.
  ubifs: Fix regression in ubifs_readdir()
  ubifs: Abort readdir upon error
  btrfs: fix races on root_log_ctx lists
  ANDROID: binder: Clear binder and cookie when setting handle in flat binder struct
  ANDROID: binder: Add strong ref checks
  ALSA: hda - Fix headset mic detection problem for two Dell laptops
  ALSA: hda - Adding a new group of pin cfg into ALC295 pin quirk table
  ALSA: hda - allow 40 bit DMA mask for NVidia devices
  ALSA: hda - Raise AZX_DCAPS_RIRB_DELAY handling into top drivers
  ALSA: hda - Merge RIRB_PRE_DELAY into CTX_WORKAROUND caps
  ALSA: usb-audio: Add quirk for Syntek STK1160
  KEYS: Fix short sprintf buffer in /proc/keys show function
  mm: memcontrol: do not recurse in direct reclaim
  mm/list_lru.c: avoid error-path NULL pointer deref
  libxfs: clean up _calc_dquots_per_chunk
  h8300: fix syscall restarting
  drm/dp/mst: Clear port->pdt when tearing down the i2c adapter
  i2c: core: fix NULL pointer dereference under race condition
  i2c: xgene: Avoid dma_buffer overrun
  arm64:cpufeature ARM64_NCAPS is the indicator of last feature
  arm64: hibernate: Refuse to hibernate if the boot cpu is offline
  PM / sleep: Add support for read-only sysfs attributes
  arm64: kernel: Add support for hibernate/suspend-to-disk
  arm64: mm: add functions to walk page tables by PA
  arm64: mm: move pte_* macros
  PM / Hibernate: Call flush_icache_range() on pages restored in-place
  arm64: Add new asm macro copy_page
  arm64: Promote KERNEL_START/KERNEL_END definitions to a header file
  arm64: kernel: Include _AC definition in page.h
  arm64: Change cpu_resume() to enable mmu early then access sleep_sp by va
  arm64: kernel: Rework finisher callback out of __cpu_suspend_enter()
  arm64: Cleanup SCTLR flags
  arm64: Fold proc-macros.S into assembler.h
  arm/arm64: KVM: Add hook for C-based stage2 init
  arm/arm64: KVM: Detect vGIC presence at runtime
  arm64: KVM: Add support for 16-bit VMID
  arm: KVM: Make kvm_arm.h friendly to assembly code
  arm/arm64: KVM: Remove unreferenced S2_PGD_ORDER
  arm64: KVM: debug: Remove spurious inline attributes
  ARM: KVM: Cleanup exception injection
  arm64: KVM: Remove weak attributes
  arm64: KVM: Cleanup asm-offset.c
  arm64: KVM: Turn system register numbers to an enum
  arm64: KVM: VHE: Patch out use of HVC
  arm64: Add ARM64_HAS_VIRT_HOST_EXTN feature
  arm/arm64: Add new is_kernel_in_hyp_mode predicate
  arm64: KVM: Move away from the assembly version of the world switch
  arm64: KVM: Map the kernel RO section into HYP
  arm64: KVM: Add compatibility aliases
  arm64: KVM: Implement vgic-v3 save/restore
  arm64: KVM: Add panic handling
  arm64: KVM: HYP mode entry points
  arm64: KVM: Implement TLB handling
  arm64: KVM: Implement fpsimd save/restore
  arm64: KVM: Implement the core world switch
  arm64: KVM: Add patchable function selector
  arm64: KVM: Implement guest entry
  arm64: KVM: Implement debug save/restore
  arm64: KVM: Implement 32bit system register save/restore
  arm64: KVM: Implement system register save/restore
  arm64: KVM: Implement timer save/restore
  arm64: KVM: Implement vgic-v2 save/restore
  arm64: KVM: Add a HYP-specific header file
  KVM: arm/arm64: vgic-v3: Make the LR indexing macro public
  arm64: Add macros to read/write system registers
  Linux 4.4.30
  Revert "fix minor infoleak in get_user_ex()"
  Revert "x86/mm: Expand the exception table logic to allow new handling options"
  Linux 4.4.29
  ARM: pxa: pxa_cplds: fix interrupt handling
  powerpc/nvram: Fix an incorrect partition merge
  mpt3sas: Don't spam logs if logging level is 0
  perf symbols: Fixup symbol sizes before picking best ones
  perf symbols: Check symbol_conf.allow_aliases for kallsyms loading too
  perf hists browser: Fix event group display
  clk: divider: Fix clk_divider_round_rate() to use clk_readl()
  clk: qoriq: fix a register offset error
  s390/con3270: fix insufficient space padding
  s390/con3270: fix use of uninitialised data
  s390/cio: fix accidental interrupt enabling during resume
  x86/mm: Expand the exception table logic to allow new handling options
  dmaengine: ipu: remove bogus NO_IRQ reference
  power: bq24257: Fix use of uninitialized pointer bq->charger
  staging: r8188eu: Fix scheduling while atomic splat
  ASoC: dapm: Fix kcontrol creation for output driver widget
  ASoC: dapm: Fix value setting for _ENUM_DOUBLE MUX's second channel
  ASoC: dapm: Fix possible uninitialized variable in snd_soc_dapm_get_volsw()
  ASoC: topology: Fix error return code in soc_tplg_dapm_widget_create()
  hwrng: omap - Only fail if pm_runtime_get_sync returns < 0
  crypto: arm/ghash-ce - add missing async import/export
  crypto: gcm - Fix IV buffer size in crypto_gcm_setkey
  mwifiex: correct aid value during tdls setup
  spi: spi-fsl-dspi: Drop extra spi_master_put in device remove function
  ARM: clk-imx35: fix name for ckil clk
  uio: fix dmem_region_start computation
  genirq/generic_chip: Add irq_unmap callback
  perf stat: Fix interval output values
  powerpc/eeh: Null check uses of eeh_pe_bus_get
  tunnels: Remove encapsulation offloads on decap.
  tunnels: Don't apply GRO to multiple layers of encapsulation.
  ipip: Properly mark ipip GRO packets as encapsulated.
  posix_acl: Clear SGID bit when setting file permissions
  brcmfmac: avoid potential stack overflow in brcmf_cfg80211_start_ap()
  mm/hugetlb: fix memory offline with hugepage size > memory block size
  drm/i915: Unalias obj->phys_handle and obj->userptr
  drm/i915: Account for TSEG size when determining 865G stolen base
  Revert "drm/i915: Check live status before reading edid"
  drm/i915/gen9: fix the WaWmMemoryReadLatency implementation
  xenbus: don't look up transaction IDs for ordinary writes
  drm/vmwgfx: Limit the user-space command buffer size
  drm/radeon: change vblank_time's calculation method to reduce computational error.
  drm/radeon/si/dpm: fix phase shedding setup
  drm/radeon: narrow asic_init for virtualization
  drm/amdgpu: change vblank_time's calculation method to reduce computational error.
  drm/amdgpu/dce11: add missing drm_mode_config_cleanup call
  drm/amdgpu/dce11: disable hpd on local panels
  drm/amdgpu/dce8: disable hpd on local panels
  drm/amdgpu/dce10: disable hpd on local panels
  drm/amdgpu: fix IB alignment for UVD
  drm/prime: Pass the right module owner through to dma_buf_export()
  Linux 4.4.28
  target: Don't override EXTENDED_COPY xcopy_pt_cmd SCSI status code
  target: Make EXTENDED_COPY 0xe4 failure return COPY TARGET DEVICE NOT REACHABLE
  target: Re-add missing SCF_ACK_KREF assignment in v4.1.y
  ubifs: Fix xattr_names length in exit paths
  jbd2: fix incorrect unlock on j_list_lock
  ext4: do not advertise encryption support when disabled
  mmc: rtsx_usb_sdmmc: Handle runtime PM while changing the led
  mmc: rtsx_usb_sdmmc: Avoid keeping the device runtime resumed when unused
  mmc: core: Annotate cmd_hdr as __le32
  powerpc/mm: Prevent unlikely crash in copro_calculate_slb()
  ceph: fix error handling in ceph_read_iter
  arm64: kernel: Init MDCR_EL2 even in the absence of a PMU
  arm64: percpu: rewrite ll/sc loops in assembly
  memstick: rtsx_usb_ms: Manage runtime PM when accessing the device
  memstick: rtsx_usb_ms: Runtime resume the device when polling for cards
  isofs: Do not return EACCES for unknown filesystems
  irqchip/gic-v3-its: Fix entry size mask for GITS_BASER
  s390/mm: fix gmap tlb flush issues
  Using BUG_ON() as an assert() is _never_ acceptable
  mm: filemap: fix mapping->nrpages double accounting in fuse
  mm: workingset: fix crash in shadow node shrinker caused by replace_page_cache_page()
  acpi, nfit: check for the correct event code in notifications
  net/mlx4_core: Allow resetting VF admin mac to zero
  bnx2x: Prevent false warning for lack of FC NPIV
  PKCS#7: Don't require SpcSpOpusInfo in Authenticode pkcs7 signatures
  hpsa: correct skipping masked peripherals
  sd: Fix rw_max for devices that report an optimal xfer size
  irqchip/gicv3: Handle loop timeout proper
  kvm: x86: memset whole irq_eoi
  x86/e820: Don't merge consecutive E820_PRAM ranges
  blkcg: Unlock blkcg_pol_mutex only once when cpd == NULL
  Fix regression which breaks DFS mounting
  Cleanup missing frees on some ioctls
  Do not send SMB3 SET_INFO request if nothing is changing
  SMB3: GUIDs should be constructed as random but valid uuids
  Set previous session id correctly on SMB3 reconnect
  Display number of credits available
  Clarify locking of cifs file and tcon structures and make more granular
  fs/cifs: keep guid when assigning fid to fileinfo
  cifs: Limit the overall credit acquired
  fs/super.c: fix race between freeze_super() and thaw_super()
  arc: don't leak bits of kernel stack into coredump
  lightnvm: ensure that nvm_dev_ops can be used without CONFIG_NVM
  ipc/sem.c: fix complex_count vs. simple op race
  mm: filemap: don't plant shadow entries without radix tree node
  metag: Only define atomic_dec_if_positive conditionally
  scsi: Fix use-after-free
  NFSv4.2: Fix a reference leak in nfs42_proc_layoutstats_generic
  NFSv4: Open state recovery must account for file permission changes
  NFSv4: nfs4_copy_delegation_stateid() must fail if the delegation is invalid
  NFSv4: Don't report revoked delegations as valid in nfs_have_delegation()
  sunrpc: fix write space race causing stalls
  Input: elantech - add Fujitsu Lifebook E556 to force crc_enabled
  Input: elantech - force needed quirks on Fujitsu H760
  Input: i8042 - skip selftest on ASUS laptops
  lib: add "on"/"off" support to kstrtobool
  lib: update single-char callers of strtobool()
  lib: move strtobool() to kstrtobool()
  MIPS: ptrace: Fix regs_return_value for kernel context
  MIPS: Fix -mabi=64 build of vdso.lds
  ALSA: hda - Fix a failure of micmute led when having multi adcs
  cx231xx: fix GPIOs for Pixelview SBTVD hybrid
  cx231xx: don't return error on success
  mb86a20s: fix demod settings
  mb86a20s: fix the locking logic
  ovl: copy_up_xattr(): use strnlen
  ovl: Fix info leak in ovl_lookup_temp()
  fbdev/efifb: Fix 16 color palette entry calculation
  scsi: zfcp: spin_lock_irqsave() is not nestable
  zfcp: trace full payload of all SAN records (req,resp,iels)
  zfcp: fix payload trace length for SAN request&response
  zfcp: fix D_ID field with actual value on tracing SAN responses
  zfcp: restore tracing of handle for port and LUN with HBA records
  zfcp: trace on request for open and close of WKA port
  zfcp: restore: Dont use 0 to indicate invalid LUN in rec trace
  zfcp: retain trace level for SCSI and HBA FSF response records
  zfcp: close window with unblocked rport during rport gone
  zfcp: fix ELS/GS request&response length for hardware data router
  zfcp: fix fc_host port_type with NPIV
  ubi: Deal with interrupted erasures in WL
  powerpc/pseries: Fix stack corruption in htpe code
  powerpc/64: Fix incorrect return value from __copy_tofrom_user
  powerpc/powernv: Use CPU-endian PEST in pnv_pci_dump_p7ioc_diag_data()
  powerpc/powernv: Use CPU-endian hub diag-data type in pnv_eeh_get_and_dump_hub_diag()
  powerpc/powernv: Pass CPU-endian PE number to opal_pci_eeh_freeze_clear()
  powerpc/vdso64: Use double word compare on pointers
  dm crypt: fix crash on exit
  dm mpath: check if path's request_queue is dying in activate_path()
  dm: return correct error code in dm_resume()'s retry loop
  dm: mark request_queue dead before destroying the DM device
  perf intel-pt: Fix MTC timestamp calculation for large MTC periods
  perf intel-pt: Fix estimated timestamps for cycle-accurate mode
  perf intel-pt: Fix snapshot overlap detection decoder errors
  pstore/ram: Use memcpy_fromio() to save old buffer
  pstore/ram: Use memcpy_toio instead of memcpy
  pstore/core: drop cmpxchg based updates
  pstore/ramoops: fixup driver removal
  parisc: Increase initial kernel mapping size
  parisc: Fix kernel memory layout regarding position of __gp
  parisc: Increase KERNEL_INITIAL_SIZE for 32-bit SMP kernels
  cpufreq: intel_pstate: Fix unsafe HWP MSR access
  platform: don't return 0 from platform_get_irq[_byname]() on error
  PCI: Mark Atheros AR9580 to avoid bus reset
  mmc: sdhci: cast unsigned int to unsigned long long to avoid unexpeted error
  mmc: block: don't use CMD23 with very old MMC cards
  rtlwifi: Fix missing country code for Great Britain
  PM / devfreq: event: remove duplicate devfreq_event_get_drvdata()
  clk: imx6: initialize GPU clocks
  regulator: tps65910: Work around silicon erratum SWCZ010
  mei: me: add kaby point device ids
  gpio: mpc8xxx: Correct irq handler function
  cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions
  UPSTREAM: cpu/hotplug: Handle unbalanced hotplug enable/disable
  UPSTREAM: arm64: kaslr: fix breakage with CONFIG_MODVERSIONS=y
  UPSTREAM: arm64: kaslr: keep modules close to the kernel when DYNAMIC_FTRACE=y
  cgroup: Remove leftover instances of allow_attach
  BACKPORT: lib: harden strncpy_from_user
  CHROMIUM: cgroups: relax permissions on moving tasks between cgroups
  CHROMIUM: remove Android's cgroup generic permissions checks
  Linux 4.4.27
  cfq: fix starvation of asynchronous writes
  vfs: move permission checking into notify_change() for utimes(NULL)
  dlm: free workqueues after the connections
  crypto: vmx - Fix memory corruption caused by p8_ghash
  crypto: ghash-generic - move common definitions to a new header file
  ext4: release bh in make_indexed_dir
  ext4: allow DAX writeback for hole punch
  ext4: fix memory leak in ext4_insert_range()
  ext4: reinforce check of i_dtime when clearing high fields of uid and gid
  ext4: enforce online defrag restriction for encrypted files
  scsi: ibmvfc: Fix I/O hang when port is not mapped
  scsi: arcmsr: Simplify user_len checking
  scsi: arcmsr: Buffer overflow in arcmsr_iop_message_xfer()
  async_pq_val: fix DMA memory leak
  reiserfs: switch to generic_{get,set,remove}xattr()
  reiserfs: Unlock superblock before calling reiserfs_quota_on_mount()
  ASoC: Intel: Atom: add a missing star in a memcpy call
  brcmfmac: fix memory leak in brcmf_fill_bss_param
  i40e: avoid NULL pointer dereference and recursive errors on early PCI error
  fuse: fix killing s[ug]id in setattr
  fuse: invalidate dir dentry after chmod
  fuse: listxattr: verify xattr list
  drivers: base: dma-mapping: page align the size when unmap_kernel_range
  btrfs: assign error values to the correct bio structs
  serial: 8250_dw: Check the data->pclk when get apb_pclk
  arm64: Use PoU cache instr for I/D coherency
  arm64: mm: add code to safely replace TTBR1_EL1
  arm64: mm: place __cpu_setup in .text
  arm64: add function to install the idmap
  arm64: unmap idmap earlier
  arm64: unify idmap removal
  arm64: mm: place empty_zero_page in bss
  arm64: head.S: use memset to clear BSS
  arm64: mm: specialise pagetable allocators
  arm64: mm: remove pointless PAGE_MASKing
  asm-generic: Fix local variable shadow in __set_fixmap_offset
  arm64: mm: fold alternatives into .init
  ARM: 8511/1: ARM64: kernel: PSCI: move PSCI idle management code to drivers/firmware
  ARM: 8481/2: drivers: psci: replace psci firmware calls
  ARM: 8480/2: arm64: add implementation for arm-smccc
  ARM: 8479/2: add implementation for arm-smccc
  ARM: 8478/2: arm/arm64: add arm-smccc
  ARM: 8510/1: rework ARM_CPU_SUSPEND dependencies
  ARM: 8458/1: bL_switcher: add GIC dependency
  Linux 4.4.26
  mm: remove gup_flags FOLL_WRITE games from __get_user_pages()
  x86/build: Build compressed x86 kernels as PIE
  arm64: Remove stack duplicating code from jprobes
  arm64: kprobes: Add KASAN instrumentation around stack accesses
  arm64: kprobes: Cleanup jprobe_return
  arm64: kprobes: Fix overflow when saving stack
  arm64: kprobes: WARN if attempting to step with PSTATE.D=1
  kprobes: Add arm64 case in kprobe example module
  arm64: Add kernel return probes support (kretprobes)
  arm64: Add trampoline code for kretprobes
  arm64: kprobes instruction simulation support
  arm64: Treat all entry code as non-kprobe-able
  arm64: Blacklist non-kprobe-able symbol
  arm64: Kprobes with single stepping support
  arm64: add conditional instruction simulation support
  arm64: Add more test functions to insn.c
  arm64: Add HAVE_REGS_AND_STACK_ACCESS_API feature
  Linux 4.4.25
  tpm_crb: fix crb_req_canceled behavior
  tpm: fix a race condition in tpm2_unseal_trusted()
  ima: use file_dentry()
  ARM: cpuidle: Fix error return code
  ARM: dts: MSM8064 remove flags from SPMI/MPP IRQs
  ARM: dts: mvebu: armada-390: add missing compatibility string and bracket
  x86/dumpstack: Fix x86_32 kernel_stack_pointer() previous stack access
  x86/irq: Prevent force migration of irqs which are not in the vector domain
  x86/boot: Fix kdump, cleanup aborted E820_PRAM max_pfn manipulation
  KVM: PPC: BookE: Fix a sanity check
  KVM: MIPS: Drop other CPU ASIDs on guest MMU changes
  KVM: PPC: Book3s PR: Allow access to unprivileged MMCR2 register
  mfd: wm8350-i2c: Make sure the i2c regmap functions are compiled
  mfd: 88pm80x: Double shifting bug in suspend/resume
  mfd: atmel-hlcdc: Do not sleep in atomic context
  mfd: rtsx_usb: Avoid setting ucr->current_sg.status
  ALSA: usb-line6: use the same declaration as definition in header for MIDI manufacturer ID
  ALSA: usb-audio: Extend DragonFly dB scale quirk to cover other variants
  ALSA: ali5451: Fix out-of-bound position reporting
  timekeeping: Fix __ktime_get_fast_ns() regression
  time: Add cycles to nanoseconds translation
  mm: Fix build for hardened usercopy
  ANDROID: binder: Clear binder and cookie when setting handle in flat binder struct
  ANDROID: binder: Add strong ref checks
  UPSTREAM: staging/android/ion : fix a race condition in the ion driver
  ANDROID: android-base: CONFIG_HARDENED_USERCOPY=y
  UPSTREAM: fs/proc/kcore.c: Add bounce buffer for ktext data
  UPSTREAM: fs/proc/kcore.c: Make bounce buffer global for read
  BACKPORT: arm64: Correctly bounds check virt_addr_valid
  Fix a build breakage in IO latency hist code.
  UPSTREAM: efi: include asm/early_ioremap.h not asm/efi.h to get early_memremap
  UPSTREAM: ia64: split off early_ioremap() declarations into asm/early_ioremap.h
  FROMLIST: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN
  FROMLIST: arm64: xen: Enable user access before a privcmd hvc call
  FROMLIST: arm64: Handle faults caused by inadvertent user access with PAN enabled
  FROMLIST: arm64: Disable TTBR0_EL1 during normal kernel execution
  FROMLIST: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1
  FROMLIST: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro
  FROMLIST: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros
  UPSTREAM: arm64: Handle el1 synchronous instruction aborts cleanly
  UPSTREAM: arm64: include alternative handling in dcache_by_line_op
  UPSTREAM: arm64: fix "dc cvau" cache operation on errata-affected core
  UPSTREAM: Revert "arm64: alternatives: add enable parameter to conditional asm macros"
  UPSTREAM: arm64: Add new asm macro copy_page
  UPSTREAM: arm64: kill ESR_LNX_EXEC
  UPSTREAM: arm64: add macro to extract ESR_ELx.EC
  UPSTREAM: arm64: mm: mark fault_info table const
  UPSTREAM: arm64: fix dump_instr when PAN and UAO are in use
  BACKPORT: arm64: Fold proc-macros.S into assembler.h
  UPSTREAM: arm64: choose memstart_addr based on minimum sparsemem section alignment
  UPSTREAM: arm64/mm: ensure memstart_addr remains sufficiently aligned
  UPSTREAM: arm64/kernel: fix incorrect EL0 check in inv_entry macro
  UPSTREAM: arm64: Add macros to read/write system registers
  UPSTREAM: arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM
  UPSTREAM: arm64/efi: split off EFI init and runtime code for reuse by 32-bit ARM
  UPSTREAM: arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP
  BACKPORT: arm64: only consider memblocks with NOMAP cleared for linear mapping
  UPSTREAM: mm/memblock: add MEMBLOCK_NOMAP attribute to memblock memory table
  ANDROID: dm: android-verity: Remove fec_header location constraint
  BACKPORT: audit: consistently record PIDs with task_tgid_nr()
  android-base.cfg: Enable kernel ASLR
  UPSTREAM: vmlinux.lds.h: allow arch specific handling of ro_after_init data section
  UPSTREAM: arm64: spinlock: fix spin_unlock_wait for LSE atomics
  UPSTREAM: arm64: avoid TLB conflict with CONFIG_RANDOMIZE_BASE
  UPSTREAM: arm64: Only select ARM64_MODULE_PLTS if MODULES=y
  sched: Add Kconfig option DEFAULT_USE_ENERGY_AWARE to set ENERGY_AWARE feature flag
  sched/fair: remove printk while schedule is in progress
  ANDROID: fs: FS tracepoints to track IO.
  sched/walt: Drop arch-specific timer access
  ANDROID: fiq_debugger: Pass task parameter to unwind_frame()
  eas/sched/fair: Fixing comments in find_best_target.
  input: keyreset: switch to orderly_reboot
  UPSTREAM: tun: fix transmit timestamp support
  UPSTREAM: arch/arm/include/asm/pgtable-3level.h: add pmd_mkclean for THP
  net: inet: diag: expose the socket mark to privileged processes.
  net: diag: make udp_diag_destroy work for mapped addresses.
  net: diag: support SOCK_DESTROY for UDP sockets
  net: diag: allow socket bytecode filters to match socket marks
  net: diag: slightly refactor the inet_diag_bc_audit error checks.
  net: diag: Add support to filter on device index
  UPSTREAM: brcmfmac: avoid potential stack overflow in brcmf_cfg80211_start_ap()
  Linux 4.4.24
  ALSA: hda - Add the top speaker pin config for HP Spectre x360
  ALSA: hda - Fix headset mic detection problem for several Dell laptops
  ACPICA: acpi_get_sleep_type_data: Reduce warnings
  ALSA: hda - Adding one more ALC255 pin definition for headset problem
  Revert "usbtmc: convert to devm_kzalloc"
  USB: serial: cp210x: Add ID for a Juniper console
  Staging: fbtft: Fix bug in fbtft-core
  usb: misc: legousbtower: Fix NULL pointer deference
  USB: serial: cp210x: fix hardware flow-control disable
  dm log writes: fix bug with too large bios
  clk: xgene: Add missing parenthesis when clearing divider value
  aio: mark AIO pseudo-fs noexec
  batman-adv: remove unused callback from batadv_algo_ops struct
  IB/mlx4: Use correct subnet-prefix in QP1 mads under SR-IOV
  IB/mlx4: Fix code indentation in QP1 MAD flow
  IB/mlx4: Fix incorrect MC join state bit-masking on SR-IOV
  IB/ipoib: Don't allow MC joins during light MC flush
  IB/core: Fix use after free in send_leave function
  IB/ipoib: Fix memory corruption in ipoib cm mode connect flow
  KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  dmaengine: at_xdmac: fix to pass correct device identity to free_irq()
  kernel/fork: fix CLONE_CHILD_CLEARTID regression in nscd
  ASoC: omap-mcpdm: Fix irq resource handling
  sysctl: handle error writing UINT_MAX to u32 fields
  powerpc/prom: Fix sub-processor option passed to ibm, client-architecture-support
  brcmsmac: Initialize power in brcms_c_stf_ss_algo_channel_get()
  brcmsmac: Free packet if dma_mapping_error() fails in dma_rxfill
  brcmfmac: Fix glob_skb leak in brcmf_sdiod_recv_chain
  ASoC: Intel: Skylake: Fix error return code in skl_probe()
  pNFS/flexfiles: Fix layoutcommit after a commit to DS
  pNFS/files: Fix layoutcommit after a commit to DS
  NFS: Don't drop CB requests with invalid principals
  svc: Avoid garbage replies when pc_func() returns rpc_drop_reply
  dmaengine: at_xdmac: fix debug string
  fnic: pci_dma_mapping_error() doesn't return an error code
  avr32: off by one in at32_init_pio()
  ath9k: Fix programming of minCCA power threshold
  gspca: avoid unused variable warnings
  em28xx-i2c: rt_mutex_trylock() returns zero on failure
  NFC: fdp: Detect errors from fdp_nci_create_conn()
  iwlmvm: mvm: set correct state in smart-fifo configuration
  tile: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO
  pstore: drop file opened reference count
  blk-mq: actually hook up defer list when running requests
  hwrng: omap - Fix assumption that runtime_get_sync will always succeed
  ARM: sa1111: fix pcmcia suspend/resume
  ARM: shmobile: fix regulator quirk for Gen2
  ARM: sa1100: clear reset status prior to reboot
  ARM: sa1100: fix 3.6864MHz clock
  ARM: sa1100: register clocks early
  ARM: sun5i: Fix typo in trip point temperature
  regulator: qcom_smd: Fix voltage ranges for pm8x41
  regulator: qcom_spmi: Update mvs1/mvs2 switches on pm8941
  regulator: qcom_spmi: Add support for get_mode/set_mode on switches
  regulator: qcom_spmi: Add support for S4 supply on pm8941
  tpm: fix byte-order for the value read by tpm2_get_tpm_pt
  printk: fix parsing of "brl=" option
  MIPS: uprobes: fix use of uninitialised variable
  MIPS: Malta: Fix IOCU disable switch read for MIPS64
  MIPS: fix uretprobe implementation
  MIPS: uprobes: remove incorrect set_orig_insn
  arm64: debug: avoid resetting stepping state machine when TIF_SINGLESTEP
  ARM: 8618/1: decompressor: reset ttbcr fields to use TTBR0 on ARMv7
  irqchip/gicv3: Silence noisy DEBUG_PER_CPU_MAPS warning
  gpio: sa1100: fix irq probing for ucb1x00
  usb: gadget: fsl_qe_udc: signedness bug in qe_get_frame()
  ceph: fix race during filling readdir cache
  iwlwifi: mvm: don't use ret when not initialised
  iwlwifi: pcie: fix access to scratch buffer
  spi: sh-msiof: Avoid invalid clock generator parameters
  hwmon: (adt7411) set bit 3 in CFG1 register
  nvmem: Declare nvmem_cell_read() consistently
  ipvs: fix bind to link-local mcast IPv6 address in backup
  tools/vm/slabinfo: fix an unintentional printf
  mmc: pxamci: fix potential oops
  drivers/perf: arm_pmu: Fix leak in error path
  pinctrl: Flag strict is a field in struct pinmux_ops
  pinctrl: uniphier: fix .pin_dbg_show() callback
  i40e: avoid null pointer dereference
  perf/core: Fix pmu::filter_match for SW-led groups
  iwlwifi: mvm: fix a few firmware capability checks
  usb: musb: fix DMA for host mode
  usb: musb: Fix DMA desired mode for Mentor DMA engine
  ARM: 8617/1: dma: fix dma_max_pfn()
  ARM: 8616/1: dt: Respect property size when parsing CPUs
  drm/radeon/si/dpm: add workaround for for Jet parts
  drm/nouveau/fifo/nv04: avoid ramht race against cookie insertion
  x86/boot: Initialize FPU and X86_FEATURE_ALWAYS even if we don't have CPUID
  x86/init: Fix cr4_init_shadow() on CR4-less machines
  can: dev: fix deadlock reported after bus-off
  mm,ksm: fix endless looping in allocating memory when ksm enable
  mtd: nand: davinci: Reinitialize the HW ECC engine in 4bit hwctl
  cpuset: handle race between CPU hotplug and cpuset_hotplug_work
  usercopy: fold builtin_const check into inline function
  Linux 4.4.23
  hostfs: Freeing an ERR_PTR in hostfs_fill_sb_common()
  qxl: check for kmap failures
  power: supply: max17042_battery: fix model download bug.
  power_supply: tps65217-charger: fix missing platform_set_drvdata()
  PM / hibernate: Fix rtree_next_node() to avoid walking off list ends
  PM / hibernate: Restore processor state before using per-CPU variables
  MIPS: paravirt: Fix undefined reference to smp_bootstrap
  MIPS: Add a missing ".set pop" in an early commit
  MIPS: Avoid a BUG warning during prctl(PR_SET_FP_MODE, ...)
  MIPS: Remove compact branch policy Kconfig entries
  MIPS: vDSO: Fix Malta EVA mapping to vDSO page structs
  MIPS: SMP: Fix possibility of deadlock when bringing CPUs online
  MIPS: Fix pre-r6 emulation FPU initialisation
  i2c: qup: skip qup_i2c_suspend if the device is already runtime suspended
  i2c-eg20t: fix race between i2c init and interrupt enable
  btrfs: ensure that file descriptor used with subvol ioctls is a dir
  nl80211: validate number of probe response CSA counters
  can: flexcan: fix resume function
  mm: delete unnecessary and unsafe init_tlb_ubc()
  tracing: Move mutex to protect against resetting of seq data
  fix memory leaks in tracing_buffers_splice_read()
  power: reset: hisi-reboot: Unmap region obtained by of_iomap
  mtd: pmcmsp-flash: Allocating too much in init_msp_flash()
  mtd: maps: sa1100-flash: potential NULL dereference
  fix fault_in_multipages_...() on architectures with no-op access_ok()
  fanotify: fix list corruption in fanotify_get_response()
  fsnotify: add a way to stop queueing events on group shutdown
  xfs: prevent dropping ioend completions during buftarg wait
  autofs: use dentry flags to block walks during expire
  autofs races
  pwm: Mark all devices as "might sleep"
  bridge: re-introduce 'fix parsing of MLDv2 reports'
  net: smc91x: fix SMC accesses
  Revert "phy: IRQ cannot be shared"
  net: dsa: bcm_sf2: Fix race condition while unmasking interrupts
  net/mlx5: Added missing check of msg length in verifying its signature
  tipc: fix NULL pointer dereference in shutdown()
  net/irda: handle iriap_register_lsap() allocation failure
  vti: flush x-netns xfrm cache when vti interface is removed
  af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  Revert "af_unix: Fix splice-bind deadlock"
  bonding: Fix bonding crash
  megaraid: fix null pointer check in megasas_detach_one().
  nouveau: fix nv40_perfctr_next() cleanup regression
  Staging: iio: adc: fix indent on break statement
  iwlegacy: avoid warning about missing braces
  ath9k: fix misleading indentation
  am437x-vfpe: fix typo in vpfe_get_app_input_index
  Add braces to avoid "ambiguous ‘else’" compiler warnings
  net: caif: fix misleading indentation
  Makefile: Mute warning for __builtin_return_address(>0) for tracing only
  Disable "frame-address" warning
  Disable "maybe-uninitialized" warning globally
  gcov: disable -Wmaybe-uninitialized warning
  Kbuild: disable 'maybe-uninitialized' warning for CONFIG_PROFILE_ALL_BRANCHES
  kbuild: forbid kernel directory to contain spaces and colons
  tools: Support relative directory path for 'O='
  Makefile: revert "Makefile: Document ability to make file.lst and file.S" partially
  kbuild: Do not run modules_install and install in paralel
  ocfs2: fix start offset to ocfs2_zero_range_for_truncate()
  ocfs2/dlm: fix race between convert and migration
  crypto: echainiv - Replace chaining with multiplication
  crypto: skcipher - Fix blkcipher walk OOM crash
  crypto: arm/aes-ctr - fix NULL dereference in tail processing
  crypto: arm64/aes-ctr - fix NULL dereference in tail processing
  tcp: properly scale window in tcp_v[46]_reqsk_send_ack()
  tcp: fix use after free in tcp_xmit_retransmit_queue()
  tcp: cwnd does not increase in TCP YeAH
  ipv6: release dst in ping_v6_sendmsg
  ipv4: panic in leaf_walk_rcu due to stale node pointer
  reiserfs: fix "new_insert_key may be used uninitialized ..."
  Fix build warning in kernel/cpuset.c
  include/linux/kernel.h: change abs() macro so it uses consistent return type
  Linux 4.4.22
  openrisc: fix the fix of copy_from_user()
  avr32: fix 'undefined reference to `___copy_from_user'
  ia64: copy_from_user() should zero the destination on access_ok() failure
  genirq/msi: Fix broken debug output
  ppc32: fix copy_from_user()
  sparc32: fix copy_from_user()
  mn10300: copy_from_user() should zero on access_ok() failure...
  nios2: copy_from_user() should zero the tail of destination
  openrisc: fix copy_from_user()
  parisc: fix copy_from_user()
  metag: copy_from_user() should zero the destination on access_ok() failure
  alpha: fix copy_from_user()
  asm-generic: make copy_from_user() zero the destination properly
  mips: copy_from_user() must zero the destination on access_ok() failure
  hexagon: fix strncpy_from_user() error return
  sh: fix copy_from_user()
  score: fix copy_from_user() and friends
  blackfin: fix copy_from_user()
  cris: buggered copy_from_user/copy_to_user/clear_user
  frv: fix clear_user()
  asm-generic: make get_user() clear the destination on errors
  ARC: uaccess: get_user to zero out dest in cause of fault
  s390: get_user() should zero on failure
  score: fix __get_user/get_user
  nios2: fix __get_user()
  sh64: failing __get_user() should zero
  m32r: fix __get_user()
  mn10300: failing __get_user() and get_user() should zero
  fix minor infoleak in get_user_ex()
  microblaze: fix copy_from_user()
  avr32: fix copy_from_user()
  microblaze: fix __get_user()
  fix iov_iter_fault_in_readable()
  irqchip/atmel-aic: Fix potential deadlock in ->xlate()
  genirq: Provide irq_gc_{lock_irqsave,unlock_irqrestore}() helpers
  drm: Only use compat ioctl for addfb2 on X86/IA64
  drm: atmel-hlcdc: Fix vertical scaling
  net: simplify napi_synchronize() to avoid warnings
  kconfig: tinyconfig: provide whole choice blocks to avoid warnings
  soc: qcom/spm: shut up uninitialized variable warning
  pinctrl: at91-pio4: use %pr format string for resource
  mmc: dw_mmc: use resource_size_t to store physical address
  drm/i915: Avoid pointer arithmetic in calculating plane surface offset
  mpssd: fix buffer overflow warning
  gma500: remove annoying deprecation warning
  ipv6: addrconf: fix dev refcont leak when DAD failed
  sched/core: Fix a race between try_to_wake_up() and a woken up task
  Revert "wext: Fix 32 bit iwpriv compatibility issue with 64 bit Kernel"
  ath9k: fix using sta->drv_priv before initializing it
  md-cluster: make md-cluster also can work when compiled into kernel
  xhci: fix null pointer dereference in stop command timeout function
  fuse: direct-io: don't dirty ITER_BVEC pages
  Btrfs: remove root_log_ctx from ctx list before btrfs_sync_log returns
  crypto: cryptd - initialize child shash_desc on import
  arm64: spinlocks: implement smp_mb__before_spinlock() as smp_mb()
  pinctrl: sunxi: fix uart1 CTS/RTS pins at PG on A23/A33
  pinctrl: pistachio: fix mfio pll_lock pinmux
  dm crypt: fix error with too large bios
  dm log writes: move IO accounting earlier to fix error path
  dm log writes: fix check of kthread_run() return value
  bus: arm-ccn: Fix XP watchpoint settings bitmask
  bus: arm-ccn: Do not attempt to configure XPs for cycle counter
  bus: arm-ccn: Fix PMU handling of MN
  ARM: dts: STiH407-family: Provide interconnect clock for consumption in ST SDHCI
  ARM: dts: overo: fix gpmc nand on boards with ethernet
  ARM: dts: overo: fix gpmc nand cs0 range
  ARM: dts: imx6qdl: Fix SPDIF regression
  ARM: OMAP3: hwmod data: Add sysc information for DSI
  ARM: kirkwood: ib62x0: fix size of u-boot environment partition
  ARM: imx6: add missing BM_CLPCR_BYPASS_PMIC_READY setting for imx6sx
  ARM: imx6: add missing BM_CLPCR_BYP_MMDC_CH0_LPM_HS setting for imx6ul
  ARM: AM43XX: hwmod: Fix RSTST register offset for pruss
  cpuset: make sure new tasks conform to the current config of the cpuset
  net: thunderx: Fix OOPs with ethtool --register-dump
  USB: change bInterval default to 10 ms
  ARM: dts: STiH410: Handle interconnect clock required by EHCI/OHCI (USB)
  usb: chipidea: udc: fix NULL ptr dereference in isr_setup_status_phase
  usb: renesas_usbhs: fix clearing the {BRDY,BEMP}STS condition
  USB: serial: simple: add support for another Infineon flashloader
  serial: 8250: added acces i/o products quad and octal serial cards
  serial: 8250_mid: fix divide error bug if baud rate is 0
  iio: ensure ret is initialized to zero before entering do loop
  iio:core: fix IIO_VAL_FRACTIONAL sign handling
  iio: accel: kxsd9: Fix scaling bug
  iio: fix pressure data output unit in hid-sensor-attributes
  iio: accel: bmc150: reset chip at init time
  iio: adc: at91: unbreak channel adc channel 3
  iio: ad799x: Fix buffered capture for ad7991/ad7995/ad7999
  iio: adc: ti_am335x_adc: Increase timeout value waiting for ADC sample
  iio: adc: ti_am335x_adc: Protect FIFO1 from concurrent access
  iio: adc: rockchip_saradc: reset saradc controller before programming it
  iio: proximity: as3935: set up buffer timestamps for non-zero values
  iio: accel: kxsd9: Fix raw read return
  kvm-arm: Unmap shadow pagetables properly
  x86/AMD: Apply erratum 665 on machines without a BIOS fix
  x86/paravirt: Do not trace _paravirt_ident_*() functions
  ARC: mm: fix build breakage with STRICT_MM_TYPECHECKS
  IB/uverbs: Fix race between uverbs_close and remove_one
  dm flakey: fix reads to be issued if drop_writes configured
  audit: fix exe_file access in audit_exe_compare
  mm: introduce get_task_exe_file
  kexec: fix double-free when failing to relocate the purgatory
  NFSv4.1: Fix the CREATE_SESSION slot number accounting
  pNFS: Ensure LAYOUTGET and LAYOUTRETURN are properly serialised
  nfsd: Close race between nfsd4_release_lockowner and nfsd4_lock
  NFSv4.x: Fix a refcount leak in nfs_callback_up_net
  pNFS: The client must not do I/O to the DS if it's lease has expired
  kernfs: don't depend on d_find_any_alias() when generating notifications
  powerpc/mm: Don't alias user region to other regions below PAGE_OFFSET
  powerpc/powernv : Drop reference added by kset_find_obj()
  powerpc/tm: do not use r13 for tabort_syscall
  tipc: move linearization of buffers to generic code
  lightnvm: put bio before return
  fscrypto: require write access to mount to set encryption policy
  Revert "KVM: x86: fix missed hardware breakpoints"
  MIPS: KVM: Check for pfn noslot case
  clocksource/drivers/sun4i: Clear interrupts after stopping timer in probe function
  fscrypto: add authorization check for setting encryption policy
  ext4: use __GFP_NOFAIL in ext4_free_blocks()

Conflicts:
	arch/arm/kernel/devtree.c
	arch/arm64/Kconfig
	arch/arm64/kernel/arm64ksyms.c
	arch/arm64/kernel/psci.c
	arch/arm64/mm/fault.c
	drivers/android/binder.c
	drivers/usb/host/xhci-hub.c
	fs/ext4/readpage.c
	include/linux/mmc/core.h
	include/linux/mmzone.h
	mm/memcontrol.c
	net/core/filter.c
	net/netlink/af_netlink.c
	net/netlink/af_netlink.h

Change-Id: I99fe7a0914e83e284b11b33185b71448a8999d1f
Signed-off-by: Runmin Wang <runminw@codeaurora.org>
Signed-off-by: Blagovest Kolenichev <bkolenichev@codeaurora.org>
2017-02-28 17:10:49 -08:00


/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>
#include "internal.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
*/
#define MAX_PAUSE max(HZ/5, 1)
/*
* Try to keep balance_dirty_pages() call intervals higher than this many pages
* by raising pause time to max_pause when it falls below it.
*/
#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
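/*
* With 4 KiB pages (PAGE_SHIFT == 12) DIRTY_POLL_THRESH evaluates to
* 128 >> 2 = 32 pages, i.e. 128 KiB; with 64 KiB pages it is
* 128 >> 6 = 2 pages, which is still 128 KiB. The threshold is thus a
* roughly constant amount of data rather than a constant page count.
*/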
/*
* Estimate write bandwidth at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
#define RATELIMIT_CALC_SHIFT 10
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
static long ratelimit_pages = 32;
/* The following parameters are exported via /proc/sys/vm */
/*
* Start background writeback (via writeback threads) at this percentage
*/
int dirty_background_ratio = 10;
/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
* dirty_background_ratio * the amount of dirtyable memory
*/
unsigned long dirty_background_bytes;
/*
* free highmem will not be subtracted from the total free memory
* for calculating free ratios if vm_highmem_is_dirtyable is true
*/
int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
int vm_dirty_ratio = 20;
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
* vm_dirty_ratio * the amount of dirtyable memory
*/
unsigned long vm_dirty_bytes;
/*
* The interval between `kupdate'-style writebacks
*/
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
EXPORT_SYMBOL_GPL(dirty_writeback_interval);
/*
* The longest time for which data is allowed to remain dirty
*/
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
*/
int block_dump;
/*
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
* a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
struct wb_domain global_wb_domain;
/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *dom;
struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
#endif
struct bdi_writeback *wb;
struct fprop_local_percpu *wb_completions;
unsigned long avail; /* dirtyable */
unsigned long dirty; /* file_dirty + write + nfs */
unsigned long thresh; /* dirty threshold */
unsigned long bg_thresh; /* dirty background threshold */
unsigned long wb_dirty; /* per-wb counterparts */
unsigned long wb_thresh;
unsigned long wb_bg_thresh;
unsigned long pos_ratio;
};
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
* reflect changes in current writeout rate.
*/
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
#ifdef CONFIG_CGROUP_WRITEBACK
#define GDTC_INIT(__wb) .wb = (__wb), \
.dom = &global_wb_domain, \
.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB .dom = &global_wb_domain
#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
.dom = mem_cgroup_wb_domain(__wb), \
.wb_completions = &(__wb)->memcg_completions, \
.gdtc = __gdtc
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
return dtc->dom;
}
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
return dtc->dom;
}
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
return mdtc->gdtc;
}
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
return &wb->memcg_completions;
}
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
unsigned long this_bw = wb->avg_write_bandwidth;
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
unsigned long long min = wb->bdi->min_ratio;
unsigned long long max = wb->bdi->max_ratio;
/*
* @wb may already be clean by the time control reaches here and
* the total may not include its bw.
*/
if (this_bw < tot_bw) {
if (min) {
min *= this_bw;
do_div(min, tot_bw);
}
if (max < 100) {
max *= this_bw;
do_div(max, tot_bw);
}
}
*minp = min;
*maxp = max;
}
#else /* CONFIG_CGROUP_WRITEBACK */
#define GDTC_INIT(__wb) .wb = (__wb), \
.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
return false;
}
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
return &global_wb_domain;
}
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
return NULL;
}
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
return NULL;
}
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
*minp = wb->bdi->min_ratio;
*maxp = wb->bdi->max_ratio;
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* In a memory zone, there is a certain amount of pages we consider
* available for the page cache, which is essentially the number of
* free and reclaimable pages, minus some zone reserves to protect
* lowmem and the ability to uphold the zone's watermarks without
* requiring writeback.
*
* This number of dirtyable pages is the base value of which the
* user-configurable dirty ratio is the effective number of pages that
* are allowed to be actually dirtied. Per individual zone, or
* globally by using the sum of dirtyable pages over all zones.
*
* Because the user is allowed to specify the dirty limit globally as
* absolute number of bytes, calculating the per-zone dirty limit can
* require translating the configured limit into a percentage of
* global dirtyable memory first.
*/
/**
* zone_dirtyable_memory - number of dirtyable pages in a zone
* @zone: the zone
*
* Returns the zone's number of pages potentially available for dirty
* page cache. This is the base value for the per-zone dirty limits.
*/
static unsigned long zone_dirtyable_memory(struct zone *zone)
{
unsigned long nr_pages;
nr_pages = zone_page_state(zone, NR_FREE_PAGES);
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
* clean pages in order to balance the zones.
*/
nr_pages -= min(nr_pages, zone->totalreserve_pages);
nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
return nr_pages;
}
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
x += zone_dirtyable_memory(z);
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
* without swap) can bring down the dirtyable pages below
* the zone's dirty balance reserve and the above calculation
* will underflow. However we still want to add in nodes
* which are below threshold (negative values) to get a more
* accurate calculation but make sure that the total never
* underflows.
*/
if ((long)x < 0)
x = 0;
/*
* Make sure that the number of highmem pages is never larger
* than the total dirtyable memory. This can only occur in very
* strange VM situations, but we want to guard against it anyway.
*/
return min(x, total);
#else
return 0;
#endif
}
/**
* global_dirtyable_memory - number of globally dirtyable pages
*
* Returns the global number of pages potentially available for dirty
* page cache. This is the base value for the global dirty limits.
*/
static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
x = global_page_state(NR_FREE_PAGES);
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
* clean pages in order to balance the zones.
*/
x -= min(x, totalreserve_pages);
x += global_page_state(NR_INACTIVE_FILE);
x += global_page_state(NR_ACTIVE_FILE);
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
return x + 1; /* Ensure that we never return 0 */
}
/**
* domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
* @dtc: dirty_throttle_control of interest
*
* Calculate @dtc->thresh and ->bg_thresh considering
* vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
* must ensure that @dtc->avail is set before calling this function. The
* dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
* real-time tasks.
*/
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
const unsigned long available_memory = dtc->avail;
struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
unsigned long bytes = vm_dirty_bytes;
unsigned long bg_bytes = dirty_background_bytes;
/* convert ratios to per-PAGE_SIZE for higher precision */
unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
unsigned long thresh;
unsigned long bg_thresh;
struct task_struct *tsk;
/* gdtc is !NULL iff @dtc is for memcg domain */
if (gdtc) {
unsigned long global_avail = gdtc->avail;
/*
* The byte settings can't be applied directly to memcg
* domains. Convert them to ratios by scaling against
* globally available memory. As the ratios are in
* per-PAGE_SIZE, they can be obtained by dividing bytes by
* number of pages.
*/
if (bytes)
ratio = min(DIV_ROUND_UP(bytes, global_avail),
PAGE_SIZE);
if (bg_bytes)
bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
PAGE_SIZE);
bytes = bg_bytes = 0;
}
if (bytes)
thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
thresh = (ratio * available_memory) / PAGE_SIZE;
if (bg_bytes)
bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
bg_thresh += bg_thresh / 4;
thresh += thresh / 4;
}
dtc->thresh = thresh;
dtc->bg_thresh = bg_thresh;
/* we should eventually report the domain in the TP */
if (!gdtc)
trace_global_dirty_state(bg_thresh, thresh);
}
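/*
* Worked example for the ratio conversion above, assuming PAGE_SIZE is
* 4096: vm_dirty_ratio = 20 becomes ratio = 20 * 4096 / 100 = 819, so
* thresh = 819 * available_memory / 4096, i.e. just under 20% of the
* dirtyable memory. Expressing ratios in units of 1/PAGE_SIZE rather
* than percent mainly matters for the memcg path above, where
* vm_dirty_bytes is converted into a ratio against globally available
* memory and a 1/100 granularity would be far too coarse.
*/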
/**
* global_dirty_limits - background-writeback and dirty-throttling thresholds
* @pbackground: out parameter for bg_thresh
* @pdirty: out parameter for thresh
*
* Calculate bg_thresh and thresh for global_wb_domain. See
* domain_dirty_limits() for details.
*/
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
gdtc.avail = global_dirtyable_memory();
domain_dirty_limits(&gdtc);
*pbackground = gdtc.bg_thresh;
*pdirty = gdtc.thresh;
}
/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
* Returns the maximum number of dirty pages allowed in a zone, based
* on the zone's dirtyable memory.
*/
static unsigned long zone_dirty_limit(struct zone *zone)
{
unsigned long zone_memory = zone_dirtyable_memory(zone);
struct task_struct *tsk = current;
unsigned long dirty;
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
zone_memory / global_dirtyable_memory();
else
dirty = vm_dirty_ratio * zone_memory / 100;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
dirty += dirty / 4;
return dirty;
}
/**
* zone_dirty_ok - tells whether a zone is within its dirty limits
* @zone: the zone to check
*
* Returns %true when the dirty pages in @zone are within the zone's
* dirty limit, %false if the limit is exceeded.
*/
bool zone_dirty_ok(struct zone *zone)
{
unsigned long limit = zone_dirty_limit(zone);
return zone_page_state(zone, NR_FILE_DIRTY) +
zone_page_state(zone, NR_UNSTABLE_NFS) +
zone_page_state(zone, NR_WRITEBACK) <= limit;
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_bytes = 0;
return ret;
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_ratio = 0;
return ret;
}
int dirty_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
writeback_set_ratelimit();
vm_dirty_bytes = 0;
}
return ret;
}
int dirty_bytes_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
return ret;
}
static unsigned long wp_next_time(unsigned long cur_time)
{
cur_time += VM_COMPLETIONS_PERIOD_LEN;
/* 0 has a special meaning... */
if (!cur_time)
return 1;
return cur_time;
}
static void wb_domain_writeout_inc(struct wb_domain *dom,
struct fprop_local_percpu *completions,
unsigned int max_prop_frac)
{
__fprop_inc_percpu_max(&dom->completions, completions,
max_prop_frac);
/* First event after period switching was turned off? */
if (!unlikely(dom->period_time)) {
/*
* We can race with other __bdi_writeout_inc calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
*/
dom->period_time = wp_next_time(jiffies);
mod_timer(&dom->period_timer, dom->period_time);
}
}
/*
* Increment @wb's writeout completion count and the global writeout
* completion count. Called from test_clear_page_writeback().
*/
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
struct wb_domain *cgdom;
__inc_wb_stat(wb, WB_WRITTEN);
wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
wb->bdi->max_prop_frac);
cgdom = mem_cgroup_wb_domain(wb);
if (cgdom)
wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
wb->bdi->max_prop_frac);
}
void wb_writeout_inc(struct bdi_writeback *wb)
{
unsigned long flags;
local_irq_save(flags);
__wb_writeout_inc(wb);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
/*
* On idle system, we can be called long after we scheduled because we use
* deferred timers so count with missed periods.
*/
static void writeout_period(unsigned long t)
{
struct wb_domain *dom = (void *)t;
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
if (fprop_new_period(&dom->completions, miss_periods + 1)) {
dom->period_time = wp_next_time(dom->period_time +
miss_periods * VM_COMPLETIONS_PERIOD_LEN);
mod_timer(&dom->period_timer, dom->period_time);
} else {
/*
* Aging has zeroed all fractions. Stop wasting CPU on period
* updates.
*/
dom->period_time = 0;
}
}
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
memset(dom, 0, sizeof(*dom));
spin_lock_init(&dom->lock);
init_timer_deferrable(&dom->period_timer);
dom->period_timer.function = writeout_period;
dom->period_timer.data = (unsigned long)dom;
dom->dirty_limit_tstamp = jiffies;
return fprop_global_init(&dom->completions, gfp);
}
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
del_timer_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
* exceed 100%.
*/
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
min_ratio -= bdi->min_ratio;
if (bdi_min_ratio + min_ratio < 100) {
bdi_min_ratio += min_ratio;
bdi->min_ratio += min_ratio;
} else {
ret = -EINVAL;
}
}
spin_unlock_bh(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
spin_lock_bh(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
}
spin_unlock_bh(&bdi_lock);
return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
return (thresh + bg_thresh) / 2;
}
static unsigned long hard_dirty_limit(struct wb_domain *dom,
unsigned long thresh)
{
return max(thresh, dom->dirty_limit);
}
/*
* Memory which can be further allocated to a memcg domain is capped by
* system-wide clean memory excluding the amount being used in the domain.
*/
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
unsigned long filepages, unsigned long headroom)
{
struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
unsigned long clean = filepages - min(filepages, mdtc->dirty);
unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
unsigned long other_clean = global_clean - min(global_clean, clean);
mdtc->avail = filepages + min(headroom, other_clean);
}
/**
* __wb_calc_thresh - @wb's share of dirty throttling threshold
* @dtc: dirty_throttle_context of interest
*
* Returns @wb's dirty limit in pages. The term "dirty" in the context of
* dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
*
* Note that balance_dirty_pages() will only seriously take it as a hard limit
* when sleeping max_pause per page is not enough to keep the dirty pages under
* control. For example, when the device is completely stalled due to some error
* conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
* In other, normal situations it acts more gently, throttling the tasks
* more (rather than completely blocking them) when the wb dirty pages go high.
*
* It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
* The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
struct wb_domain *dom = dtc_dom(dtc);
unsigned long thresh = dtc->thresh;
u64 wb_thresh;
long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
/*
* Calculate this BDI's share of the thresh ratio.
*/
fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
&numerator, &denominator);
wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
wb_thresh *= numerator;
do_div(wb_thresh, denominator);
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / 100;
if (wb_thresh > (thresh * wb_max_ratio) / 100)
wb_thresh = thresh * wb_max_ratio / 100;
return wb_thresh;
}
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
.thresh = thresh };
return __wb_calc_thresh(&gdtc);
}
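/*
* In effect __wb_calc_thresh() gives each wb a slice of the global dirty
* threshold proportional to its recent share of writeout completions
* (the fprop fraction above), then adds a guaranteed floor of
* thresh * min_ratio / 100 and caps the result at thresh * max_ratio / 100,
* as configured via bdi_set_min_ratio()/bdi_set_max_ratio(). A device
* that completes writeback quickly therefore earns a larger share of the
* allowed dirty pages over time.
*/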
/*
* f(dirty) := 1.0 + ((setpoint - dirty) / (limit - setpoint))^3
*
* it's a 3rd order polynomial that is subject to
*
* (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
* (2) f(setpoint) = 1.0 => the balance point
* (3) f(limit) = 0 => the hard limit
* (4) df/dx <= 0 => negative feedback control
* (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
* => fast response on large errors; small oscillation near setpoint
*/
static long long pos_ratio_polynom(unsigned long setpoint,
unsigned long dirty,
unsigned long limit)
{
long long pos_ratio;
long x;
x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
(limit - setpoint) | 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}
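/*
* Worked example: the callers below use setpoint = (freerun + limit) / 2,
* so setpoint - freerun == limit - setpoint. Then dirty == freerun gives
* x = 1 and f = 2.0, dirty == setpoint gives x = 0 and f = 1.0, and
* dirty == limit gives x = -1 and f = 0, matching properties (1)-(3)
* above. Halfway between setpoint and limit, x = -1/2 and
* f = 1 - 1/8 = 0.875, so throttling tightens slowly near the setpoint
* and sharply as dirty approaches the limit.
*/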
/*
* Dirty position control.
*
* (o) global/bdi setpoints
*
* We want the dirty pages be balanced around the global/wb setpoints.
* When the number of dirty pages is higher/lower than the setpoint, the
* dirty position control ratio (and hence task dirty ratelimit) will be
* decreased/increased to bring the dirty pages back to the setpoint.
*
* pos_ratio = 1 << RATELIMIT_CALC_SHIFT
*
* if (dirty < setpoint) scale up pos_ratio
* if (dirty > setpoint) scale down pos_ratio
*
* if (wb_dirty < wb_setpoint) scale up pos_ratio
* if (wb_dirty > wb_setpoint) scale down pos_ratio
*
* task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
*
* (o) global control line
*
* ^ pos_ratio
* |
* | |<===== global dirty control scope ======>|
* 2.0 .............*
* | .*
* | . *
* | . *
* | . *
* | . *
* | . *
* 1.0 ................................*
* | . . *
* | . . *
* | . . *
* | . . *
* | . . *
* 0 +------------.------------------.----------------------*------------->
* freerun^ setpoint^ limit^ dirty pages
*
* (o) wb control line
*
* ^ pos_ratio
* |
* | *
* | *
* | *
* | *
* | * |<=========== span ============>|
* 1.0 .......................*
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* 1/4 ...............................................* * * * * * * * * * * *
* | . .
* | . .
* | . .
* 0 +----------------------.-------------------------------.------------->
* wb_setpoint^ x_intercept^
*
* The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
* be smoothly throttled down to normal if it starts high in situations like
* - start writing to a slow SD card and a fast disk at the same time. The SD
* card's wb_dirty may rush to many times higher than wb_setpoint.
* - the wb dirty thresh drops quickly due to change of JBOD workload
*/
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = wb->avg_write_bandwidth;
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
unsigned long wb_setpoint;
unsigned long span;
long long pos_ratio; /* for scaling up/down the rate limit */
long x;
dtc->pos_ratio = 0;
if (unlikely(dtc->dirty >= limit))
return;
/*
* global setpoint
*
* See comment for pos_ratio_polynom().
*/
setpoint = (freerun + limit) / 2;
pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
/*
* The strictlimit feature is a tool preventing mistrusted filesystems
* from growing a large number of dirty pages before throttling. For
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
* 1% by default. Without strictlimit feature, fuse writeback may
* consume arbitrary amount of RAM because it is accounted in
* NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
* limits are set by default to 10% and 20% (background and throttle).
* Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
* wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
* about ~6K pages (as the average of background and throttle wb
* limits). The 3rd order polynomial will provide positive feedback if
* wb_dirty is under wb_setpoint and vice versa.
*
* Note, that we cannot use global counters in these calculations
* because we want to throttle process writing to a strictlimit wb
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
* in the example above).
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
if (dtc->wb_dirty < 8) {
dtc->pos_ratio = min_t(long long, pos_ratio * 2,
2 << RATELIMIT_CALC_SHIFT);
return;
}
if (dtc->wb_dirty >= wb_thresh)
return;
wb_setpoint = dirty_freerun_ceiling(wb_thresh,
dtc->wb_bg_thresh);
if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
return;
wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
wb_thresh);
/*
* Typically, for strictlimit case, wb_setpoint << setpoint
* and pos_ratio >> wb_pos_ratio. In other words, the global
* state ("dirty") is not the limiting factor and we have to
* make the decision based on wb counters. But there is an
* important case when global pos_ratio should get precedence:
* global limits are exceeded (e.g. due to activities on other
* wb's) while given strictlimit wb is below limit.
*
* "pos_ratio * wb_pos_ratio" would work for the case above,
* but it would look too non-natural for the case of all
* activity in the system coming from a single strictlimit wb
* with bdi->max_ratio == 100%.
*
* Note that min() below somewhat changes the dynamics of the
* control system. Normally, pos_ratio value can be well over 3
* (when globally we are at freerun and wb is well below wb
* setpoint). Now the maximum pos_ratio in the same situation
* is 2. We might want to tweak this if we observe the control
* system is too slow to adapt.
*/
dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
return;
}
/*
* We have computed basic pos_ratio above based on global situation. If
* the wb is over/under its share of dirty pages, we want to scale
* pos_ratio further down/up. That is done by the following mechanism.
*/
/*
* wb setpoint
*
* f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
*             = (x_intercept - wb_dirty) / (x_intercept - wb_setpoint)
*
* The main wb control line is a linear function that is subject to
*
* (1) f(wb_setpoint) = 1.0
* (2) k = - 1 / (8 * write_bw) (in single wb case)
* or equally: x_intercept = wb_setpoint + 8 * write_bw
*
* For single wb case, the dirty pages are observed to fluctuate
* regularly within range
* [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
* for various filesystems, where (2) can yield a reasonable 12.5%
* fluctuation range for pos_ratio.
*
* For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
* own size, so move the slope over accordingly and choose a slope that
* yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
*/
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
* It's very possible that wb_thresh is close to 0 not because the
* device is slow, but because it has remained inactive for a long time.
* Honour such devices with a reasonably good (hopefully IO efficient)
* threshold, so that occasional writes won't be blocked and active
* writes can ramp up the threshold quickly.
*/
wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
wb_setpoint = setpoint * (u64)x >> 16;
/*
* Use span=(8*write_bw) in single wb case as indicated by
* (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
*
* span = (wb_thresh / thresh) * (8 * write_bw) +
*        ((thresh - wb_thresh) / thresh) * wb_thresh
*/
span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
x_intercept = wb_setpoint + span;
if (dtc->wb_dirty < x_intercept - span / 4) {
pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
(x_intercept - wb_setpoint) | 1);
} else
pos_ratio /= 4;
/*
* wb reserve area, safeguard against dirty pool underrun and disk idle
* It may push the desired control point of global dirty pages higher
* than setpoint.
*/
x_intercept = wb_thresh / 2;
if (dtc->wb_dirty < x_intercept) {
if (dtc->wb_dirty > x_intercept / 8)
pos_ratio = div_u64(pos_ratio * x_intercept,
dtc->wb_dirty);
else
pos_ratio *= 8;
}
dtc->pos_ratio = pos_ratio;
}
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
unsigned long elapsed,
unsigned long written)
{
const unsigned long period = roundup_pow_of_two(3 * HZ);
unsigned long avg = wb->avg_write_bandwidth;
unsigned long old = wb->write_bandwidth;
u64 bw;
/*
* bw = written * HZ / elapsed
*
* write_bandwidth = (bw * elapsed +
*                    write_bandwidth * (period - elapsed)) / period
*
* @written may have decreased due to account_page_redirty().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
bw *= HZ;
if (unlikely(elapsed > period)) {
do_div(bw, elapsed);
avg = bw;
goto out;
}
bw += (u64)wb->write_bandwidth * (period - elapsed);
bw >>= ilog2(period);
/*
* one more level of smoothing, for filtering out sudden spikes
*/
if (avg > old && old >= (unsigned long)bw)
avg -= (avg - old) >> 3;
if (avg < old && old <= (unsigned long)bw)
avg += (old - avg) >> 3;
out:
/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
avg = max(avg, 1LU);
if (wb_has_dirty_io(wb)) {
long delta = avg - wb->avg_write_bandwidth;
WARN_ON_ONCE(atomic_long_add_return(delta,
&wb->bdi->tot_write_bandwidth) <= 0);
}
wb->write_bandwidth = bw;
wb->avg_write_bandwidth = avg;
}
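/*
* To summarize the above: bw is the bandwidth measured over the elapsed
* interval, blended with the previous estimate weighted by how much of
* the ~3 second period the interval did not cover, which makes
* wb->write_bandwidth a moving average over roughly the last 3 seconds.
* wb->avg_write_bandwidth trails it, moving only 1/8 of the gap per
* update and only when the newest sample confirms the direction of the
* move, which filters out short spikes.
*/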
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
struct wb_domain *dom = dtc_dom(dtc);
unsigned long thresh = dtc->thresh;
unsigned long limit = dom->dirty_limit;
/*
* Follow up in one step.
*/
if (limit < thresh) {
limit = thresh;
goto update;
}
/*
* Follow down slowly. Use the higher one as the target, because thresh
* may drop below dirty. This is exactly the reason to introduce
* dom->dirty_limit which is guaranteed to lie above the dirty pages.
*/
thresh = max(thresh, dtc->dirty);
if (limit > thresh) {
limit -= (limit - thresh) >> 5;
goto update;
}
return;
update:
dom->dirty_limit = limit;
}
static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
unsigned long now)
{
struct wb_domain *dom = dtc_dom(dtc);
/*
* check locklessly first to optimize away locking for the most time
*/
if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
return;
spin_lock(&dom->lock);
if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
update_dirty_limit(dtc);
dom->dirty_limit_tstamp = now;
}
spin_unlock(&dom->lock);
}
/*
* Maintain wb->dirty_ratelimit, the base dirty throttle rate.
*
* Normal wb tasks will be curbed at or below it in long term.
* Obviously it should be around (write_bw / N) when there are N dd tasks.
*/
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
unsigned long dirtied,
unsigned long elapsed)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long dirty = dtc->dirty;
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long setpoint = (freerun + limit) / 2;
unsigned long write_bw = wb->avg_write_bandwidth;
unsigned long dirty_ratelimit = wb->dirty_ratelimit;
unsigned long dirty_rate;
unsigned long task_ratelimit;
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
/*
* The dirty rate will match the writeout rate in long term, except
* when dirty pages are truncated by userspace or re-dirtied by FS.
*/
dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
/*
* task_ratelimit reflects each dd's dirty rate for the past 200ms.
*/
task_ratelimit = (u64)dirty_ratelimit *
dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
/*
* A linear estimation of the "balanced" throttle rate. The theory is,
* if there are N dd tasks, each throttled at task_ratelimit, the wb's
* dirty_rate will be measured to be (N * task_ratelimit). So the below
* formula will yield the balanced rate limit (write_bw / N).
*
* Note that the expanded form is not a pure rate feedback:
* rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
* but also takes pos_ratio into account:
* rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
*
* (1) is not realistic because pos_ratio also takes part in balancing
* the dirty rate. Consider the state
* pos_ratio = 0.5 (3)
* rate = 2 * (write_bw / N) (4)
* If (1) is used, it will get stuck in that state! Because each dd will
* be throttled at
* task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
* yielding
* dirty_rate = N * task_ratelimit = write_bw (6)
* put (6) into (1) we get
* rate_(i+1) = rate_(i) (7)
*
* So we end up using (2) to always keep
* rate_(i+1) ~= (write_bw / N) (8)
* regardless of the value of pos_ratio. As long as (8) is satisfied,
* pos_ratio is able to drive itself to 1.0, which is not only where
* the dirty count meets the setpoint, but also where the slope of
* pos_ratio is most flat and hence task_ratelimit is least fluctuated.
*/
balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
dirty_rate | 1);
/*
* balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
*/
if (unlikely(balanced_dirty_ratelimit > write_bw))
balanced_dirty_ratelimit = write_bw;
/*
* We could safely do this and return immediately:
*
* wb->dirty_ratelimit = balanced_dirty_ratelimit;
*
* However to get a more stable dirty_ratelimit, the below elaborated
* code makes use of task_ratelimit to filter out singular points and
* limit the step size.
*
* The below code essentially only uses the relative value of
*
* task_ratelimit - dirty_ratelimit
* = (pos_ratio - 1) * dirty_ratelimit
*
* which reflects the direction and size of dirty position error.
*/
/*
* dirty_ratelimit will follow balanced_dirty_ratelimit iff
* task_ratelimit is on the same side of dirty_ratelimit, too.
* For example, when
* - dirty_ratelimit > balanced_dirty_ratelimit
* - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
* lowering dirty_ratelimit will help meet both the position and rate
* control targets. Otherwise, don't update dirty_ratelimit if it will
* only help meet the rate target. After all, what the users ultimately
* feel and care are stable dirty rate and small position error.
*
* |task_ratelimit - dirty_ratelimit| is used to limit the step size
* and filter out the singular points of balanced_dirty_ratelimit, which
* keeps jumping around randomly and can even leap far away at times
* due to the small 200ms estimation period of dirty_rate (we want to
* keep that period small to reduce time lags).
*/
step = 0;
/*
* For strictlimit case, calculations above were based on wb counters
* and limits (starting from pos_ratio = wb_position_ratio() and up to
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
*
* We rampup dirty_ratelimit forcibly if wb_dirty is low because
* it's possible that wb_thresh is close to zero due to inactivity
* of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
if (dtc->wb_dirty < 8)
setpoint = dtc->wb_dirty + 1;
else
setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
x = min3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
x = max3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
}
/*
* Don't pursue 100% rate matching. It's impossible since the balanced
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
step >>= dirty_ratelimit / (2 * step + 1);
/*
* Limit the tracking speed to avoid overshooting.
*/
step = (step + 7) / 8;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
else
dirty_ratelimit -= step;
wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
struct dirty_throttle_control *mdtc,
unsigned long start_time,
bool update_ratelimit)
{
struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
unsigned long elapsed = now - wb->bw_time_stamp;
unsigned long dirtied;
unsigned long written;
lockdep_assert_held(&wb->list_lock);
/*
* rate-limit, only update once every 200ms.
*/
if (elapsed < BANDWIDTH_INTERVAL)
return;
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
/*
* Skip quiet periods when disk bandwidth is under-utilized.
* (at least 1s idle time between two flusher runs)
*/
if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
goto snapshot;
if (update_ratelimit) {
domain_update_bandwidth(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
/*
* @mdtc is always NULL if !CGROUP_WRITEBACK but the
* compiler has no way to figure that out. Help it.
*/
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
domain_update_bandwidth(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
wb_update_write_bandwidth(wb, elapsed, written);
snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
wb->bw_time_stamp = now;
}
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
__wb_update_bandwidth(&gdtc, NULL, start_time, false);
}
/*
* After a task dirtied this many pages, balance_dirty_pages_ratelimited()
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
* global_page_state() too often. So scale it near-sqrt to the safety margin
* (the number of pages we may dirty without exceeding the dirty limits).
*/
static unsigned long dirty_poll_interval(unsigned long dirty,
unsigned long thresh)
{
if (thresh > dirty)
return 1UL << (ilog2(thresh - dirty) >> 1);
return 1;
}
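/*
* The value above is roughly sqrt(thresh - dirty): ilog2() gives the
* (truncated) base-2 log of the safety margin, halving it and shifting
* back yields about its square root rounded down to a power of two.
* For example, a margin of 1,048,576 pages gives 1 << (20 >> 1) = 1024
* pages between successive balance_dirty_pages() calls, while a margin
* of 4,096 pages gives 1 << 6 = 64 pages.
*/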
static unsigned long wb_max_pause(struct bdi_writeback *wb,
unsigned long wb_dirty)
{
unsigned long bw = wb->avg_write_bandwidth;
unsigned long t;
/*
* Limit pause time for small memory systems. If we sleep for too long,
* a small pool of dirty/writeback pages may go empty and the disk go
* idle.
*
* 8 serves as the safety ratio.
*/
t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
return min_t(unsigned long, t, MAX_PAUSE);
}
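/*
* Put differently: the cap is chosen so that, at the wb's average write
* bandwidth, only on the order of 1/8 of wb_dirty could be written back
* during a single pause, keeping the dirty/writeback pool from draining
* and the disk from going idle while throttled tasks sleep. The result
* is additionally capped at MAX_PAUSE (200ms).
*/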
static long wb_min_pause(struct bdi_writeback *wb,
long max_pause,
unsigned long task_ratelimit,
unsigned long dirty_ratelimit,
int *nr_dirtied_pause)
{
long hi = ilog2(wb->avg_write_bandwidth);
long lo = ilog2(wb->dirty_ratelimit);
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
/* target for 10ms pause on 1-dd case */
t = max(1, HZ / 100);
/*
* Scale up pause time for concurrent dirtiers in order to reduce CPU
* overheads.
*
* (N * 10ms) on 2^N concurrent tasks.
*/
if (hi > lo)
t += (hi - lo) * (10 * HZ) / 1024;
/*
* This is a bit convoluted. We try to base the next nr_dirtied_pause
* on the much more stable dirty_ratelimit. However the next pause time
* will be computed based on task_ratelimit and the two rate limits may
* depart considerably at some time. Especially if task_ratelimit goes
* below dirty_ratelimit/2 and the target pause is max_pause, the next
* pause time will be max_pause*2 _trimmed down_ to max_pause. As a
* result task_ratelimit won't be executed faithfully, which could
* eventually bring down dirty_ratelimit.
*
* We apply two rules to fix it up:
* 1) try to estimate the next pause time and if necessary, use a lower
* nr_dirtied_pause so as not to exceed max_pause. When this happens,
* nr_dirtied_pause will be "dancing" with task_ratelimit.
* 2) limit the target pause time to max_pause/2, so that the normal
* small fluctuations of task_ratelimit won't trigger rule (1) and
* nr_dirtied_pause will remain as stable as dirty_ratelimit.
*/
t = min(t, 1 + max_pause / 2);
pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
/*
* Tiny nr_dirtied_pause is found to hurt I/O performance in the test
* case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
* When the 16 consecutive reads are often interrupted by some dirty
* throttling pause during the async writes, cfq will go idle
* (deadline is fine). So push nr_dirtied_pause as high as possible
* until it reaches DIRTY_POLL_THRESH=32 pages.
*/
if (pages < DIRTY_POLL_THRESH) {
t = max_pause;
pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
if (pages > DIRTY_POLL_THRESH) {
pages = DIRTY_POLL_THRESH;
t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
}
}
pause = HZ * pages / (task_ratelimit + 1);
if (pause > max_pause) {
t = max_pause;
pages = task_ratelimit * t / roundup_pow_of_two(HZ);
}
*nr_dirtied_pause = pages;
/*
* The minimal pause time will normally be half the target pause time.
*/
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long wb_reclaimable;
/*
* wb_thresh is not treated as a hard limiting factor in the way
* dirty_thresh is, for the following reasons:
* - in JBOD setup, wb_thresh can fluctuate a lot
* - in a system with HDD and USB key, the USB key may somehow
* go into state (wb_dirty >> wb_thresh) either because
* wb_dirty starts high, or because wb_thresh drops low.
* In this case we don't want to hard throttle the USB key
* dirtiers for 100 seconds until wb_dirty drops under
* wb_thresh. Instead the auxiliary wb control line in
* wb_position_ratio() will let the dirtier task progress
* at some rate <= (write_bw / 2) for bringing down wb_dirty.
*/
dtc->wb_thresh = __wb_calc_thresh(dtc);
dtc->wb_bg_thresh = dtc->thresh ?
div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
* to ensure we accurately count the 'dirty' pages when
* the threshold is low.
*
* Otherwise it would be possible to get thresh+n pages
* reported dirty, even though there are thresh-m pages
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
} else {
wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
}
}
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
* the caller to wait once it crosses (background_thresh + dirty_thresh) / 2.
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
static void balance_dirty_pages(struct address_space *mapping,
struct bdi_writeback *wb,
unsigned long pages_dirtied)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
long period;
long pause;
long max_pause;
long min_pause;
int nr_dirtied_pause;
bool dirty_exceeded = false;
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
unsigned long now = jiffies;
unsigned long dirty, thresh, bg_thresh;
unsigned long m_dirty = 0; /* stop bogus uninit warnings */
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
/*
* Unstable writes are a feature of certain networked
* filesystems (e.g. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
domain_dirty_limits(gdtc);
if (unlikely(strictlimit)) {
wb_dirty_limits(gdtc);
dirty = gdtc->wb_dirty;
thresh = gdtc->wb_thresh;
bg_thresh = gdtc->wb_bg_thresh;
} else {
dirty = gdtc->dirty;
thresh = gdtc->thresh;
bg_thresh = gdtc->bg_thresh;
}
if (mdtc) {
unsigned long filepages, headroom, writeback;
/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
mem_cgroup_wb_stats(wb, &filepages, &headroom,
&mdtc->dirty, &writeback);
mdtc->dirty += writeback;
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc);
if (unlikely(strictlimit)) {
wb_dirty_limits(mdtc);
m_dirty = mdtc->wb_dirty;
m_thresh = mdtc->wb_thresh;
m_bg_thresh = mdtc->wb_bg_thresh;
} else {
m_dirty = mdtc->dirty;
m_thresh = mdtc->thresh;
m_bg_thresh = mdtc->bg_thresh;
}
}
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
* when the wb limits are ramping up in case of !strictlimit.
*
* In strictlimit case make decision based on the wb counters
* and limits. Small writeouts when the wb limits are ramping
* up are the price we consciously pay for strictlimit-ing.
*
* If memcg domain is in effect, @dirty should be under
* both global and memcg freerun ceilings.
*/
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
(!mdtc ||
m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
unsigned long intv = dirty_poll_interval(dirty, thresh);
unsigned long m_intv = ULONG_MAX;
current->dirty_paused_when = now;
current->nr_dirtied = 0;
if (mdtc)
m_intv = dirty_poll_interval(m_dirty, m_thresh);
current->nr_dirtied_pause = min(intv, m_intv);
break;
}
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
*/
if (!strictlimit)
wb_dirty_limits(gdtc);
dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
((gdtc->dirty > gdtc->thresh) || strictlimit);
wb_position_ratio(gdtc);
sdtc = gdtc;
if (mdtc) {
/*
* If memcg domain is in effect, calculate its
* pos_ratio. @wb should satisfy constraints from
* both global and memcg domains. Choose the one
* w/ lower pos_ratio.
*/
if (!strictlimit)
wb_dirty_limits(mdtc);
dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
((mdtc->dirty > mdtc->thresh) || strictlimit);
wb_position_ratio(mdtc);
if (mdtc->pos_ratio < gdtc->pos_ratio)
sdtc = mdtc;
}
if (dirty_exceeded && !wb->dirty_exceeded)
wb->dirty_exceeded = 1;
if (time_is_before_jiffies(wb->bw_time_stamp +
BANDWIDTH_INTERVAL)) {
spin_lock(&wb->list_lock);
__wb_update_bandwidth(gdtc, mdtc, start_time, true);
spin_unlock(&wb->list_lock);
}
/* throttle according to the chosen dtc */
dirty_ratelimit = wb->dirty_ratelimit;
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
min_pause = wb_min_pause(wb, max_pause,
task_ratelimit, dirty_ratelimit,
&nr_dirtied_pause);
if (unlikely(task_ratelimit == 0)) {
period = max_pause;
pause = max_pause;
goto pause;
}
period = HZ * pages_dirtied / task_ratelimit;
pause = period;
if (current->dirty_paused_when)
pause -= now - current->dirty_paused_when;
/*
* For less than 1s think time (ext3/4 may block the dirtier
* for up to 800ms from time to time on 1-HDD; so does xfs,
* however at a much lower frequency), try to compensate for it in
* future periods by updating the virtual time; otherwise just
* do a reset, as it may be a light dirtier.
*/
if (pause < min_pause) {
trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
sdtc->wb_thresh,
sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
min(pause, 0L),
start_time);
if (pause < -HZ) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
} else if (period) {
current->dirty_paused_when += period;
current->nr_dirtied = 0;
} else if (current->nr_dirtied_pause <= pages_dirtied)
current->nr_dirtied_pause += pages_dirtied;
break;
}
if (unlikely(pause > max_pause)) {
/* for occasional dropped task_ratelimit */
now += min(pause - max_pause, max_pause);
pause = max_pause;
}
pause:
trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
sdtc->wb_thresh,
sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
pause,
start_time);
__set_current_state(TASK_KILLABLE);
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
current->nr_dirtied = 0;
current->nr_dirtied_pause = nr_dirtied_pause;
/*
* This is typically equal to (dirty < thresh) and can also
* keep "1000+ dd on a slow USB stick" under control.
*/
if (task_ratelimit)
break;
/*
* In the case of an unresponsive NFS server whose NFS dirty
* pages exceed dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
* In theory 1 page is enough to keep the consumer-producer
* pipe going: the flusher cleans 1 page => the task dirties 1
* more page. However wb_dirty has accounting errors. So use
* the larger and more IO friendly wb_stat_error.
*/
if (sdtc->wb_dirty <= wb_stat_error(wb))
break;
if (fatal_signal_pending(current))
break;
}
if (!dirty_exceeded && wb->dirty_exceeded)
wb->dirty_exceeded = 0;
if (writeback_in_progress(wb))
return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (laptop_mode)
return;
if (nr_reclaimable > gdtc->bg_thresh)
wb_start_background_writeback(wb);
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
/*
* Normal tasks are throttled by
* loop {
* dirty tsk->nr_dirtied_pause pages;
* take a snap in balance_dirty_pages();
* }
* However there is a worst case. If every task exits immediately after dirtying
* (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
* called to throttle the page dirties. The solution is to save the not yet
* throttled page dirties in dirty_throttle_leaks on task exit and charge them
* randomly into the running tasks. This works well for the above worst case,
* as the new task will pick up and accumulate the old task's leaked dirty
* count and eventually get throttled.
*/
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
* balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* On really big machines, get_writeback_state is expensive, so try to avoid
* calling it too often (ratelimiting). But once we're over the dirty memory
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
int ratelimit;
int *p;
if (!bdi_cap_account_dirty(bdi))
return;
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
if (!wb)
wb = &bdi->wb;
ratelimit = current->nr_dirtied_pause;
if (wb->dirty_exceeded)
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
preempt_disable();
/*
* This prevents one CPU from accumulating too many dirtied pages without
* calling into balance_dirty_pages(), which can happen when there are
* 1000+ tasks that all start dirtying pages at exactly the same
* time and hence all honour a too-large initial task->nr_dirtied_pause.
*/
p = this_cpu_ptr(&bdp_ratelimits);
if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
else if (unlikely(*p >= ratelimit_pages)) {
*p = 0;
ratelimit = 0;
}
/*
* Pick up the dirtied pages left behind by exited tasks. This avoids lots of
* short-lived tasks (eg. gcc invocations in a kernel build) escaping
* the dirty throttling and livelocking other long-running dirtiers.
*/
p = this_cpu_ptr(&dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
current->nr_dirtied += nr_pages_dirtied;
}
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(mapping, wb, current->nr_dirtied);
wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
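/*
* For example, generic_perform_write() calls the function above once for
* each page it copies and marks dirty, so a heavy writer ends up in
* balance_dirty_pages() roughly every nr_dirtied_pause pages rather than
* on every single page.
*/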
/**
* wb_over_bg_thresh - does @wb need to be written back?
* @wb: bdi_writeback of interest
*
* Determines whether background writeback should keep writing @wb or it's
* clean enough. Returns %true if writeback should continue.
*/
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
/*
* Similar to balance_dirty_pages() but ignores pages being written
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
domain_dirty_limits(gdtc);
if (gdtc->dirty > gdtc->bg_thresh)
return true;
if (wb_stat(wb, WB_RECLAIMABLE) >
wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
return true;
if (mdtc) {
unsigned long filepages, headroom, writeback;
mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
&writeback);
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc); /* ditto, ignore writeback */
if (mdtc->dirty > mdtc->bg_thresh)
return true;
if (wb_stat(wb, WB_RECLAIMABLE) >
wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
return true;
}
return false;
}
void throttle_vm_writeout(gfp_t gfp_mask)
{
unsigned long background_thresh;
unsigned long dirty_thresh;
for ( ; ; ) {
global_dirty_limits(&background_thresh, &dirty_thresh);
dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
* allocators so they don't get DoS'ed by heavy writers
*/
dirty_thresh += dirty_thresh / 10; /* wheeee... */
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
/* Try safe version */
else if (unlikely(global_page_state_snapshot(NR_UNSTABLE_NFS) +
global_page_state_snapshot(NR_WRITEBACK) <=
dirty_thresh))
break;
congestion_wait(BLK_RW_ASYNC, HZ/10);
/*
* The caller might hold locks which can prevent IO completion
* or progress in the filesystem. So we cannot just sit here
* waiting for IO to complete.
*/
if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
break;
}
}
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
return 0;
}
#ifdef CONFIG_BLOCK
void laptop_mode_timer_fn(unsigned long data)
{
struct request_queue *q = (struct request_queue *)data;
int nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
struct bdi_writeback *wb;
/*
* We want to write everything out, not just down to the dirty
* threshold
*/
if (!bdi_has_dirty_io(&q->backing_dev_info))
return;
rcu_read_lock();
list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
if (wb_has_dirty_io(wb))
wb_start_writeback(wb, nr_pages, true,
WB_REASON_LAPTOP_TIMER);
rcu_read_unlock();
}
/*
* We've spun up the disk and we're in laptop mode: schedule writeback
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
void laptop_io_completion(struct backing_dev_info *info)
{
mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
* We're in laptop mode and we've just synced. The sync's writes will have
* caused another writeback to be scheduled by laptop_io_completion.
* Nothing needs to be written back anymore, so we unschedule the writeback.
*/
void laptop_sync_completion(void)
{
struct backing_dev_info *bdi;
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
del_timer(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
* If it is too low then SMP machines will call the (expensive)
* get_writeback_state too often.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
* thresholds.
*/
void writeback_set_ratelimit(void)
{
struct wb_domain *dom = &global_wb_domain;
unsigned long background_thresh;
unsigned long dirty_thresh;
global_dirty_limits(&background_thresh, &dirty_thresh);
dom->dirty_limit = dirty_thresh;
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
}
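/*
* ratelimit_pages scales with the number of online CPUs, so it must be
* recomputed whenever a CPU comes online or goes offline.
*/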
static int
ratelimit_handler(struct notifier_block *self, unsigned long action,
void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
case CPU_DEAD:
writeback_set_ratelimit();
return NOTIFY_OK;
default:
return NOTIFY_DONE;
}
}
static struct notifier_block ratelimit_nb = {
.notifier_call = ratelimit_handler,
.next = NULL,
};
/*
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
* related to pages that could be allocated for buffers (by
* comparing nr_free_buffer_pages() to vm_total_pages).
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
* is now applied to total non-HIGHMEM memory (by subtracting
* totalhigh_pages from vm_total_pages), and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
*
* But we might still want to scale the dirty_ratio by how
* much memory the box has..
*/
void __init page_writeback_init(void)
{
BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
/**
* tag_pages_for_writeback - tag pages to be written by write_cache_pages
* @mapping: address space structure to write
* @start: starting page index
* @end: ending page index (inclusive)
*
* This function scans the page range from @start to @end (inclusive) and tags
* all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
* that write_cache_pages (or whoever calls this function) will then use
* TOWRITE tag to identify pages eligible for writeback. This mechanism is
* used to avoid livelocking of writeback by a process steadily creating new
* dirty pages in the file (thus it is important for this function to be quick
* so that it can tag pages faster than a dirtying process can create them).
*/
/*
* We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
*/
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
unsigned long tagged;
do {
spin_lock_irq(&mapping->tree_lock);
tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
&start, end, WRITEBACK_TAG_BATCH,
PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
spin_unlock_irq(&mapping->tree_lock);
WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
cond_resched();
/* We check 'start' to handle wrapping when end == ~0UL */
} while (tagged >= WRITEBACK_TAG_BATCH && start);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @writepage: function called for each page
* @data: data passed to writepage function
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
*
* To avoid livelocks (when other process dirties new pages), we first tag
* pages which should be written back with TOWRITE tag and only then start
* writing them. For data-integrity sync we have to be careful so that we do
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data)
{
int ret = 0;
int done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int cycled;
int range_whole = 0;
int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
if (index == 0)
cycled = 1;
else
cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
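/*
* For data-integrity sync (or when the caller requested tagged_writepages)
* walk the TOWRITE tag that tag_pages_for_writeback() sets below;
* otherwise walk the DIRTY tag directly.
*/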
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
int i;
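/*
* Grab the next batch of pages carrying the chosen tag; 'index' is
* advanced by the lookup to just past the last page returned.
*/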
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
/*
* At this point, the page may be truncated or
* invalidated (changing page->mapping to NULL), or
* even swizzled back from swapper_space to tmpfs file
* mapping. However, page->index will not change
* because we have a reference on the page.
*/
if (page->index > end) {
/*
* can't be range_cyclic (1st pass) because
* end == -1 in that case.
*/
done = 1;
break;
}
done_index = page->index;
lock_page(page);
/*
* Page truncated or invalidated. We can freely skip it
* then, even for data integrity operations: the page
* has disappeared concurrently, so there could be no
* real expectation of this data integrity operation
* even if there is now a new, dirty page at the same
* pagecache address.
*/
if (unlikely(page->mapping != mapping)) {
continue_unlock:
unlock_page(page);
continue;
}
if (!PageDirty(page)) {
/* someone wrote it for us */
goto continue_unlock;
}
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);
else
goto continue_unlock;
}
BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
ret = 0;
} else {
/*
* done_index is set past this page,
* so media errors will not choke
* background writeout for the entire
* file. This has consequences for
* range_cyclic semantics (ie. it may
* not be suitable for data integrity
* writeout).
*/
done_index = page->index + 1;
done = 1;
break;
}
}
/*
* We stop writing back only if we are not doing
* integrity sync. In case of integrity sync we have to
* keep going until we have written all the pages
* we tagged for writeback prior to entering this loop.
*/
if (--wbc->nr_to_write <= 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
}
pagevec_release(&pvec);
cond_resched();
}
if (!cycled && !done) {
/*
* range_cyclic:
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
cycled = 1;
index = 0;
end = writeback_index - 1;
goto retry;
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
/*
* Function used by generic_writepages to call the real writepage
* function and set the mapping flags on error
*/
static int __writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
struct address_space *mapping = data;
int ret = mapping->a_ops->writepage(page, wbc);
mapping_set_error(mapping, ret);
return ret;
}
/**
* generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
*
* This is a library function, which implements the writepages()
* address_space_operation.
*/
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct blk_plug plug;
int ret;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __writepage, mapping);
blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(generic_writepages);
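/*
* Illustrative only: a filesystem with no writepages strategy of its own
* can plug generic_writepages() straight into its address_space_operations
* (the names foo_aops/foo_writepage below are hypothetical):
*
*	static const struct address_space_operations foo_aops = {
*		.writepage	= foo_writepage,
*		.writepages	= generic_writepages,
*	};
*
* Leaving ->writepages NULL has the same effect, because do_writepages()
* below falls back to generic_writepages() in that case.
*/
/*
* do_writepages - start writeback against @mapping as described by @wbc,
* using the filesystem's ->writepages() method when one is provided and
* falling back to generic_writepages() otherwise.
*/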
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
if (wbc->nr_to_write <= 0)
return 0;
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
return ret;
}
/**
* write_one_page - write out a single page and optionally wait on I/O
* @page: the page to write
* @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
* write_one_page() returns a negative error code if I/O failed.
*/
int write_one_page(struct page *page, int wait)
{
struct address_space *mapping = page->mapping;
int ret = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 1,
};
BUG_ON(!PageLocked(page));
if (wait)
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
page_cache_get(page);
ret = mapping->a_ops->writepage(page, &wbc);
if (ret == 0 && wait) {
wait_on_page_writeback(page);
if (PageError(page))
ret = -EIO;
}
page_cache_release(page);
} else {
unlock_page(page);
}
return ret;
}
EXPORT_SYMBOL(write_one_page);
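/*
* Illustrative only: a typical synchronous caller locks the page itself and
* relies on write_one_page() to unlock it ('page' and 'err' below are
* hypothetical locals):
*
*	lock_page(page);
*	err = write_one_page(page, 1);
*
* With wait == 1 the call also waits for the writeout to finish and returns
* -EIO if the page ends up with PageError set.
*/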
/*
* For address_spaces which do not use buffers nor write back.
*/
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page))
return !TestSetPageDirty(page);
return 0;
}
/*
* Helper function for set_page_dirty family.
*
* Caller must hold mem_cgroup_begin_page_stat().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
void account_page_dirtied(struct page *page, struct address_space *mapping,
struct mem_cgroup *memcg)
{
struct inode *inode = mapping->host;
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
struct bdi_writeback *wb;
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
}
}
EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for de-accounting a dirty page without writeback.
*
* Caller must hold mem_cgroup_begin_page_stat().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
struct mem_cgroup *memcg, struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
}
}
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
* The caller must ensure this doesn't race with truncation. Most will simply
* hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
* the pte lock held, which also locks out truncation.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
struct mem_cgroup *memcg;
memcg = mem_cgroup_begin_page_stat(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
if (!mapping) {
mem_cgroup_end_page_stat(memcg);
return 1;
}
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
mem_cgroup_end_page_stat(memcg);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
mem_cgroup_end_page_stat(memcg);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* Call this whenever redirtying a page, to de-account the dirty counters
* (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
* counters (NR_WRITTEN, BDI_WRITTEN) in the long term. The mismatches will lead to
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
* control.
*/
void account_page_redirty(struct page *page)
{
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
bool locked;
wb = unlocked_inode_to_wb_begin(inode, &locked);
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
unlocked_inode_to_wb_end(inode, locked);
}
}
EXPORT_SYMBOL(account_page_redirty);
/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
int ret;
wbc->pages_skipped++;
ret = __set_page_dirty_nobuffers(page);
account_page_redirty(page);
return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
/*
* Dirty a page.
*
* For pages with a mapping this should be done under the page lock
* for the benefit of asynchronous memory errors which prefer a consistent
* dirty state. This rule can be broken in some special cases,
* but it is better not to.
*
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
int set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
/*
* readahead/lru_deactivate_page could remain
* PG_readahead/PG_reclaim due to race with end_page_writeback
* About readahead, if the page is written, the flags would be
* reset. So no problem.
* About lru_deactivate_page, if the page is redirtied, the flag
* will be reset. So no problem. But if the page is used by readahead
* it will confuse readahead and make it restart the size ramp-up
* process. But it's a trivial problem.
*/
if (PageReclaim(page))
ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
if (!spd)
spd = __set_page_dirty_buffers;
#endif
return (*spd)(page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
return 1;
}
return 0;
}
EXPORT_SYMBOL(set_page_dirty);
/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
* CPU could truncate the page off the mapping and then free the mapping.
*
* Usually, the page _is_ locked, or the caller is a user-space process which
* holds a reference on the inode by having an open file.
*
* In other cases, the page should be locked before running set_page_dirty().
*/
int set_page_dirty_lock(struct page *page)
{
int ret;
lock_page(page);
ret = set_page_dirty(page);
unlock_page(page);
return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);
/*
* This cancels just the dirty bit on the kernel page itself, it does NOT
* actually remove dirty bits on any mmap's that may be around. It also
* leaves the page tagged dirty, so any sync activity will still find it on
* the dirty lists, and in particular, clear_page_dirty_for_io() will still
* look at the dirty bits in the VM.
*
* Doing this should *normally* only ever be done when a page is truncated,
* and is not actually mapped anywhere at all. However, fs/buffer.c does
* this when it notices that somebody has cleaned out all the buffers on a
* page without actually doing it through the VM. Can you say "ext3 is
* horribly ugly"? Thought you could.
*/
void cancel_dirty_page(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct mem_cgroup *memcg;
bool locked;
memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, memcg, wb);
unlocked_inode_to_wb_end(inode, locked);
mem_cgroup_end_page_stat(memcg);
} else {
ClearPageDirty(page);
}
}
EXPORT_SYMBOL(cancel_dirty_page);
/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
* tagged as dirty in the radix tree so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
* at which stage we bring the page's dirty flag and radix-tree dirty tag
* back into sync.
*
* This incoherency between the page's dirty flag and radix-tree tag is
* unfortunate, but it only exists while the page is locked.
*/
int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret = 0;
BUG_ON(!PageLocked(page));
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct mem_cgroup *memcg;
bool locked;
/*
* Yes, Virginia, this is indeed insane.
*
* We use this sequence to make sure that
* (a) we account for dirty stats properly
* (b) we tell the low-level filesystem to
* mark the whole page dirty if it was
* dirty in a pagetable. Only to then
* (c) clean the page again and return 1 to
* cause the writeback.
*
* This way we avoid all nasty races with the
* dirty bit in multiple places and clearing
* them concurrently from different threads.
*
* Note! Normally the "set_page_dirty(page)"
* has no effect on the actual dirty bit - since
* that will already usually be set. But we
* need the side effects, and it can help us
* avoid races.
*
* We basically use the page "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
*/
if (page_mkclean(page))
set_page_dirty(page);
/*
* We carefully synchronise fault handlers against
* installing a dirty pte and marking the page dirty
* at this point. We do this by having them hold the
* page lock while dirtying the page, and pages are
* always locked coming in here, so we get the desired
* exclusion.
*/
memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, locked);
mem_cgroup_end_page_stat(memcg);
return ret;
}
return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
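/*
* Clear PageWriteback and, for pagecache pages, the radix-tree WRITEBACK tag
* under mapping->tree_lock, keeping the wb, memcg and zone writeback
* statistics in sync. Returns the page's previous writeback state.
*/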
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
struct mem_cgroup *memcg;
int ret;
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
if (ret) {
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
__dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
}
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
}
if (ret) {
mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
mem_cgroup_end_page_stat(memcg);
return ret;
}
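/*
* Set PageWriteback and, for pagecache pages, the radix-tree WRITEBACK tag.
* The DIRTY tag is dropped if the page itself is no longer dirty, and the
* TOWRITE tag is dropped unless @keep_write asks for it to be preserved.
* Returns the page's previous writeback state.
*/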
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
struct mem_cgroup *memcg;
int ret;
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
if (!ret) {
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
__inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
}
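/*
* The page is going to disk: drop the radix-tree DIRTY tag if the page
* itself is no longer dirty, and drop the TOWRITE tag unless the caller
* asked for it to be kept via keep_write.
*/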
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
if (!keep_write)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
if (!ret) {
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
mem_cgroup_end_page_stat(memcg);
return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marked with the
* passed tag.
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
*
* This function determines if the given page is related to a backing device
* that requires page contents to be held stable during writeback. If so, then
* it will wait for any pending writeback to complete.
*/
void wait_for_stable_page(struct page *page)
{
if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);