android_kernel_oneplus_msm8998/mm/readahead.c
Srinivasarao P dd21e159de Merge android-4.4.172 (b3e9e81) into msm-4.4
* refs/heads/tmp-b3e9e81
  Linux 4.4.172
  ipmi:ssif: Fix handling of multi-part return messages
  net: speed up skb_rbtree_purge()
  mm, proc: be more verbose about unstable VMA flags in /proc/<pid>/smaps
  mm/page-writeback.c: don't break integrity writeback on ->writepage() error
  ocfs2: fix panic due to unrecovered local alloc
  scsi: megaraid: fix out-of-bound array accesses
  sysfs: Disable lockdep for driver bind/unbind files
  ALSA: bebob: fix model-id of unit for Apogee Ensemble
  dm snapshot: Fix excessive memory usage and workqueue stalls
  dm kcopyd: Fix bug causing workqueue stalls
  perf parse-events: Fix unchecked usage of strncpy()
  perf svghelper: Fix unchecked usage of strncpy()
  perf intel-pt: Fix error with config term "pt=0"
  mmc: atmel-mci: do not assume idle after atmci_request_end
  kconfig: fix memory leak when EOF is encountered in quotation
  kconfig: fix file name and line number of warn_ignored_character()
  clk: imx6q: reset exclusive gates on init
  scsi: target: use consistent left-aligned ASCII INQUIRY data
  net: call sk_dst_reset when set SO_DONTROUTE
  media: firewire: Fix app_info parameter type in avc_ca{,_app}_info
  powerpc/pseries/cpuidle: Fix preempt warning
  pstore/ram: Do not treat empty buffers as valid
  jffs2: Fix use of uninitialized delayed_work, lockdep breakage
  arm64: perf: set suppress_bind_attrs flag to true
  MIPS: SiByte: Enable swiotlb for SWARM, LittleSur and BigSur
  writeback: don't decrement wb->refcnt if !wb->bdi
  e1000e: allow non-monotonic SYSTIM readings
  platform/x86: asus-wmi: Tell the EC the OS will handle the display off hotkey
  xfs: don't fail when converting shortform attr to long form during ATTR_REPLACE
  ipv6: Take rcu_read_lock in __inet6_bind for mapped addresses
  ipv6: Consider sk_bound_dev_if when binding a socket to a v4 mapped address
  r8169: Add support for new Realtek Ethernet
  media: vb2: be sure to unlock mutex on errors
  drm/fb-helper: Ignore the value of fb_var_screeninfo.pixclock
  loop: Fix double mutex_unlock(&loop_ctl_mutex) in loop_control_ioctl()
  loop: Get rid of loop_index_mutex
  loop: Fold __loop_release into loop_release
  block/loop: Use global lock for ioctl() operation.
  tipc: fix uninit-value in tipc_nl_compat_doit
  tipc: fix uninit-value in tipc_nl_compat_name_table_dump
  tipc: fix uninit-value in tipc_nl_compat_link_set
  tipc: fix uninit-value in tipc_nl_compat_bearer_enable
  tipc: fix uninit-value in tipc_nl_compat_link_reset_stats
  sctp: allocate sctp_sockaddr_entry with kzalloc
  selinux: fix GPF on invalid policy
  sunrpc: handle ENOMEM in rpcb_getport_async
  media: vb2: vb2_mmap: move lock up
  LSM: Check for NULL cred-security on free
  media: vivid: set min width/height to a value > 0
  media: vivid: fix error handling of kthread_run
  omap2fb: Fix stack memory disclosure
  Disable MSI also when pcie-octeon.pcie_disable on
  mfd: tps6586x: Handle interrupts on suspend
  mips: fix n32 compat_ipc_parse_version
  scsi: sd: Fix cache_type_store()
  Yama: Check for pid death before checking ancestry
  btrfs: wait on ordered extents on abort cleanup
  crypto: authenc - fix parsing key with misaligned rta_len
  crypto: authencesn - Avoid twice completion call in decrypt path
  ip: on queued skb use skb_header_pointer instead of pskb_may_pull
  packet: Do not leak dev refcounts on error exit
  net: bridge: fix a bug on using a neighbour cache entry without checking its state
  ipv6: fix kernel-infoleak in ipv6_local_error()
  arm64: Don't trap host pointer auth use to EL2
  arm64/kvm: consistently handle host HCR_EL2 flags
  proc: Remove empty line in /proc/self/status
  media: em28xx: Fix misplaced reset of dev->v4l::field_count
  f2fs: fix validation of the block count in sanity_check_raw_super
  f2fs: fix missing up_read
  f2fs: fix invalid memory access
  f2fs: fix to do sanity check with cp_pack_start_sum
  f2fs: fix to do sanity check with block address in main area v2
  f2fs: fix to do sanity check with block address in main area
  f2fs: fix to do sanity check with reserved blkaddr of inline inode
  f2fs: fix to do sanity check with node footer and iblocks
  f2fs: Add sanity_check_inode() function
  f2fs: fix to do sanity check with user_block_count
  f2fs: fix to do sanity check with secs_per_zone
  f2fs: introduce and spread verify_blkaddr
  f2fs: clean up with is_valid_blkaddr()
  f2fs: enhance sanity_check_raw_super() to avoid potential overflow
  f2fs: sanity check on sit entry
  f2fs: check blkaddr more accuratly before issue a bio
  f2fs: return error during fill_super
  f2fs: fix race condition in between free nid allocator/initializer
  f2fs: free meta pages if sanity check for ckpt is failed
  f2fs: detect wrong layout
  f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack
  f2fs: put directory inodes before checkpoint in roll-forward recovery
  f2fs: introduce get_checkpoint_version for cleanup
  f2fs: use crc and cp version to determine roll-forward recovery
  f2fs: avoid unneeded loop in build_sit_entries
  f2fs: not allow to write illegal blkaddr
  f2fs: fix to avoid reading out encrypted data in page cache
  f2fs: fix inode cache leak
  f2fs: factor out fsync inode entry operations
  f2fs: remove an obsolete variable
  f2fs: give -EINVAL for norecovery and rw mount
  f2fs: fix to convert inline directory correctly
  f2fs: move sanity checking of cp into get_valid_checkpoint
  f2fs: cover more area with nat_tree_lock
  f2fs: clean up argument of recover_data
  can: gw: ensure DLC boundaries after CAN frame modification
  tty/ldsem: Wake up readers after timed out down_write()
  UPSTREAM: dm: do not allow readahead to limit IO size
  UPSTREAM: readahead: stricter check for bdi io_pages
  UPSTREAM: mm: don't cap request size based on read-ahead setting
  ANDROID: Fix cuttlefish redundant vsock connection.
  UPSTREAM: loop: drop caches if offset or block_size are changed

Conflicts:
	arch/arm64/kvm/hyp.S

Fixed compilation issue due to variable 'backing_dev_info',
which got changed to pointer in downstream.

Change-Id: I8baa569fe9fc13ed5a7e863e5ad1fb8cf1cd469e
Signed-off-by: Srinivasarao P <spathi@codeaurora.org>
2019-01-29 16:37:33 +05:30

597 lines
16 KiB
C

/*
* mm/readahead.c - address_space-level file readahead.
*
* Copyright (C) 2002, Linus Torvalds
*
* 09Apr2002 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include "internal.h"
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
*/
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
/*
 * Resolve the struct page linked (through its ->lru member) at the tail of
 * the list @head, i.e. the entry that (head)->prev points at.
 */
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
/*
 * Dispose of one page that read_cache_pages() could not use.
 *
 * The caller of read_cache_pages() may have attached private state to the
 * page before handing it over (PG_private or PG_fscache, e.g. NFS marking
 * pages that are cached locally on disk).  When such state is present the
 * filesystem gets a chance to tear it down: the page is locked, temporarily
 * pointed at @mapping, invalidated over its full PAGE_CACHE_SIZE range, and
 * detached again.  In every case the reference on the page is then dropped.
 *
 * Failing to take the page lock is treated as a fatal inconsistency (BUG()).
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (!page_has_private(page))
		goto drop_ref;

	if (!trylock_page(page))
		BUG();
	page->mapping = mapping;
	do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
	page->mapping = NULL;
	unlock_page(page);

drop_ref:
	page_cache_release(page);
}
/*
 * Drain @pages: unlink each page (taken from the tail of the list) and hand
 * it to read_cache_pages_invalidate_page(), which invalidates it if needed
 * and drops its reference.  The list is empty on return.
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *doomed = list_to_page(pages);

		list_del(&doomed->lru);
		read_cache_pages_invalidate_page(mapping, doomed);
	}
}
/**
 * read_cache_pages - insert a list of pages into an address space and fill them
 * @mapping: the address_space the pages are added to
 * @pages: list_head of candidate pages; each page has ->index set and is
 *	   otherwise uninitialised
 * @filler: callback invoked once per successfully inserted page
 * @data: opaque cookie passed as first argument to @filler
 *
 * Pages are consumed from the tail of @pages one at a time.  A page that
 * cannot be added to the page cache is invalidated/released and skipped.
 * For a page that was added, the list's reference is dropped and @filler is
 * called; each successful fill is accounted as PAGE_CACHE_SIZE bytes read.
 *
 * If @filler returns non-zero, all pages still on the list are released and
 * processing stops.
 *
 * Returns 0, or the first non-zero value returned by @filler.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
		     int (*filler)(void *, struct page *), void *data)
{
	int err = 0;

	while (!err && !list_empty(pages)) {
		struct page *next = list_to_page(pages);

		list_del(&next->lru);
		if (add_to_page_cache_lru(next, mapping, next->index,
				mapping_gfp_constraint(mapping, GFP_KERNEL))) {
			/* Could not insert: clean up and try the next one. */
			read_cache_pages_invalidate_page(mapping, next);
			continue;
		}
		page_cache_release(next);

		err = filler(data, next);
		if (unlikely(err))
			read_cache_pages_invalidate_pages(mapping, pages);
		else
			task_io_account_read(PAGE_CACHE_SIZE);
	}
	return err;
}
EXPORT_SYMBOL(read_cache_pages);
/*
 * Start reads for the @nr_pages pages queued on @pages, under a block plug.
 *
 * When the mapping provides ->readpages() the whole list is handed to it and
 * whatever it leaves on the list is released afterwards; its return value is
 * passed back.  Otherwise each page is added to the page cache individually
 * and, on success, submitted via ->readpage(); the result of ->readpage() is
 * not examined and 0 is returned.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		      struct list_head *pages, unsigned nr_pages)
{
	struct blk_plug plug;
	unsigned page_idx;
	int ret = 0;

	blk_start_plug(&plug);

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Release any pages the filesystem did not consume. */
		put_pages_list(pages);
	} else {
		for (page_idx = 0; page_idx < nr_pages; page_idx++) {
			struct page *cur = list_to_page(pages);

			list_del(&cur->lru);
			if (add_to_page_cache_lru(cur, mapping, cur->index,
				mapping_gfp_constraint(mapping, GFP_KERNEL)) == 0)
				mapping->a_ops->readpage(filp, cur);
			page_cache_release(cur);
		}
	}

	blk_finish_plug(&plug);
	return ret;
}
/*
 * __do_page_cache_readahead() performs the actual read of a run of pages.
 *
 * It works in two phases: first every page that is needed is allocated, then
 * the whole batch is submitted for I/O.  Keeping the phases apart avoids
 * interleaving reads with the writeback that page allocation may trigger.
 *
 * Up to @nr_to_read pages starting at @offset are considered.  The scan stops
 * at the last page of the file or on the first allocation failure; indexes
 * that already hold a (non-exceptional) page cache entry are skipped.  The
 * page at position nr_to_read - lookahead_size within the request, if it is
 * newly allocated, gets PG_readahead set.
 *
 * Returns the number of pages that were allocated and handed to read_pages()
 * (0 for an empty file).
 */
int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			      pgoff_t offset, unsigned long nr_to_read,
			      unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	loff_t isize = i_size_read(inode);
	unsigned long end_index;	/* index of the file's last page */
	LIST_HEAD(page_pool);
	int nr_queued = 0;
	int page_idx;

	if (isize == 0)
		return 0;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/* Phase 1: allocate every page we are going to read. */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;
		struct page *page;

		if (page_offset > end_index)
			break;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page && !radix_tree_exceptional_entry(page))
			continue;	/* already cached */

		page = page_cache_alloc_readahead(mapping);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		nr_queued++;
	}

	/*
	 * Phase 2: kick off the I/O.  Errors are deliberately ignored here:
	 * a page that does not become uptodate will be read again by the
	 * caller, which then deals with the failure.
	 */
	if (nr_queued)
		read_pages(mapping, filp, &page_pool, nr_queued);
	BUG_ON(!list_empty(&page_pool));

	return nr_queued;
}
/*
 * Unconditionally read ahead @nr_to_read pages starting at @offset.
 *
 * The request is first clamped to the larger of the device's io_pages and
 * the file's ra_pages, then issued in chunks of at most 2 megabytes so that
 * we don't pin too much memory at once.
 *
 * Returns -EINVAL if the mapping has neither ->readpage() nor ->readpages(),
 * otherwise 0.  __do_page_cache_readahead() only reports how many pages it
 * queued (never a negative error), so its result is intentionally not
 * inspected: readahead is best effort.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
			       pgoff_t offset, unsigned long nr_to_read)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	struct file_ra_state *ra = &filp->f_ra;
	unsigned long max_pages;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
	nr_to_read = min(nr_to_read, max_pages);

	while (nr_to_read) {
		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		__do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);

		offset += this_chunk;
		nr_to_read -= this_chunk;
	}
	return 0;
}
/*
 * Compute the initial readahead window, in pages, for a request of @size
 * pages with an upper bound of @max pages.
 *
 * The request size is first rounded up to a power of two, then scaled:
 *   - rounded size <= 1 page (small):    4 x rounded size
 *   - rounded size <= max / 4 (medium):  2 x rounded size
 *   - anything larger:                   max
 *
 * "Small" does not depend on @max - only a one-page read counts as small.
 * Example with max = 32 pages: a 1 page read gives 4 pages, reads of 2-8
 * pages are doubled after rounding, reads of more than 8 pages give 32.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long rounded = roundup_pow_of_two(size);

	if (rounded <= 1)
		return rounded * 4;
	if (rounded <= max / 4)
		return rounded * 2;
	return max;
}
/*
 * Ramp up the previous window size stored in @ra and return the result as
 * the next window size: quadruple it while it is below max/16, double it
 * otherwise, and never exceed @max.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;
	unsigned long factor = (cur < max / 16) ? 4 : 2;

	return min(cur * factor, max);
}
/*
* On-demand readahead design.
*
* The fields in struct file_ra_state represent the most-recently-executed
* readahead attempt:
*
*                        |<----- async_size ---------|
*     |------------------- size -------------------->|
*     |==================#===========================|
*     ^start             ^page marked with PG_readahead
*
* To overlap application thinking time and disk I/O time, we do
* `readahead pipelining': Do not wait until the application consumed all
* readahead pages and stalled on the missing page at readahead_index;
* Instead, submit an asynchronous readahead I/O as soon as there are
* only async_size pages left in the readahead window. Normally async_size
* will be equal to size, for maximum pipelining.
*
* In interleaved sequential reads, concurrent streams on the same fd can
* be invalidating each other's readahead state. So we flag the new readahead
* page at (start+size-async_size) with PG_readahead, and use it as readahead
* indicator. The flag won't be set on already cached pages, to avoid the
* readahead-for-nothing fuss, saving pointless page cache lookups.
*
* prev_pos tracks the last visited byte in the _previous_ read request.
* It should be maintained by the caller, and will be used for detecting
* small random reads. Note that the readahead algorithm checks loosely
* for sequential patterns. Hence interleaved reads might be served as
* sequential ones.
*
* There is a special-case: if the first page which the application tries to
* read happens to be the first page of the file, it is assumed that a linear
* read is about to happen and the window is immediately set to the initial size
* based on I/O request size and the max_readahead.
*
* The code ramps up the readahead size aggressively at first, but slows down as
* it approaches max_readahead.
*/
/*
 * Count the pages that are contiguously cached immediately before @offset,
 * looking back over at most @max pages (i.e. from @offset-1 down to
 * @offset-@max).  The count is a conservative estimate of
 *   - the length of the sequential read sequence, or
 *   - the thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   pgoff_t offset, unsigned long max)
{
	pgoff_t last = offset - 1;
	pgoff_t hole;

	rcu_read_lock();
	hole = page_cache_prev_hole(mapping, last, max);
	rcu_read_unlock();

	return last - hole;
}
/*
 * Readahead driven by page cache context.
 *
 * Looks at how many pages directly preceding @offset are already cached and,
 * if that history is longer than the request itself, sets up a readahead
 * window in @ra starting at @offset.
 *
 * Returns 1 when @ra was set up, 0 when the history is too short (the read
 * may simply be random) and @ra was left untouched.
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t offset,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t history = count_history_pages(mapping, offset, max);

	/* Not enough cached history: could be a random read. */
	if (history <= req_size)
		return 0;

	/*
	 * History reaches back to the beginning of the file: a strong hint
	 * of a long-running stream (or a whole-file read), so be bolder.
	 */
	if (history >= offset)
		history *= 2;

	ra->start = offset;
	ra->size = min(history + req_size, max);
	ra->async_size = 1;

	return 1;
}
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*
* Picks the next readahead window for a read of @req_size pages at page
* index @offset and submits it.  @hit_readahead_marker is true when the
* caller got here because it touched a page flagged PG_readahead (the async
* path) and false for a plain cache miss (the sync path).
*
* The cases are tried in this order:
*   1. offset 0                       -> fresh initial window
*   2. offset matches the window in @ra -> sequential: ramp up, push forward
*   3. marker hit without matching @ra -> rebuild window from the page cache
*   4. request larger than max_pages  -> fresh initial window
*   5. offset within one page of prev_pos -> fresh initial window
*   6. cached history before @offset  -> context readahead
*   7. otherwise                      -> read just the request, leave @ra alone
*
* Returns whatever ra_submit() or __do_page_cache_readahead() returns, or 0
* when case 3 finds nothing usable.  Neither caller in this file looks at
* the return value.
*/
static unsigned long
ondemand_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
pgoff_t prev_offset;
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
if (req_size > max_pages && bdi->io_pages > max_pages)
max_pages = min(req_size, bdi->io_pages);
/*
* start of file
*/
if (!offset)
goto initial_readahead;
/*
* It's the expected callback offset, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
* (Matches either the marked page inside the window or the first
* page past its end.)
*/
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
/*
* Hit a marked page without valid readahead state.
* E.g. interleaved reads.
* Query the pagecache for async_size, which normally equals to
* readahead size. Ramp it up and use it as the new readahead size.
*/
if (hit_readahead_marker) {
pgoff_t start;
rcu_read_lock();
start = page_cache_next_hole(mapping, offset + 1, max_pages);
rcu_read_unlock();
/*
* Give up when the lookup yields 0 or a position more than
* max_pages beyond @offset.
* NOTE(review): the exact return contract of
* page_cache_next_hole() is defined outside this file - confirm.
*/
if (!start || start - offset > max_pages)
return 0;
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
/*
* oversize read
*/
if (req_size > max_pages)
goto initial_readahead;
/*
* sequential cache miss
* trivial case: (offset - prev_offset) == 1
* unaligned reads: (offset - prev_offset) == 0
*/
prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
if (offset - prev_offset <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
initial_readahead:
/*
* Start a new window at @offset.  The async part is whatever the window
* holds beyond the request itself, or the whole window when the request
* is at least as large as the window.
*/
ra->start = offset;
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
/*
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
* Take care of maximum IO pages as above.
*/
if (offset == ra->start && ra->size == ra->async_size) {
add_pages = get_next_ra_size(ra, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
ra->size += add_pages;
} else {
/* Merged window would be too big: cap it, half of it async. */
ra->size = max_pages;
ra->async_size = max_pages >> 1;
}
}
return ra_submit(ra, mapping, filp);
}
/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * To be called on a page cache miss; it submits the read.  Depending on the
 * observed access pattern the readahead logic may attach additional pages to
 * the request.
 *
 * Nothing is done when readahead is disabled for the file (ra_pages == 0).
 * Files flagged FMODE_RANDOM get exactly the requested range read, bypassing
 * the on-demand heuristics.
 */
void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)
{
	/* readahead switched off */
	if (!ra->ra_pages)
		return;

	if (filp && (filp->f_mode & FMODE_RANDOM))
		/* random access: just read what was asked for */
		force_page_cache_readahead(mapping, filp, offset, req_size);
	else
		/* let the on-demand logic size the window */
		ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * To be called when a page carrying PG_readahead is used.  The flag marks
 * the point in the readahead window at which enough has been consumed that
 * the next batch of pages should be pulled in.
 *
 * Nothing is done when readahead is disabled for the file, or when the page
 * is under writeback.  Otherwise the flag is cleared and, unless the backing
 * device is read-congested, the on-demand readahead logic is run.
 */
void
page_cache_async_readahead(struct address_space *mapping,
			   struct file_ra_state *ra, struct file *filp,
			   struct page *page, pgoff_t offset,
			   unsigned long req_size)
{
	/*
	 * Bail out if readahead is off.  Also bail out for pages under
	 * writeback: the same bit is used for PG_readahead and PG_reclaim.
	 */
	if (!ra->ra_pages || PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/* Defer asynchronous read-ahead on IO congestion. */
	if (!inode_read_congested(mapping->host))
		ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
/*
 * Back end of the readahead() system call: force readahead of @nr pages at
 * page index @index.  Returns -EINVAL when the file has no mapping or the
 * mapping has no address_space operations, otherwise the result of
 * force_page_cache_readahead().
 */
static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
	     pgoff_t index, unsigned long nr)
{
	if (mapping && mapping->a_ops)
		return force_page_cache_readahead(mapping, filp, index, nr);

	return -EINVAL;
}
/*
 * readahead(fd, offset, count): ask for the byte range [offset, offset+count)
 * of the file behind @fd to be read into the page cache.
 *
 * The byte range is converted to an inclusive range of page indexes and
 * passed to do_readahead().  Returns -EBADF when @fd does not refer to an
 * open file or the file is not open for reading, otherwise whatever
 * do_readahead() returns.
 */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (!f.file)
		return ret;

	if (f.file->f_mode & FMODE_READ) {
		struct address_space *mapping = f.file->f_mapping;
		pgoff_t first = offset >> PAGE_CACHE_SHIFT;
		pgoff_t last = (offset + count - 1) >> PAGE_CACHE_SHIFT;
		unsigned long nr_pages = last - first + 1;

		ret = do_readahead(mapping, f.file, first, nr_pages);
	}
	fdput(f);

	return ret;
}