From 0c079cd9d1e8092e9a123952c54517b648c1458b Mon Sep 17 00:00:00 2001
From: Nikhilesh Reddy
Date: Mon, 24 Oct 2016 16:31:44 -0700
Subject: [PATCH] Block: Add support to measure bio latencies

Add support to measure bio latencies as seen by upper layers.

Change-Id: I9bfa84d2b57aa9b41d3be08744cb4cb327680ab4
Signed-off-by: Nikhilesh Reddy
---
 block/Kconfig             |   9 +-
 block/bio.c               |   6 +-
 block/blk-core.c          | 468 +++++++++++++++++++++++++++++++++++++-
 block/blk.h               |   9 +
 include/linux/blk_types.h |  13 ++
 5 files changed, 500 insertions(+), 5 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..39e956942b9d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -33,7 +33,7 @@ config LBDAF
 	  This option is required to support the full capacity of large
 	  (2TB+) block devices, including RAID, disk, Network Block Device,
 	  Logical Volume Manager (LVM) and loopback.
-	
+
 	  This option also enables support for single files larger than 2TB.
 
@@ -111,6 +111,13 @@ config BLK_CMDLINE_PARSER
 	  See Documentation/block/cmdline-partition.txt for more
 	  information.
 
+config BLOCK_PERF_FRAMEWORK
+	bool "Enable Block device performance measurement framework"
+	default n
+	---help---
+	Enabling this option allows you to measure the performance at the
+	block layer.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"

diff --git a/block/bio.c b/block/bio.c
index b9829b6504c8..02c4d9bf1590 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -31,6 +31,8 @@
 #include
 
+#include "blk.h"
+
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
  * itself, to shrink a bio data allocation from two mempool calls to one
@@ -1765,8 +1767,10 @@ void bio_endio(struct bio *bio)
 			bio_put(bio);
 			bio = parent;
 		} else {
-			if (bio->bi_end_io)
+			if (bio->bi_end_io) {
+				blk_update_perf_stats(bio);
 				bio->bi_end_io(bio);
+			}
 			bio = NULL;
 		}
 	}

diff --git a/block/blk-core.c b/block/blk-core.c
index 450da06fa27e..4162327d8804 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -11,6 +11,12 @@
 /*
  * This handles all read/write requests to block devices
  */
+
+#ifdef CONFIG_BLOCK_PERF_FRAMEWORK
+#define DRIVER_NAME "Block"
+#define pr_fmt(fmt) DRIVER_NAME ": %s: " fmt, __func__
+#endif
+
 #include
 #include
 #include
@@ -34,6 +40,12 @@
 #include
 #include
 
+#ifdef CONFIG_BLOCK_PERF_FRAMEWORK
+#include
+#include
+#include
+#endif
+
 #define CREATE_TRACE_POINTS
 #include
 
@@ -2111,6 +2123,456 @@ static inline struct task_struct *get_dirty_task(struct bio *bio)
 }
 #endif
 
+#ifdef CONFIG_BLOCK_PERF_FRAMEWORK
+#define BLK_PERF_SIZE (1024 * 15)
+#define BLK_PERF_HIST_SIZE (sizeof(u32) * BLK_PERF_SIZE)
+
+struct blk_perf_stats {
+	u32 *read_hist;
+	u32 *write_hist;
+	u32 *flush_hist;
+	int buffers_alloced;
+	ktime_t max_read_time;
+	ktime_t max_write_time;
+	ktime_t max_flush_time;
+	ktime_t min_write_time;
+	ktime_t min_read_time;
+	ktime_t min_flush_time;
+	ktime_t total_write_time;
+	ktime_t total_read_time;
+	u64 total_read_size;
+	u64 total_write_size;
+	spinlock_t lock;
+	int is_enabled;
+};
+
+static struct blk_perf_stats blk_perf;
+static struct dentry *blk_perf_debug_dir;
+
+static int alloc_histogram_buffers(void)
+{
+	int ret = 0;
+
+	if (!blk_perf.read_hist)
+		blk_perf.read_hist = kzalloc(BLK_PERF_HIST_SIZE, GFP_KERNEL);
+
+	if (!blk_perf.write_hist)
+		blk_perf.write_hist = kzalloc(BLK_PERF_HIST_SIZE, GFP_KERNEL);
+
+	if (!blk_perf.flush_hist)
+		blk_perf.flush_hist = kzalloc(BLK_PERF_HIST_SIZE, GFP_KERNEL);
+
+	if (!blk_perf.read_hist || !blk_perf.write_hist || !blk_perf.flush_hist)
+		ret = -ENOMEM;
+
+	if (!ret)
+		blk_perf.buffers_alloced = 1;
+	return ret;
+}
+
+static void clear_histogram_buffers(void)
+{
+	if (!blk_perf.buffers_alloced)
+		return;
+	memset(blk_perf.read_hist, 0, BLK_PERF_HIST_SIZE);
+	memset(blk_perf.write_hist, 0, BLK_PERF_HIST_SIZE);
+	memset(blk_perf.flush_hist, 0, BLK_PERF_HIST_SIZE);
+}
+
+static int enable_perf(void *data, u64 val)
+{
+	int ret = 0;
+
+	if (!blk_perf.buffers_alloced)
+		ret = alloc_histogram_buffers();
+
+	if (ret)
+		return ret;
+
+	spin_lock(&blk_perf.lock);
+	blk_perf.is_enabled = val;
+	spin_unlock(&blk_perf.lock);
+	return 0;
+}
+
+static int is_perf_enabled(void *data, u64 *val)
+{
+	spin_lock(&blk_perf.lock);
+	*val = blk_perf.is_enabled;
+	spin_unlock(&blk_perf.lock);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(enable_perf_fops, is_perf_enabled, enable_perf,
+			"%llu\n");
+
+static char *blk_debug_buffer;
+static u32 blk_debug_data_size;
+static DEFINE_MUTEX(blk_perf_debug_buffer_mutex);
+
+static ssize_t blk_perf_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *file_pos)
+{
+	ssize_t ret = 0;
+
+	mutex_lock(&blk_perf_debug_buffer_mutex);
+	ret = simple_read_from_buffer(buf, count, file_pos, blk_debug_buffer,
+				      blk_debug_data_size);
+	mutex_unlock(&blk_perf_debug_buffer_mutex);
+
+	return ret;
+}
+
+static int blk_debug_buffer_alloc(u32 buffer_size)
+{
+	int ret = 0;
+
+	mutex_lock(&blk_perf_debug_buffer_mutex);
+	if (blk_debug_buffer != NULL) {
+		pr_err("blk_debug_buffer is in use\n");
+		ret = -EBUSY;
+		goto end;
+	}
+	blk_debug_buffer = kzalloc(buffer_size, GFP_KERNEL);
+	if (!blk_debug_buffer)
+		ret = -ENOMEM;
+end:
+	mutex_unlock(&blk_perf_debug_buffer_mutex);
+	return ret;
+}
+
+static int blk_perf_close(struct inode *inode, struct file *file)
+{
+	mutex_lock(&blk_perf_debug_buffer_mutex);
+	blk_debug_data_size = 0;
+	kfree(blk_debug_buffer);
+	blk_debug_buffer = NULL;
+	mutex_unlock(&blk_perf_debug_buffer_mutex);
+	return 0;
+}
+
+static u32 fill_basic_perf_info(char *buffer, u32 buffer_size)
+{
+	u32 size = 0;
+
+	size += scnprintf(buffer + size, buffer_size - size, "\n");
+
+	spin_lock(&blk_perf.lock);
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "max_read_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.max_read_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "min_read_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.min_read_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "total_read_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.total_read_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "total_read_size: %llu\n\n",
+			  blk_perf.total_read_size);
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "max_write_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.max_write_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "min_write_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.min_write_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "total_write_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.total_write_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "total_write_size: %llu\n\n",
+			  blk_perf.total_write_size);
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "max_flush_time_ms: %llu\n",
+			  ktime_to_ms(blk_perf.max_flush_time));
+
+	size += scnprintf(buffer + size, buffer_size - size,
+			  "min_flush_time_ms: %llu\n\n",
+			  ktime_to_ms(blk_perf.min_flush_time));
+
+	spin_unlock(&blk_perf.lock);
+
+	return size;
+}
+
+static int basic_perf_open(struct inode *inode, struct file *file)
+{
+	u32 buffer_size;
+	int ret;
+
+	buffer_size = BLK_PERF_HIST_SIZE;
+	ret = blk_debug_buffer_alloc(buffer_size);
+	if (ret)
+		return ret;
+
+	mutex_lock(&blk_perf_debug_buffer_mutex);
+	blk_debug_data_size = fill_basic_perf_info(blk_debug_buffer,
+						   buffer_size);
+	mutex_unlock(&blk_perf_debug_buffer_mutex);
+	return 0;
+}
+
+
+static const struct file_operations basic_perf_ops = {
+	.read = blk_perf_read,
+	.release = blk_perf_close,
+	.open = basic_perf_open,
+};
+
+static int hist_open_helper(void *hist_buf)
+{
+	int ret;
+
+	if (!blk_perf.buffers_alloced)
+		return -EINVAL;
+
+	ret = blk_debug_buffer_alloc(BLK_PERF_HIST_SIZE);
+	if (ret)
+		return ret;
+
+	spin_lock(&blk_perf.lock);
+	memcpy(blk_debug_buffer, hist_buf, BLK_PERF_HIST_SIZE);
+	spin_unlock(&blk_perf.lock);
+
+	mutex_lock(&blk_perf_debug_buffer_mutex);
+	blk_debug_data_size = BLK_PERF_HIST_SIZE;
+	mutex_unlock(&blk_perf_debug_buffer_mutex);
+	return 0;
+}
+
+static int write_hist_open(struct inode *inode, struct file *file)
+{
+	return hist_open_helper(blk_perf.write_hist);
+}
+
+static const struct file_operations write_hist_ops = {
+	.read = blk_perf_read,
+	.release = blk_perf_close,
+	.open = write_hist_open,
+};
+
+
+static int read_hist_open(struct inode *inode, struct file *file)
+{
+	return hist_open_helper(blk_perf.read_hist);
+}
+
+static const struct file_operations read_hist_ops = {
+	.read = blk_perf_read,
+	.release = blk_perf_close,
+	.open = read_hist_open,
+};
+
+static int flush_hist_open(struct inode *inode, struct file *file)
+{
+	return hist_open_helper(blk_perf.flush_hist);
+}
+
+static const struct file_operations flush_hist_ops = {
+	.read = blk_perf_read,
+	.release = blk_perf_close,
+	.open = flush_hist_open,
+};
+
+static void clear_perf_stats_helper(void)
+{
+	spin_lock(&blk_perf.lock);
+	blk_perf.max_write_time = ktime_set(0, 0);
+	blk_perf.max_read_time = ktime_set(0, 0);
+	blk_perf.max_flush_time = ktime_set(0, 0);
+	blk_perf.min_write_time = ktime_set(KTIME_MAX, 0);
+	blk_perf.min_read_time = ktime_set(KTIME_MAX, 0);
+	blk_perf.min_flush_time = ktime_set(KTIME_MAX, 0);
+	blk_perf.total_write_time = ktime_set(0, 0);
+	blk_perf.total_read_time = ktime_set(0, 0);
+	blk_perf.total_read_size = 0;
+	blk_perf.total_write_size = 0;
+	blk_perf.is_enabled = 0;
+	clear_histogram_buffers();
+	spin_unlock(&blk_perf.lock);
+}
+
+static int clear_perf_stats(void *data, u64 val)
+{
+	clear_perf_stats_helper();
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_perf_stats_fops, NULL, clear_perf_stats,
+			"%llu\n");
+
+static void blk_debugfs_init(void)
+{
+	struct dentry *f_ent;
+
+	blk_perf_debug_dir = debugfs_create_dir("block_perf", NULL);
+	if (IS_ERR(blk_perf_debug_dir)) {
+		pr_err("Failed to create block_perf debug_fs directory\n");
+		return;
+	}
+
+	f_ent = debugfs_create_file("basic_perf", 0400, blk_perf_debug_dir,
+				    NULL, &basic_perf_ops);
+	if (IS_ERR(f_ent)) {
+		pr_err("Failed to create debug_fs basic_perf file\n");
+		return;
+	}
+
+	f_ent = debugfs_create_file("write_hist", 0400, blk_perf_debug_dir,
+				    NULL, &write_hist_ops);
+	if (IS_ERR(f_ent)) {
+		pr_err("Failed to create debug_fs write_hist file\n");
+		return;
+	}
+
+	f_ent = debugfs_create_file("read_hist", 0400, blk_perf_debug_dir,
+				    NULL, &read_hist_ops);
+	if (IS_ERR(f_ent)) {
+		pr_err("Failed to create debug_fs read_hist file\n");
+		return;
+	}
+
+	f_ent = debugfs_create_file("flush_hist", 0400, blk_perf_debug_dir,
+				    NULL, &flush_hist_ops);
+	if (IS_ERR(f_ent)) {
+		pr_err("Failed to create debug_fs flush_hist file\n");
+		return;
+	}
+
+	f_ent = debugfs_create_file("enable_perf", 0600, blk_perf_debug_dir,
+				    NULL, &enable_perf_fops);
+	if (IS_ERR(f_ent)) {
+		pr_err("Failed to create debug_fs enable_perf file\n");
+		return;
+	}
+
+	f_ent = debugfs_create_file("clear_perf_stats", 0200,
+				    blk_perf_debug_dir, NULL,
+				    &clear_perf_stats_fops);
+	if (IS_ERR(f_ent)) {
+		pr_err("Failed to create debug_fs clear_perf_stats file\n");
+		return;
+	}
+}
+
+static void blk_init_perf(void)
+{
+	blk_debugfs_init();
+	spin_lock_init(&blk_perf.lock);
+
+	clear_perf_stats_helper();
+}
+
+
+static void set_submit_info(struct bio *bio, unsigned int count)
+{
+	ktime_t submit_time;
+
+	if (unlikely(blk_perf.is_enabled)) {
+		submit_time = ktime_get();
+		bio->submit_time.tv64 = submit_time.tv64;
+		bio->blk_sector_count = count;
+		return;
+	}
+
+	bio->submit_time.tv64 = 0;
+	bio->blk_sector_count = 0;
+}
+
+void blk_update_perf_read_write_stats(ktime_t bio_process_time, int is_write,
+				      int count)
+{
+	u32 bio_process_time_ms;
+
+	bio_process_time_ms = ktime_to_ms(bio_process_time);
+	if (bio_process_time_ms >= BLK_PERF_SIZE)
+		bio_process_time_ms = BLK_PERF_SIZE - 1;
+
+	if (is_write) {
+		if (ktime_after(bio_process_time, blk_perf.max_write_time))
+			blk_perf.max_write_time = bio_process_time;
+
+		if (ktime_before(bio_process_time, blk_perf.min_write_time))
+			blk_perf.min_write_time = bio_process_time;
+		blk_perf.total_write_time =
+			ktime_add(blk_perf.total_write_time, bio_process_time);
+		blk_perf.total_write_size += count;
+		blk_perf.write_hist[bio_process_time_ms] += count;
+
+	} else {
+		if (ktime_after(bio_process_time, blk_perf.max_read_time))
+			blk_perf.max_read_time = bio_process_time;
+
+		if (ktime_before(bio_process_time, blk_perf.min_read_time))
+			blk_perf.min_read_time = bio_process_time;
+		blk_perf.total_read_time =
+			ktime_add(blk_perf.total_read_time, bio_process_time);
+		blk_perf.total_read_size += count;
+		blk_perf.read_hist[bio_process_time_ms] += count;
+	}
+}
+
+void blk_update_perf_stats(struct bio *bio)
+{
+	ktime_t bio_process_time;
+	u32 bio_process_time_ms;
+	u32 count;
+
+	spin_lock(&blk_perf.lock);
+	if (likely(!blk_perf.is_enabled))
+		goto end;
+	if (!bio->submit_time.tv64)
+		goto end;
+	bio_process_time = ktime_sub(ktime_get(), bio->submit_time);
+
+	count = bio->blk_sector_count;
+
+	if (count) {
+		int is_write = 0;
+
+		if (bio->bi_rw & WRITE ||
+		    unlikely(bio->bi_rw & REQ_WRITE_SAME))
+			is_write = 1;
+
+		blk_update_perf_read_write_stats(bio_process_time, is_write,
+						 count);
+	} else {
+
+		bio_process_time_ms = ktime_to_ms(bio_process_time);
+		if (bio_process_time_ms >= BLK_PERF_SIZE)
+			bio_process_time_ms = BLK_PERF_SIZE - 1;
+
+		if (ktime_after(bio_process_time, blk_perf.max_flush_time))
+			blk_perf.max_flush_time = bio_process_time;
+
+		if (ktime_before(bio_process_time, blk_perf.min_flush_time))
+			blk_perf.min_flush_time = bio_process_time;
+
+		blk_perf.flush_hist[bio_process_time_ms] += 1;
+	}
+end:
+	spin_unlock(&blk_perf.lock);
+
+}
+#else
+static inline void set_submit_info(struct bio *bio, unsigned int count)
+{
+	(void) bio;
+	(void) count;
+}
+
+static inline void blk_init_perf(void)
+{
+}
+#endif /* #ifdef CONFIG_BLOCK_PERF_FRAMEWORK */
+
 /**
  * submit_bio - submit a bio to the block device layer for I/O
  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
@@ -2123,6 +2585,7 @@ static inline struct task_struct *get_dirty_task(struct bio *bio)
  */
 blk_qc_t submit_bio(int rw, struct bio *bio)
 {
+	unsigned int count = 0;
 	bio->bi_rw |= rw;
 
 	/*
@@ -2130,8 +2593,6 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
	 * go through the normal accounting stuff before submission.
 	 */
 	if (bio_has_data(bio)) {
-		unsigned int count;
-
 		if (unlikely(rw & REQ_WRITE_SAME))
 			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
 		else
 			count = bio_sectors(bio);
@@ -2158,6 +2619,7 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
 		}
 	}
 
+	set_submit_info(bio, count);
 	return generic_make_request(bio);
 }
 EXPORT_SYMBOL(submit_bio);
@@ -3578,7 +4040,7 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("blkdev_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
-
+	blk_init_perf();
 	return 0;
 }
 
diff --git a/block/blk.h b/block/blk.h
index ce2287639ab3..6ceebbd61afd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -112,6 +112,15 @@ void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
 void blk_account_io_done(struct request *req);
 
+#ifdef CONFIG_BLOCK_PERF_FRAMEWORK
+void blk_update_perf_stats(struct bio *bio);
+#else
+static inline void blk_update_perf_stats(struct bio *bio)
+{
+	(void) bio;
+}
+#endif
+
 /*
  * Internal atomic flags for request handling
  */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index da8ca5a7da58..6d73a04d0150 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,6 +39,15 @@ struct bvec_iter {
 						   current bvec */
 };
 
+#ifdef CONFIG_BLOCK_PERF_FRAMEWORK
+/* Double declaration from ktime.h so as to not break the include dependency
+ * chain. Should be kept up to date.
+ */
+union blk_ktime {
+	s64 tv64;
+};
+#endif
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -54,6 +63,10 @@ struct bio {
 
 	struct bvec_iter	bi_iter;
 
+#ifdef CONFIG_BLOCK_PERF_FRAMEWORK
+	union blk_ktime		submit_time;
+	unsigned int		blk_sector_count;
+#endif
 	/* Number of segments in this BIO after
 	 * physical address coalescing is performed.
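
For reference, a minimal user-space sketch of how the exported data could be consumed; it is not part of the patch. It relies only on the format defined above: each of read_hist, write_hist and flush_hist returns a raw, native-endian array of BLK_PERF_SIZE (1024 * 15) u32 counters indexed by bio latency in milliseconds, with the last bucket absorbing anything slower; read/write buckets accumulate 512-byte sector counts, flush buckets count bios. The debugfs mount point below is an assumption (debugfs is conventionally mounted at /sys/kernel/debug).

/*
 * Hypothetical helper, not part of the patch: print the non-empty buckets of
 * one latency histogram exported by the block_perf debugfs framework.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BLK_PERF_SIZE (1024 * 15)	/* must match the kernel definition */

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] :
		"/sys/kernel/debug/block_perf/read_hist";
	uint32_t *hist;
	size_t n, i;
	FILE *f;

	hist = calloc(BLK_PERF_SIZE, sizeof(*hist));
	if (!hist)
		return 1;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		free(hist);
		return 1;
	}

	/* One u32 bucket per millisecond of bio latency, clamped at the end. */
	n = fread(hist, sizeof(*hist), BLK_PERF_SIZE, f);
	fclose(f);

	for (i = 0; i < n; i++)
		if (hist[i])
			printf("%zu ms: %u\n", i, hist[i]);

	free(hist);
	return 0;
}

Gathering is off by default: writing 1 to the enable_perf node starts measurement, basic_perf returns the text summary (min/max/total times and sizes), and writing to clear_perf_stats resets everything; these node names come from blk_debugfs_init() above.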