From 73c5162aa362a543793f4a957c6c536dcbaa89ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 13:42:01 -0400 Subject: [PATCH 01/20] tracing: keep ring buffer to minimum size till used Impact: less memory impact on systems not using tracer When the kernel boots up that has tracing configured, it allocates the default size of the ring buffer. This currently happens to be 1.4Megs per possible CPU. This is quite a bit of wasted memory if the system is never using the tracer. The current solution is to keep the ring buffers to a minimum size until the user uses them. Once a tracer is piped into the current_tracer the ring buffer will be expanded to the default size. If the user changes the size of the ring buffer, it will take the size given by the user immediately. If the user adds a "ftrace=" to the kernel command line, then the ring buffers will be set to the default size on initialization. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 79 +++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 20 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c97947650ae..0c1dc1850858 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,6 +44,12 @@ unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; +/* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ +static int ring_buffer_expanded; + /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -128,6 +134,8 @@ static int __init set_ftrace(char *str) { strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; + /* We are using ftrace early, expand it */ + ring_buffer_expanded = 1; return 1; } __setup("ftrace=", set_ftrace); @@ -2315,6 +2323,40 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } +static int tracing_resize_ring_buffer(unsigned long size) +{ + int ret; + + /* + * If kernel or user changes the size of the ring buffer + * it get completed. + */ + ring_buffer_expanded = 1; + + ret = ring_buffer_resize(global_trace.buffer, size); + if (ret < 0) + return ret; + + ret = ring_buffer_resize(max_tr.buffer, size); + if (ret < 0) { + int r; + + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* AARGH! We are left with different + * size max buffer!!!! */ + WARN_ON(1); + tracing_disabled = 1; + } + return ret; + } + + global_trace.entries = size; + + return ret; +} + struct trace_option_dentry; static struct trace_option_dentry * @@ -2330,6 +2372,13 @@ static int tracing_set_tracer(const char *buf) struct tracer *t; int ret = 0; + if (!ring_buffer_expanded) { + ret = tracing_resize_ring_buffer(trace_buf_size); + if (ret < 0) + return ret; + ret = 0; + } + mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) @@ -2903,28 +2952,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, val <<= 10; if (val != global_trace.entries) { - ret = ring_buffer_resize(global_trace.buffer, val); + ret = tracing_resize_ring_buffer(val); if (ret < 0) { cnt = ret; goto out; } - - ret = ring_buffer_resize(max_tr.buffer, val); - if (ret < 0) { - int r; - cnt = ret; - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); - if (r < 0) { - /* AARGH! We are left with different - * size max buffer!!!! 
*/ - WARN_ON(1); - tracing_disabled = 1; - } - goto out; - } - - global_trace.entries = val; } filp->f_pos += cnt; @@ -3916,6 +3948,7 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; + int ring_buf_size; int i; int ret = -ENOMEM; @@ -3928,12 +3961,18 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; + /* To save memory, keep the ring buffer size to its minimum */ + if (ring_buffer_expanded) + ring_buf_size = trace_buf_size; + else + ring_buf_size = 1; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(trace_buf_size, + global_trace.buffer = ring_buffer_alloc(ring_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); @@ -3944,7 +3983,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(trace_buf_size, + max_tr.buffer = ring_buffer_alloc(ring_buf_size, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); From 1852fcce181faa237c010a3dbedb473cf9d4555f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 14:33:00 -0400 Subject: [PATCH 02/20] tracing: expand the ring buffers when an event is activated To save memory, the tracer ring buffers are set to a minimum. The activating of a trace expands the ring buffer size. This patch adds this expanding, when an event is activated. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 8 ++++++++ 3 files changed, 31 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c1dc1850858..35ee63ae4122 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2357,6 +2357,26 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +/** + * tracing_update_buffers - used by tracing facility to expand ring buffers + * + * To save on memory when the tracing is never used on a system with it + * configured in. The ring buffers are set to a minimum size. But once + * a user starts to use the tracing facility, then they need to grow + * to their default size. + * + * This function is to be called when a tracer is about to be used. 
+ */ +int tracing_update_buffers(void) +{ + int ret = 0; + + if (!ring_buffer_expanded) + ret = tracing_resize_ring_buffer(trace_buf_size); + + return ret; +} + struct trace_option_dentry; static struct trace_option_dentry * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..336324d717f8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -737,6 +737,9 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +/* set ring buffers to default size if not already done so */ +int tracing_update_buffers(void); + /* trace event type bit fields, not numeric */ enum { TRACE_EVENT_TYPE_PRINTF = 1, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 769dfd00fc85..ca624df73591 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -141,6 +141,10 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + ret = get_user(ch, ubuf++); if (ret) return ret; @@ -331,6 +335,10 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + switch (val) { case 0: case 1: From 9aba60fe6eb20453de53a572143bef22fa929fba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 19:52:30 -0400 Subject: [PATCH 03/20] tracing: fix trace_wait to know to wait on all cpus or just one Impact: fix to task live locking on reading trace_pipe on one CPU The same code is used for both trace_pipe (all CPUS) and the per_cpu trace_pipe file. When there is no data to read, it will check for signals and wait on the trace wait queue. The problem happens with the per_cpu wait. The trace_wait code checks all CPUs. Thus, if there's data in another CPU buffer, then it will exit the wait, without checking for signals or waiting on the wait queue. It would then try to read the empty buffer, and since that will just return nothing, then it will try to wait again. Unfortunately, that will again fail due to there still being data in the other buffers. This ends up with a live lock for the task. This patch fixes the trace_wait to be aware that the iterator may only be waiting on a single buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 35ee63ae4122..e60f4be10d64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1666,6 +1666,19 @@ static int trace_empty(struct trace_iterator *iter) { int cpu; + /* If we are looking at one CPU buffer, only check that one */ + if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + cpu = iter->cpu_file; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } + return 1; + } + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) { if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) From 554f786e284a6ce859d51f62240d615603944c8e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 22:00:13 -0400 Subject: [PATCH 04/20] ring-buffer: only allocate buffers for online cpus Impact: save on memory Currently, a ring buffer was allocated for each "possible_cpus". On some systems, this is the same as NR_CPUS. 
Thus, if a system defined NR_CPUS = 64 but it only had 1 CPU, we could have possibly 63 useless ring buffers taking up space. With a default buffer of 3 megs, this could be quite drastic. This patch changes the ring buffer code to only allocate ring buffers for online CPUs. If a CPU goes off line, we do not free the buffer. This is because the user may still have trace data in that buffer that they would like to look at. Perhaps in the future we could add code to delete a ring buffer if the CPU is offline and the ring buffer becomes empty. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 268 ++++++++++++++++++++++++++++++------- kernel/trace/trace.c | 6 - 2 files changed, 217 insertions(+), 57 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 178858492a89..d07c2888396f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "trace.h" @@ -301,6 +302,10 @@ struct ring_buffer { struct mutex mutex; struct ring_buffer_per_cpu **buffers; + +#ifdef CONFIG_HOTPLUG + struct notifier_block cpu_notify; +#endif }; struct ring_buffer_iter { @@ -459,6 +464,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) */ extern int ring_buffer_page_too_big(void); +#ifdef CONFIG_HOTPLUG +static int __cpuinit rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); +#endif + /** * ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. @@ -496,7 +506,8 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - cpumask_copy(buffer->cpumask, cpu_possible_mask); + get_online_cpus(); + cpumask_copy(buffer->cpumask, cpu_online_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -512,6 +523,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } +#ifdef CONFIG_HOTPLUG + buffer->cpu_notify.notifier_call = rb_cpu_notify; + buffer->cpu_notify.priority = 0; + register_cpu_notifier(&buffer->cpu_notify); +#endif + + put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -525,6 +543,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) fail_free_cpumask: free_cpumask_var(buffer->cpumask); + put_online_cpus(); fail_free_buffer: kfree(buffer); @@ -541,9 +560,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; + get_online_cpus(); + +#ifdef CONFIG_HOTPLUG + unregister_cpu_notifier(&buffer->cpu_notify); +#endif + for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + put_online_cpus(); + free_cpumask_var(buffer->cpumask); kfree(buffer); @@ -649,16 +676,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; mutex_lock(&buffer->mutex); + get_online_cpus(); nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); if (size < buffer_size) { /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) + goto out_fail; rm_pages = buffer->pages - nr_pages; @@ -677,10 +703,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. 
Otherwise we just free * them all and return -ENOMEM; */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) + goto out_fail; new_pages = nr_pages - buffer->pages; @@ -705,13 +729,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - if (RB_WARN_ON(buffer, !list_empty(&pages))) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, !list_empty(&pages))) + goto out_fail; out: buffer->pages = nr_pages; + put_online_cpus(); mutex_unlock(&buffer->mutex); return size; @@ -721,8 +744,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) list_del_init(&bpage->list); free_buffer_page(bpage); } + put_online_cpus(); mutex_unlock(&buffer->mutex); return -ENOMEM; + + /* + * Something went totally wrong, and we are too paranoid + * to even clean up the mess. + */ + out_fail: + put_online_cpus(); + mutex_unlock(&buffer->mutex); + return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1528,11 +1561,15 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); + out: + put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); @@ -1548,11 +1585,15 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); + out: + put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); @@ -1564,12 +1605,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret = 0; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; + goto out; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->entries; + ret = cpu_buffer->entries; + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -1581,12 +1629,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret = 0; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; + goto out; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->overrun; + ret = cpu_buffer->overrun; + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); @@ -1603,12 +1658,16 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) unsigned long entries = 0; int cpu; + get_online_cpus(); + /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += cpu_buffer->entries; } + put_online_cpus(); + return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); @@ -1626,12 +1685,16 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) unsigned long overruns = 0; int cpu; + get_online_cpus(); + /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += cpu_buffer->overrun; 
} + put_online_cpus(); + return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); @@ -1663,9 +1726,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -1900,9 +1968,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; - cpu_buffer = buffer->buffers[cpu]; again: @@ -2028,13 +2093,21 @@ struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_event *event = NULL; unsigned long flags; + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return event; } @@ -2071,24 +2144,31 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event = NULL; unsigned long flags; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + /* might be called in atomic */ + preempt_disable(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); if (!event) - goto out; + goto out_unlock; rb_advance_reader(cpu_buffer); - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + preempt_enable(); + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2109,15 +2189,17 @@ struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter *iter; + struct ring_buffer_iter *iter = NULL; unsigned long flags; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + goto out; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return NULL; + goto out; cpu_buffer = buffer->buffers[cpu]; @@ -2132,6 +2214,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2224,9 +2309,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; + int resched; + + /* Can't use get_online_cpus because this can be in atomic */ + resched = ftrace_preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2237,6 +2326,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); 
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2246,10 +2337,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct ring_buffer *buffer) { + int resched; int cpu; + /* Can't use get_online_cpus because this can be in atomic */ + resched = ftrace_preempt_disable(); + for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); + + ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset); @@ -2262,12 +2359,17 @@ int ring_buffer_empty(struct ring_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; + get_online_cpus(); + /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!rb_per_cpu_empty(cpu_buffer)) return 0; } + + put_online_cpus(); + return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2280,12 +2382,20 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + int ret = 1; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; + goto out; cpu_buffer = buffer->buffers[cpu]; - return rb_per_cpu_empty(cpu_buffer); + ret = rb_per_cpu_empty(cpu_buffer); + + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); @@ -2304,32 +2414,37 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; + int ret = -EINVAL; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - return -EINVAL; + goto out; /* At least make sure the two buffers are somewhat the same */ if (buffer_a->pages != buffer_b->pages) - return -EINVAL; + goto out; + + ret = -EAGAIN; if (ring_buffer_flags != RB_BUFFERS_ON) - return -EAGAIN; + goto out; if (atomic_read(&buffer_a->record_disabled)) - return -EAGAIN; + goto out; if (atomic_read(&buffer_b->record_disabled)) - return -EAGAIN; + goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; if (atomic_read(&cpu_buffer_a->record_disabled)) - return -EAGAIN; + goto out; if (atomic_read(&cpu_buffer_b->record_disabled)) - return -EAGAIN; + goto out; /* * We can't do a synchronize_sched here because this @@ -2349,7 +2464,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - return 0; + ret = 0; +out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -2464,27 +2583,32 @@ int ring_buffer_read_page(struct ring_buffer *buffer, u64 save_timestamp; int ret = -1; + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + /* * If len is not big enough to hold the page header, then * we can not copy anything. 
*/ if (len <= BUF_PAGE_HDR_SIZE) - return -1; + goto out; len -= BUF_PAGE_HDR_SIZE; if (!data_page) - return -1; + goto out; bpage = *data_page; if (!bpage) - return -1; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out; + goto out_unlock; event = rb_reader_event(cpu_buffer); @@ -2506,7 +2630,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int size; if (full) - goto out; + goto out_unlock; if (len > (commit - read)) len = (commit - read); @@ -2514,7 +2638,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, size = rb_event_length(event); if (len < size) - goto out; + goto out_unlock; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -2553,9 +2677,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } ret = read; - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return ret; } @@ -2629,3 +2756,42 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); + +#ifdef CONFIG_HOTPLUG +static int __cpuinit rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct ring_buffer *buffer = + container_of(self, struct ring_buffer, cpu_notify); + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (cpu_isset(cpu, *buffer->cpumask)) + return NOTIFY_OK; + + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %ld\n", + cpu); + return NOTIFY_OK; + } + smp_wmb(); + cpu_set(cpu, *buffer->cpumask); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Do nothing. + * If we were to free the buffer, then the user would + * lose any trace that was in the buffer. + */ + break; + default: + break; + } + return NOTIFY_OK; +} +#endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e60f4be10d64..14c98f6a47bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1805,17 +1805,11 @@ __tracing_open(struct inode *inode, struct file *file) iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); - - if (!iter->buffer_iter[cpu]) - goto fail_buffer; } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); - - if (!iter->buffer_iter[cpu]) - goto fail; } /* TODO stop tracer */ From a123c52b46a1f84bcec3dc963351896c6d6afaf7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:21:08 -0400 Subject: [PATCH 05/20] tracing: fix comments about trace buffer resizing Impact: cleanup Some of the comments about the trace buffer resizing is gobbledygook. And I wonder why people question if I'm a native English speaker. This patch makes the comments make a bit more sense. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c3946a6df34e..c61ee85c50bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2336,7 +2336,8 @@ static int tracing_resize_ring_buffer(unsigned long size) /* * If kernel or user changes the size of the ring buffer - * it get completed. + * we use the size that was given, and we can forget about + * expanding it later. 
*/ ring_buffer_expanded = 1; @@ -2351,8 +2352,20 @@ static int tracing_resize_ring_buffer(unsigned long size) r = ring_buffer_resize(global_trace.buffer, global_trace.entries); if (r < 0) { - /* AARGH! We are left with different - * size max buffer!!!! */ + /* + * AARGH! We are left with different + * size max buffer!!!! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the size of the main buffer, but failed to + * update the size of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ WARN_ON(1); tracing_disabled = 1; } From 1027fcb206a0fb8348e63aff078c74bdee1c2698 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:33:20 -0400 Subject: [PATCH 06/20] tracing: protect ring_buffer_expanded with trace_types_lock Impact: prevent races with ring_buffer_expanded This patch places the expanding of the tracing buffer under the protection of the trace_types_lock mutex. It is highly unlikely that there would be any contention, but better safe than sorry. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c61ee85c50bb..04ab8243a13d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2391,8 +2391,10 @@ int tracing_update_buffers(void) { int ret = 0; + mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) ret = tracing_resize_ring_buffer(trace_buf_size); + mutex_unlock(&trace_types_lock); return ret; } @@ -2412,6 +2414,8 @@ static int tracing_set_tracer(const char *buf) struct tracer *t; int ret = 0; + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) { ret = tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) @@ -2419,7 +2423,6 @@ static int tracing_set_tracer(const char *buf) ret = 0; } - mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) break; From 59222efe2d184956464abe5b637bc842ff053b93 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:46:03 -0400 Subject: [PATCH 07/20] ring-buffer: use CONFIG_HOTPLUG_CPU not CONFIG_HOTPLUG The hotplug code in the ring buffers is for use with CPU hotplug, not generic hotplug. 
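The distinction matters because CONFIG_HOTPLUG is the generic
"support for hot-pluggable devices" option and is set on virtually
every configuration, while CPU hotplug has its own symbol,
CONFIG_HOTPLUG_CPU. As a sketch, the guarded registration from
patch 04 above therefore now reads (same code, corrected symbol):

#ifdef CONFIG_HOTPLUG_CPU
	buffer->cpu_notify.notifier_call = rb_cpu_notify;
	buffer->cpu_notify.priority = 0;
	register_cpu_notifier(&buffer->cpu_notify);
#endif
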
Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d07c2888396f..035b56c3a6c9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -303,7 +303,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif }; @@ -464,7 +464,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) */ extern int ring_buffer_page_too_big(void); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); #endif @@ -523,7 +523,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; register_cpu_notifier(&buffer->cpu_notify); @@ -562,7 +562,7 @@ ring_buffer_free(struct ring_buffer *buffer) get_online_cpus(); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU unregister_cpu_notifier(&buffer->cpu_notify); #endif @@ -2757,7 +2757,7 @@ static __init int rb_init_debugfs(void) fs_initcall(rb_init_debugfs); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { From 8aabee573dff131a085c63de7667eacd94ba4ccb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 13:13:49 -0400 Subject: [PATCH 08/20] ring-buffer: remove unneeded get_online_cpus Impact: speed up and remove possible races The get_online_cpus was added to the ring buffer because the original design would free the ring buffer on a CPU that was being taken off line. The final design kept the ring buffer around even when the CPU was taken off line. This is to allow a user to still read the information on that ring buffer. Most of the get_online_cpus are no longer needed since the ring buffer will not disappear from the use cases. 
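With the buffers persisting across offline events, the per-cpu
accessors only need the cheap cpumask test. Condensed from the diff
below, ring_buffer_entries_cpu() ends up as:

	if (!cpumask_test_cpu(cpu, buffer->cpumask))
		return 0;

	cpu_buffer = buffer->buffers[cpu];
	ret = cpu_buffer->entries;

	return ret;
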
Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 90 ++++++-------------------------------- 1 file changed, 14 insertions(+), 76 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 035b56c3a6c9..2c36be9fac2e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1561,15 +1561,11 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); - out: - put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); @@ -1585,15 +1581,11 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); - out: - put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); @@ -1605,17 +1597,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret = 0; - - get_online_cpus(); + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; cpu_buffer = buffer->buffers[cpu]; ret = cpu_buffer->entries; - out: - put_online_cpus(); return ret; } @@ -1629,17 +1617,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret = 0; - - get_online_cpus(); + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; cpu_buffer = buffer->buffers[cpu]; ret = cpu_buffer->overrun; - out: - put_online_cpus(); return ret; } @@ -1658,16 +1642,12 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) unsigned long entries = 0; int cpu; - get_online_cpus(); - /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += cpu_buffer->entries; } - put_online_cpus(); - return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); @@ -1685,16 +1665,12 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) unsigned long overruns = 0; int cpu; - get_online_cpus(); - /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += cpu_buffer->overrun; } - put_online_cpus(); - return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); @@ -2093,21 +2069,16 @@ struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event = NULL; + struct ring_buffer_event *event; unsigned long flags; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - put_online_cpus(); - return event; } @@ -2189,17 +2160,15 @@ struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter *iter = NULL; + struct 
ring_buffer_iter *iter; unsigned long flags; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - goto out; + return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2214,9 +2183,6 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - put_online_cpus(); - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2309,13 +2275,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - int resched; - - /* Can't use get_online_cpus because this can be in atomic */ - resched = ftrace_preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2326,8 +2288,6 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2337,16 +2297,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct ring_buffer *buffer) { - int resched; int cpu; - /* Can't use get_online_cpus because this can be in atomic */ - resched = ftrace_preempt_disable(); - for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); - - ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset); @@ -2359,8 +2313,6 @@ int ring_buffer_empty(struct ring_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; - get_online_cpus(); - /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -2368,8 +2320,6 @@ int ring_buffer_empty(struct ring_buffer *buffer) return 0; } - put_online_cpus(); - return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2382,18 +2332,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - int ret = 1; - - get_online_cpus(); + int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 1; cpu_buffer = buffer->buffers[cpu]; ret = rb_per_cpu_empty(cpu_buffer); - out: - put_online_cpus(); return ret; } @@ -2416,8 +2362,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; @@ -2466,8 +2410,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, ret = 0; out: - put_online_cpus(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -2583,8 +2525,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, u64 save_timestamp; int ret = -1; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; @@ -2681,8 +2621,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: - put_online_cpus(); - return ret; } From db526ca329f855510e8ce672332eba3304aed590 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 13:53:25 -0400 Subject: [PATCH 09/20] tracing: show that buffer size is not expanded Impact: do not confuse user on small trace buffer sizes When the system boots up, the trace buffer is small to conserve 
memory. It is only two pages per online CPU. When the tracer is
used, it expands to the default value.

This can confuse the user if they look at the buffer size and see only
7, but then later they see 1408.

 # cat /debug/tracing/buffer_size_kb
7
 # echo sched_switch > /debug/tracing/current_tracer
 # cat /debug/tracing/buffer_size_kb
1408

This patch tries to help remove this confusion by showing that the
buffer has not been expanded.

 # cat /debug/tracing/buffer_size_kb
7 (expanded: 1408)

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 04ab8243a13d..62a63b2b33dd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2948,10 +2948,18 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		      size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
-	char buf[64];
+	char buf[96];
 	int r;
 
-	r = sprintf(buf, "%lu\n", tr->entries >> 10);
+	mutex_lock(&trace_types_lock);
+	if (!ring_buffer_expanded)
+		r = sprintf(buf, "%lu (expanded: %lu)\n",
+			    tr->entries >> 10,
+			    trace_buf_size >> 10);
+	else
+		r = sprintf(buf, "%lu\n", tr->entries >> 10);
+	mutex_unlock(&trace_types_lock);
+
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }

From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 12 Mar 2009 18:24:49 +0100
Subject: [PATCH 10/20] tracing/core: bring back raw trace_printk for dynamic
 format strings

Impact: fix callsites with dynamic format strings

Since its new binary implementation, trace_printk() internally uses
static containers for the format strings at each callsite. But the
value is assigned once at build time, which means that it can't take
dynamic formats.

So this patch unearths the raw trace_printk implementation for the
callers that will need trace_printk to be able to carry these dynamic
format strings. The trace_printk() macro will use the appropriate
implementation for each callsite. Most of the time, however, the
binary implementation will still be used.

The other impact of this patch is that mmiotrace_printk() will use
the old implementation because it calls the low level trace_vprintk
and we can't guess here whether the format passed in it is dynamic
or not.

Some parts of this patch have been written by Steven Rostedt (most
notably the part that chooses the appropriate implementation for
each callsite).

Signed-off-by: Frederic Weisbecker
Signed-off-by: Steven Rostedt
---
 include/linux/kernel.h               | 40 ++++++++-----
 kernel/trace/trace.c                 | 85 ++++++++++++++++++++++++++--
 kernel/trace/trace.h                 | 13 ++++-
 kernel/trace/trace_event_types.h     | 11 +++-
 kernel/trace/trace_functions_graph.c |  6 +-
 kernel/trace/trace_mmiotrace.c       |  7 +--
 kernel/trace/trace_output.c          | 57 +++++++++++++++++--
 kernel/trace/trace_printk.c          | 35 ++++++++++--
 8 files changed, 214 insertions(+), 40 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7742798c9208..1daca3b062bb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -452,31 +452,45 @@ do {							\
 
 #define trace_printk(fmt, args...)
\ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ - \ __trace_printk_check_format(fmt, ##args); \ - __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) +extern int +__trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ #define ftrace_vprintk(fmt, vargs) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ - \ - __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ } while (0) +extern int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62a63b2b33dd..dbb077d8a172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace) /** - * trace_vprintk - write binary msg to tracing buffer + * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; + struct bprint_entry *entry; unsigned long flags; int resched; int cpu, len = 0, size, pc; @@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1240,6 +1240,60 @@ out: return len; } +EXPORT_SYMBOL_GPL(trace_vbprintk); + +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + pc = preempt_count(); + 
preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + pause_graph_tracing(); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; + + size = sizeof(*entry) + len + 1; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event); + + out_unlock: + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); + unpause_graph_tracing(); + out: + preempt_enable_notrace(); + + return len; +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { @@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_bprintk_msg_only(iter); + if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 336324d717f8..cede1ab49d07 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -117,7 +118,7 @@ struct userstack_entry { /* * trace_printk entry: */ -struct print_entry { +struct bprint_entry { struct trace_entry ent; unsigned long ip; int depth; @@ -125,6 +126,13 @@ struct print_entry { u32 buf[]; }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + int depth; + char buf[]; +}; + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int +trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +extern int 
trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5cca4c978bde..d0907d746425 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) @@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TP_RAW_FMT("%08lx (%d) fmt:%p %s") +); + TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8566c14b3e9a..4c388607ed67 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct print_entry *trace, struct trace_seq *s, +print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { int i; @@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_PRINT: { - struct print_entry *field; + case TRACE_BPRINT: { + struct bprint_entry *field; trace_assign_type(field, entry); return print_graph_comment(field, s, entry, iter); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 23e346a734ca..f095916e477f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); @@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. 
*/ - ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_bprintf(s, print->fmt, print->buf); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 491832af9ba1..ea9d3b410c7a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = { .binary = trace_special_bin, }; -/* TRACE_PRINT */ +/* TRACE_BPRINT */ static enum print_line_t -trace_print_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct print_entry *field; + struct bprint_entry *field; trace_assign_type(field, entry); @@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags) } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags) { - struct print_entry *field; + struct bprint_entry *field; struct trace_seq *s = &iter->seq; trace_assign_type(field, iter->ent); @@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) } +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = { &trace_special_event, &trace_stack_event, &trace_user_stack_event, + &trace_bprint_event, &trace_print_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a50aea22e929..f307a11e2332 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; -int __trace_printk(unsigned long ip, const char *fmt, ...) +int __trace_bprintk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -107,6 +107,33 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) 
if (unlikely(!fmt)) return 0; + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; @@ -118,10 +145,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) - { - if (unlikely(!fmt)) - return 0; - +{ if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; @@ -129,7 +153,6 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); - static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); From 828275574e0161bdddb5817d4bd76a0265ef0470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:14:31 -0400 Subject: [PATCH 11/20] tracing: make bprint event use the proper event id The bprint record is using TRACE_PRINT when it should be TRACE_BPRINT. Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d0907d746425..019915063fe6 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) From e9fb2b6d5845e24f104713591286b6f39761c027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:19:25 -0400 Subject: [PATCH 12/20] tracing: have event_trace_printk use static tracer Impact: speed up on event tracing The event_trace_printk is currently a wrapper function that calls trace_vprintk. Because it uses a variable for the fmt it misses out on the optimization of using the binary printk. This patch makes event_trace_printk into a macro wrapper to use the fmt as the same as the trace_printks. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 17 +++++++++++++++++ kernel/trace/trace_events.c | 10 ---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cede1ab49d07..35cfa7bbaf38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -773,4 +773,21 @@ void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +extern const char *__start___trace_bprintk_fmt[]; +extern const char *__stop___trace_bprintk_fmt[]; + +#define event_trace_printk(ip, fmt, args...) 
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		__attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ca624df73591..238ea95a4115 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,16 +24,6 @@ static DEFINE_MUTEX(event_mutex);
 	    (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	    event++)
 
-void event_trace_printk(unsigned long ip, const char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	tracing_record_cmdline(current);
-	trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-	va_end(ap);
-}
-
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;

From 7975a2be16dd42df2cab80c80cb6ece382edb6ec Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 12 Mar 2009 14:23:17 -0400
Subject: [PATCH 13/20] tracing: export trace formats to user space

The binary printk saves a pointer to the format string in the ring
buffer. On output, the format is processed. But if the user is reading
the ring buffer through a binary interface, the pointer is meaningless.

This patch creates a file called printk_formats that maps the pointers
to the formats.

 # cat /debug/tracing/printk_formats
 0xffffffff80713d40 : "irq_handler_entry: irq=%d handler=%s\n"
 0xffffffff80713d48 : "lock_acquire: %s%s%s\n"
 0xffffffff80713d50 : "lock_release: %s\n"

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_printk.c | 119 ++++++++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f307a11e2332..486785214e3e 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -4,18 +4,19 @@
  * Copyright (C) 2008 Lai Jiangshan
  *
  */
+#include
+#include
+#include
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
-#include
 #include
-#include
-#include
 #include
-#include
-#include
 
 #include "trace.h"
@@ -153,6 +154,114 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	const char **fmt = m->private;
+	const char **next = fmt;
+
+	(*pos)++;
+
+	if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
+		return NULL;
+
+	next = fmt;
+	m->private = ++next;
+
+	return fmt;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	return t_next(m, NULL, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	const char **fmt = v;
+	const char *str = *fmt;
+	int i;
+
+	seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
+
+	/*
+	 * Tabs and new lines need to be converted.
+	 */
+	for (i = 0; str[i]; i++) {
+		switch (str[i]) {
+		case '\n':
+			seq_puts(m, "\\n");
+			break;
+		case '\t':
+			seq_puts(m, "\\t");
+			break;
+		case '\\':
+			seq_puts(m, "\\\\");
+			break;
+		case '"':
+			seq_puts(m, "\\\"");
+			break;
+		default:
+			seq_putc(m, str[i]);
+		}
+	}
+	seq_puts(m, "\"\n");
+
+	return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations show_format_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static int
+ftrace_formats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &show_format_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = __start___trace_bprintk_fmt;
+	}
+	return ret;
+}
+
+static const struct file_operations ftrace_formats_fops = {
+	.open = ftrace_formats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static __init int init_trace_printk_function_export(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+				    NULL, &ftrace_formats_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'printk_formats' entry\n");
+
+	return 0;
+}
+
+fs_initcall(init_trace_printk_function_export);
+
 static __init int init_trace_printk(void)
 {
 	return register_module_notifier(&module_trace_bprintk_format_nb);

From 2da03ecee6308ea174e8a02b92a3c4ec92e886c8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 12 Mar 2009 18:57:51 -0400
Subject: [PATCH 14/20] tracing: fix stack tracer header

The stack tracer used to look like this:

 # cat /debug/tracing/stack_trace
         Depth   Size      Location    (57 entries)
         -----   ----      --------
   0)     5088      16   mempool_alloc_slab+0x16/0x18
   1)     5072     144   mempool_alloc+0x4d/0xfe
   2)     4928      16   scsi_sg_alloc+0x48/0x4a [scsi_mod]

Now it looks like this:

 # cat /debug/tracing/stack_trace
         Depth    Size   Location    (57 entries)
         -----    ----   --------
   0)     5088      16   mempool_alloc_slab+0x16/0x18
   1)     5072     144   mempool_alloc+0x4d/0xfe
   2)     4928      16   scsi_sg_alloc+0x48/0x4a [scsi_mod]

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_stack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc0aca5..4564fd94b0cd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -251,9 +251,9 @@ static int t_show(struct seq_file *m, void *v)
 	int size;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(m, "        Depth   Size      Location"
+		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
-			   "        -----   ----      --------\n",
+			   "        -----    ----   --------\n",
 			   max_stack_trace.nr_entries);
 		return 0;
 	}

From e447e1df2e568cd43d1918963c9f09fae85aea57 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 12 Mar 2009 19:42:29 -0400
Subject: [PATCH 15/20] tracing: explain why stack tracer is empty

If the stack tracing is disabled (by default) the stack_trace file
will only contain the header:

 # cat /debug/tracing/stack_trace
         Depth    Size   Location    (0 entries)
         -----    ----   --------

This can be frustrating to a developer who does not realize that the
stack tracer is disabled.
This patch adds the following text:

 # cat /debug/tracing/stack_trace
         Depth    Size   Location    (0 entries)
         -----    ----   --------
 #
 #  Stack tracer disabled
 #
 # To enable the stack tracer, either add 'stacktrace' to the
 # kernel command line
 # or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'
 #

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_stack.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4564fd94b0cd..91ccbf396c9a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -245,6 +245,17 @@ static int trace_lookup_stack(struct seq_file *m, long i)
 #endif
 }
 
+static void print_disabled(struct seq_file *m)
+{
+	seq_puts(m, "#\n"
+		 "#  Stack tracer disabled\n"
+		 "#\n"
+		 "# To enable the stack tracer, either add 'stacktrace' to the\n"
+		 "# kernel command line\n"
+		 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
+		 "#\n");
+}
+
 static int t_show(struct seq_file *m, void *v)
 {
 	long i;
@@ -255,6 +266,10 @@ static int t_show(struct seq_file *m, void *v)
 			   "    (%d entries)\n"
 			   "        -----    ----   --------\n",
 			   max_stack_trace.nr_entries);
+
+		if (!stack_tracer_enabled && !max_stack_size)
+			print_disabled(m);
+
 		return 0;
 	}

From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001
From: Jason Baron
Date: Thu, 12 Mar 2009 14:33:36 -0400
Subject: [PATCH 16/20] tracing: tracepoints for softirq entry/exit - add softirq-to-name array

Create a 'softirq_to_name' array, which is indexed by softirq #, so
that we can easily convert between the softirq index # and its name,
in order to get more meaningful output messages.

LKML-Reference: <20090312183336.GB3352@redhat.com>
Signed-off-by: Jason Baron
Signed-off-by: Steven Rostedt
---
 include/linux/interrupt.h | 5 +++++
 kernel/softirq.c | 9 ++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 472f11765f60..9b7e9d743476 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -258,6 +258,11 @@ enum
 	NR_SOFTIRQS
 };
 
+/* map softirq index to softirq name. update 'softirq_to_name' in
+ * kernel/softirq.c when adding a new softirq.
+ */
+extern char *softirq_to_name[NR_SOFTIRQS];
+
 /* softirq mask and active fields moved to irq_cpustat_t in
  * asm/hardirq.h to get better cache usage.  KAO
 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7571bcb71be4..9f90fdc039f4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
+char *softirq_to_name[NR_SOFTIRQS] = {
+	"HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ",
+	"BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ",
+	"RCU_SOFTIRQ"
+};
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -209,9 +215,10 @@ restart:
 		h->action(h);
 
 		if (unlikely(prev_count != preempt_count())) {
-			printk(KERN_ERR "huh, entered softirq %td %p"
+			printk(KERN_ERR "huh, entered softirq %td %s %p "
 			       "with preempt_count %08x,"
 			       " exited with %08x?\n", h - softirq_vec,
+			       softirq_to_name[h - softirq_vec],
 			       h->action, prev_count, preempt_count());
 			preempt_count() = prev_count;
 		}

From 39842323ceb368d2ea36ab7696aedbe296e13b61 Mon Sep 17 00:00:00 2001
From: Jason Baron
Date: Thu, 12 Mar 2009 14:36:03 -0400
Subject: [PATCH 17/20] tracing: tracepoints for softirq entry/exit - tracepoints

Introduce softirq entry/exit tracepoints. These are useful for
augmenting existing tracers, and to figure out softirq frequencies and
timings.

[ s/irq_softirq_/softirq_/ for trace point names and
  Fixed printf format in TRACE_FORMAT macro - Steven Rostedt ]

LKML-Reference: <20090312183603.GC3352@redhat.com>
Signed-off-by: Jason Baron
Signed-off-by: Steven Rostedt
---
 include/trace/irq_event_types.h | 12 ++++++++++++
 kernel/softirq.c | 7 ++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
index 214bb928fe9e..85964ebd47ec 100644
--- a/include/trace/irq_event_types.h
+++ b/include/trace/irq_event_types.h
@@ -40,4 +40,16 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+TRACE_FORMAT(softirq_entry,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
+
+TRACE_FORMAT(softirq_exit,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
+
 #undef TRACE_SYSTEM
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f90fdc039f4..a5e81231ca7a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 
 /*
@@ -186,6 +187,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
+DEFINE_TRACE(softirq_entry);
+DEFINE_TRACE(softirq_exit);
+
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;
@@ -212,8 +216,9 @@ restart:
 		if (pending & 1) {
 			int prev_count = preempt_count();
 
+			trace_softirq_entry(h, softirq_vec);
 			h->action(h);
-
+			trace_softirq_exit(h, softirq_vec);
 			if (unlikely(prev_count != preempt_count())) {
 				printk(KERN_ERR "huh, entered softirq %td %s %p "
 				       "with preempt_count %08x,"

From 889a6c367283709a80dad9413488472596a1a1d2 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Fri, 13 Mar 2009 09:03:04 +0900
Subject: [PATCH 18/20] tracing: Don't use tracing_record_cmdline() in workqueue tracer fix

commit c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 "Don't use
tracing_record_cmdline() in workqueue tracer" has a race window:
find_task_by_vpid() must be called with tasklist_lock or
rcu_read_lock() held, and the workqueue tracer called it with neither.
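The fix below instead pins both the struct pid and the task with a
reference before dereferencing them. A minimal stand-alone sketch of
that pattern (the helper name print_task_comm() and its seq_file
argument are illustrative only, not part of this patch):

	/*
	 * Look up a task by pid number and print its comm without racing
	 * against task exit: find_get_pid() and get_pid_task() each take
	 * a reference, so neither structure can be freed while in use.
	 */
	static void print_task_comm(struct seq_file *s, pid_t nr)
	{
		struct pid *pid = find_get_pid(nr);
		struct task_struct *tsk;

		if (!pid)
			return;
		tsk = get_pid_task(pid, PIDTYPE_PID);
		if (tsk) {
			seq_printf(s, "%s\n", tsk->comm);
			put_task_struct(tsk);	/* drop the task reference */
		}
		put_pid(pid);			/* drop the pid reference */
	}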
LKML-Reference: <20090313090042.43CD.A69D9226@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_workqueue.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index fb5ccac8bbc0..9ab035b58cf1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -193,12 +193,20 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct cpu_workqueue_stats *cws = p;
 	unsigned long flags;
 	int cpu = cws->cpu;
-	struct task_struct *tsk = find_task_by_vpid(cws->pid);
+	struct pid *pid;
+	struct task_struct *tsk;
 
-	seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
-		   atomic_read(&cws->inserted),
-		   cws->executed,
-		   tsk ? tsk->comm : "<...>");
+	pid = find_get_pid(cws->pid);
+	if (pid) {
+		tsk = get_pid_task(pid, PIDTYPE_PID);
+		if (tsk) {
+			seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
+				   atomic_read(&cws->inserted), cws->executed,
+				   tsk->comm);
+			put_task_struct(tsk);
+		}
+		put_pid(pid);
+	}
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)

From f28e55765e40450c127e44d00ae65d0cd1a4efec Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 12 Mar 2009 22:00:19 -0400
Subject: [PATCH 19/20] tracing: show event name in trace for TRACE_EVENT created events

Unlike TRACE_FORMAT() macros, the TRACE_EVENT() macros do not show
the event name in the trace file. Knowing the event type in the trace
output is very useful.

Instead of:

 task swapper:0 [140] ==> ntpd:3308 [120]

We now have:

 sched_switch: task swapper:0 [140] ==> ntpd:3308 [120]

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_events_stage_2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index ca347afd6aa0..5117c43f5c67 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -57,7 +57,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	ret = trace_seq_printf(s, print);				\
+	ret = trace_seq_printf(s, #call ": " print);			\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\

From 5cc985488845ec7227a2c5cfd2fd62cf57fb411a Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 12 Mar 2009 22:24:17 -0400
Subject: [PATCH 20/20] ring-buffer: document reader page design

In a private email conversation I explained how the ring buffer page
worked by using silly ASCII art. Ingo suggested that I add that to the
comments of the code. Here it is.

Requested-by: Ingo Molnar
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2c36be9fac2e..58128ad2fde0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -21,6 +21,74 @@
 
 #include "trace.h"
 
+/*
+ * The ring buffer is made up of a list of pages. A separate list of pages is
+ * allocated for each CPU. A writer may only write to a buffer that is
+ * associated with the CPU it is currently executing on. A reader may read
+ * from any per cpu buffer.
+ *
+ * The reader is special. For each per cpu buffer, the reader has its own
+ * reader page. When a reader has read the entire reader page, this reader
+ * page is swapped with another page in the ring buffer.
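+ * (The "swap" exchanges the reader page with the ring-buffer page the
+ * reader will consume next: that page becomes the new reader page, and
+ * the old reader page takes its place in the ring, as the ASCII art
+ * below shows.)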
+ *
+ * Now, as long as the writer is off the reader page, the reader can do
+ * whatever it wants with that page. The writer will never write to that
+ * page again (as long as it is out of the ring buffer).
+ *
+ * Here's some silly ASCII art.
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |
+ *   +------+        +---+   +---+   +---+
+ *                   |   |-->|   |-->|   |
+ *                   +---+   +---+   +---+
+ *                     ^               |
+ *                     |               |
+ *                     +---------------+
+ *
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *                   |   |-->|   |-->|   |
+ *                   +---+   +---+   +---+
+ *                     ^               |
+ *                     |               |
+ *                     +---------------+
+ *
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *      ^            |   |-->|   |-->|   |
+ *      |            +---+   +---+   +---+
+ *      |                              |
+ *      |                              |
+ *      +------------------------------+
+ *
+ *
+ *   +------+
+ *   |buffer|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *      ^            |   |   |   |-->|   |
+ *      |   New      +---+   +---+   +---+
+ *      |  Reader------^               |
+ *      |   page                       |
+ *      +------------------------------+
+ *
+ *
+ * After we make this swap, the reader can hand this page off to the splice
+ * code and be done with it. It can even allocate a new page if it needs to
+ * and swap that into the ring buffer.
+ *
+ * We will be using cmpxchg soon to make all this lockless.
+ *
+ */
+
 /*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers