From bc44fb5f7d3e764ed7698c835a1a0f35aba2eb3d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:42:18 +0100 Subject: [PATCH 001/900] x86, bts: detect size of DS fields Impact: more robust DS feature enumeration Detect the size of the pointer-type fields in the DS area configuration via the DTES64 features rather than based on the cpuid. Rename a variable to denote that size to reflect that it only covers the pointer-type fields. Add more boot-time diagnostics giving the detected size and the sizes of BTS and PEBS records. Use the size of the BTS/PEBS record to indicate that the respective feature is not available (if the record size is zero). Signed-off-by: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 84 +++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3a765..6e5ec679a0c 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -39,7 +39,7 @@ struct ds_configuration { /* the size of one pointer-typed field in the DS structure and in the BTS and PEBS buffers in bytes; this covers the first 8 DS fields related to buffer management. 
*/ - unsigned char sizeof_field; + unsigned char sizeof_ptr_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; /* a series of bit-masks to control various features indexed @@ -142,14 +142,14 @@ enum ds_qualifier { static inline unsigned long ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } static inline void ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, unsigned long value) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; } @@ -410,7 +410,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, * Later architectures use 64bit pointers throughout, whereas earlier * architectures use 32bit pointers in 32bit mode. * - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on: * - the field size stored in the DS configuration * - the relative field position * @@ -441,13 +441,13 @@ enum bts_field { static inline unsigned long bts_get(const char *base, enum bts_field field) { - base += (ds_cfg.sizeof_field * field); + base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } static inline void bts_set(char *base, enum bts_field field, unsigned long val) { - base += (ds_cfg.sizeof_field * field);; + base += (ds_cfg.sizeof_ptr_field * field);; (*(unsigned long *)base) = val; } @@ -593,6 +593,10 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, struct ds_context *context; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.sizeof_rec[qual]) + goto out; + error = -EINVAL; if (!base) goto out; @@ -635,10 +639,6 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - error = 
-EOPNOTSUPP; - if (!ds_cfg.ctl[dsf_bts]) - goto out; - /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -848,7 +848,8 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)); return &tracer->trace; } @@ -884,7 +885,8 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) if (!tracer) return -EINVAL; - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)) = value; return 0; } @@ -894,52 +896,54 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), - - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, - .sizeof_rec[ds_pebs] = 8 * 18, }; static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, + struct cpuinfo_x86 *cpu) { + unsigned long nr_pebs_fields = 0; + + printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ + nr_pebs_fields 
= 10; +#else + nr_pebs_fields = 18; +#endif + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + ds_cfg.sizeof_ptr_field = + (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); - if (!cpu_has_bts) { - ds_cfg.ctl[dsf_bts] = 0; + ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; + ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; + + if (!cpu_has(cpu, X86_FEATURE_BTS)) { + ds_cfg.sizeof_rec[ds_bts] = 0; printk(KERN_INFO "[ds] bts not available\n"); } - if (!cpu_has_pebs) + if (!cpu_has(cpu, X86_FEATURE_PEBS)) { + ds_cfg.sizeof_rec[ds_pebs] = 0; printk(KERN_INFO "[ds] pebs not available\n"); + } + + printk(KERN_INFO "[ds] sizes: address: %u bit, ", + 8 * ds_cfg.sizeof_ptr_field); + printk("bts/pebs record: %u/%u bytes\n", + ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } @@ -951,12 +955,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x9: case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_pentium_m, c); break; case 0xf: case 0x17: /* Core2 */ case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom); + ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* i7 */ default: @@ -969,7 +973,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_netburst, c); break; default: /* sorry, don't know about them */ From 8a327f6d1b05f5ce16572b4413a5df1d0e872283 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:45:07 +0100 Subject: [PATCH 002/900] x86, bts: add selftest for BTS Perform a selftest of branch trace store when a cpu is initialized. WARN and disable branch trace store support if the selftest fails. 
Signed-off-by: Markus Metzger LKML-Reference: <20090313104507.A30125@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 9 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ds.c | 21 +++ arch/x86/kernel/ds_selftest.c | 241 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/ds_selftest.h | 15 +++ 5 files changed, 287 insertions(+) create mode 100644 arch/x86/kernel/ds_selftest.c create mode 100644 arch/x86/kernel/ds_selftest.h diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fdb45df608b..dfd74abc03f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -175,6 +175,15 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config X86_DS_SELFTEST + bool "DS selftest" + default y + depends on DEBUG_KERNEL + depends on X86_DS + ---help--- + Perform Debug Store selftests at boot time. + If in doubt, say "N". + config HAVE_MMIOTRACE_SUPPORT def_bool y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35648e..a0c9e138b00 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,7 @@ obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o obj-$(CONFIG_X86_DS) += ds.o +obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 6e5ec679a0c..51c936c1a39 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -29,6 +29,7 @@ #include #include +#include "ds_selftest.h" /* * The configuration for a particular DS hardware implementation. @@ -940,6 +941,26 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. 
disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 00000000000..8c46fbf38c4 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,241 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#include "ds_selftest.h" + +#include +#include + +#include + + +#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ + + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ + int error = 0; + + if (!trace) { + printk(KERN_CONT "failed to access trace..."); + /* Bail out. Other tests are pointless. */ + return -1; + } + + if (!trace->read) { + printk(KERN_CONT "bts read not available..."); + error = -1; + } + + /* Do some sanity checks on the trace configuration. */ + if (!trace->ds.n) { + printk(KERN_CONT "empty bts buffer..."); + error = -1; + } + if (!trace->ds.size) { + printk(KERN_CONT "bad bts trace setup..."); + error = -1; + } + if (trace->ds.end != + (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { + printk(KERN_CONT "bad bts buffer setup..."); + error = -1; + } + if ((trace->ds.top < trace->ds.begin) || + (trace->ds.end <= trace->ds.top)) { + printk(KERN_CONT "bts top out of bounds..."); + error = -1; + } + + return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, + const struct bts_trace *trace, + const void *from, const void *to) +{ + const unsigned char *at; + + /* + * Check a few things which do not belong to this test. + * They should be covered by other tests. 
+ */ + if (!trace) + return -1; + + if (!trace->read) + return -1; + + if (to < from) + return -1; + + if (from < trace->ds.begin) + return -1; + + if (trace->ds.end < to) + return -1; + + if (!trace->ds.size) + return -1; + + /* Now to the test itself. */ + for (at = from; (void *)at < to; at += trace->ds.size) { + struct bts_struct bts; + size_t index; + int error; + + if (((void *)at - trace->ds.begin) % trace->ds.size) { + printk(KERN_CONT + "read from non-integer index..."); + return -1; + } + index = ((void *)at - trace->ds.begin) / trace->ds.size; + + memset(&bts, 0, sizeof(bts)); + error = trace->read(tracer, at, &bts); + if (error < 0) { + printk(KERN_CONT + "error reading bts trace at [%lu] (0x%p)...", + index, at); + return error; + } + + switch (bts.qualifier) { + case BTS_BRANCH: + break; + default: + printk(KERN_CONT + "unexpected bts entry %llu at [%lu] (0x%p)...", + bts.qualifier, index, at); + return -1; + } + } + + return 0; +} + +int ds_selftest_bts(void) +{ + const struct bts_trace *trace; + struct bts_tracer *tracer; + int error = 0; + void *top; + unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + + printk(KERN_INFO "[ds] bts selftest..."); + + tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + if (IS_ERR(tracer)) { + error = PTR_ERR(tracer); + tracer = NULL; + + printk(KERN_CONT + "initialization failed (err: %d)...", error); + goto out; + } + + /* The return should already give us enough trace. */ + ds_suspend_bts(tracer); + + /* Let's see if we can access the trace. */ + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + /* If everything went well, we should have a few trace entries. */ + if (trace->ds.top == trace->ds.begin) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning, but continue. 
+ */ + printk(KERN_CONT "no trace/overflow..."); + } + + /* Let's try to read the trace we collected. */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.top); + if (error < 0) + goto out; + + /* + * Let's read the trace again. + * Since we suspended tracing, we should get the same result. + */ + top = trace->ds.top; + + trace = ds_read_bts(tracer); + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (top != trace->ds.top) { + printk(KERN_CONT "suspend not working..."); + error = -1; + goto out; + } + + /* Let's collect some more trace - see if resume is working. */ + ds_resume_bts(tracer); + ds_suspend_bts(tracer); + + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (trace->ds.top == top) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning and check the full trace. + */ + printk(KERN_CONT + "no resume progress/overflow..."); + + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else if (trace->ds.top < top) { + /* + * We had a buffer overflow - the entire buffer should + * contain trace records. + */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else { + /* + * It is quite likely that the buffer did not overflow. + * Let's just check the delta trace. + */ + error = ds_selftest_bts_read(tracer, trace, + top, trace->ds.top); + } + if (error < 0) + goto out; + + error = 0; + + /* The final test: release the tracer while tracing is suspended. */ + out: + ds_release_bts(tracer); + + printk(KERN_CONT "%s.\n", (error ? 
"failed" : "passed")); + + return error; +} + +int ds_selftest_pebs(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 00000000000..0e6e19d4c7d --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif /* CONFIG_X86_DS_SELFTEST */ From b8e47195451c5d3f62620b2b1b5928669afd56eb Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:46:42 +0100 Subject: [PATCH 003/900] x86, bts: correct comment style in ds.c Correct the comment style in ds.c. Signed-off-by: Markus Metzger LKML-Reference: <20090313104642.A30149@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 79 ++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 51c936c1a39..d9cab716805 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -35,25 +35,22 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the name of the configuration */ + /* The name of the configuration. */ const char *name; - /* the size of one pointer-typed field in the DS structure and - in the BTS and PEBS buffers in bytes; - this covers the first 8 DS fields related to buffer management. */ + /* The size of pointer-typed fields in DS, BTS, and PEBS. */ unsigned char sizeof_ptr_field; - /* the size of a BTS/PEBS record in bytes */ + /* The size of a BTS/PEBS record in bytes. 
*/ unsigned char sizeof_rec[2]; - /* a series of bit-masks to control various features indexed - * by enum ds_feature */ + /* Control bit-masks indexed by enum ds_feature. */ unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ +#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ +#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ #define BTS_CONTROL \ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ @@ -67,28 +64,28 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); * to identify tracers. */ struct ds_tracer { - /* the DS context (partially) owned by this tracer */ + /* The DS context (partially) owned by this tracer. */ struct ds_context *context; - /* the buffer provided on ds_request() and its size in bytes */ + /* The buffer provided on ds_request() and its size in bytes. */ void *buffer; size_t size; }; struct bts_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct bts_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. */ bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct pebs_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. 
*/ pebs_ovfl_callback_t ovfl; }; @@ -214,18 +211,16 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + /* The DS configuration; goes into MSR_IA32_DS_AREA. */ unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ + /* The owner of the BTS and PEBS configuration, respectively. */ struct bts_tracer *bts_master; struct pebs_tracer *pebs_master; - /* use count */ + /* Use count. */ unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ + /* Pointer to the context pointer field. */ struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ + /* The traced task; NULL for current cpu. */ struct task_struct *task; }; @@ -350,14 +345,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, unsigned long write_size, adj_write_size; /* - * write as much as possible without producing an + * Write as much as possible without producing an * overflow interrupt. * - * interrupt_threshold must either be + * Interrupt_threshold must either be * - bigger than absolute_maximum or * - point to a record between buffer_base and absolute_maximum * - * index points to a valid record. + * Index points to a valid record. */ base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); @@ -366,8 +361,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, write_end = min(end, int_th); - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ + /* + * If we are already beyond the interrupt threshold, + * we fill the entire buffer. 
+ */ if (write_end <= index) write_end = end; @@ -384,7 +381,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; - /* zero out trailing bytes */ + /* Zero out trailing bytes. */ memset((char *)index + write_size, 0, adj_write_size - write_size); index += adj_write_size; @@ -556,7 +553,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, unsigned int flags) { unsigned long buffer, adj; - /* adjust the buffer address and size to meet alignment + /* + * Adjust the buffer address and size to meet alignment * constraints: * - buffer is double-word aligned * - size is multiple of record size @@ -578,7 +576,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, trace->begin = (void *)buffer; trace->top = trace->begin; trace->end = (void *)(buffer + size); - /* The value for 'no threshold' is -1, which will set the + /* + * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ trace->ith = (void *)(buffer + size - ith); @@ -602,7 +601,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* we require some space to do alignment adjustments below */ + /* We require some space to do alignment adjustments below. */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -640,7 +639,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -700,7 +699,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. 
*/ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -983,9 +982,9 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x1c: /* Atom */ ds_configure(&ds_cfg_core2_atom, c); break; - case 0x1a: /* i7 */ + case 0x1a: /* Core i7 */ default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; @@ -997,12 +996,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_netburst, c); break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } } From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:48:52 +0100 Subject: [PATCH 004/900] x86, hw-branch-tracer: keep resources on stop Distinguish init/reset and start/stop: init/reset will allocate and release bts tracing resources stop/start will suspend and resume bts tracing Return an error on init() if no cpu can be traced. Signed-off-by: Markus Metzger LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 119 ++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..a99a04c5e9c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -19,7 +19,7 @@ #include "trace_output.h" -#define SIZEOF_BTS (1 << 13) +#define BTS_BUFFER_SIZE (1 << 13) /* * The tracer lock protects the below per-cpu tracer array. 
@@ -33,53 +33,68 @@ */ static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) -static int __read_mostly trace_hw_branches_enabled; +static int trace_hw_branches_enabled __read_mostly; +static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; /* - * Start tracing on the current cpu. + * Initialize the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. */ -static void bts_trace_start_cpu(void *arg) +static void bts_trace_init_cpu(void *arg) { if (this_tracer) ds_release_bts(this_tracer); - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - BTS_KERNEL); + this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(this_tracer)) { this_tracer = NULL; return; } } -static void bts_trace_start(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { + int cpu, avail; + spin_lock(&bts_tracer_lock); - on_each_cpu(bts_trace_start_cpu, NULL, 1); - trace_hw_branches_enabled = 1; + hw_branch_trace = tr; + + on_each_cpu(bts_trace_init_cpu, NULL, 1); + + /* Check on how many cpus we could enable tracing */ + avail = 0; + for_each_online_cpu(cpu) + if (per_cpu(tracer, cpu)) + avail++; + + trace_hw_branches_enabled = (avail ? 1 : 0); + trace_hw_branches_suspended = 0; spin_unlock(&bts_tracer_lock); + + + /* If we could not enable tracing on a single cpu, we fail. */ + return avail ? 0 : -EOPNOTSUPP; } /* - * Stop tracing on the current cpu. + * Release the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. 
*/ -static void bts_trace_stop_cpu(void *arg) +static void bts_trace_release_cpu(void *arg) { if (this_tracer) { ds_release_bts(this_tracer); @@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg) } } +static void bts_trace_reset(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_release_cpu, NULL, 1); + trace_hw_branches_enabled = 0; + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Resume tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. + */ +static void bts_trace_resume_cpu(void *arg) +{ + if (this_tracer) + ds_resume_bts(this_tracer); +} + +static void bts_trace_start(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_resume_cpu, NULL, 1); + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Suspend tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. 
+ */ +static void bts_trace_suspend_cpu(void *arg) +{ + if (this_tracer) + ds_suspend_bts(this_tracer); +} + static void bts_trace_stop(struct trace_array *tr) { spin_lock(&bts_tracer_lock); - trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_stop_cpu, NULL, 1); + on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + trace_hw_branches_suspended = 1; spin_unlock(&bts_tracer_lock); } @@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); + + if (trace_hw_branches_suspended) + smp_call_function_single(cpu, bts_trace_suspend_cpu, + NULL, 1); break; case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); break; } @@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) -{ - hw_branch_trace = tr; - - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - bts_trace_stop(tr); -} - static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, "# CPU# TO <- FROM\n"); @@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) */ static void trace_bts_cpu(void *arg) { - struct trace_array *tr = (struct trace_array *) arg; + struct trace_array *tr = (struct trace_array *)arg; const struct bts_trace *trace; unsigned char *at; @@ -276,7 +326,8 @@ void trace_hw_branch_oops(void) { spin_lock(&bts_tracer_lock); - trace_bts_cpu(hw_branch_trace); + if (trace_hw_branches_enabled) + trace_bts_cpu(hw_branch_trace); spin_unlock(&bts_tracer_lock); } From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:50:27 +0100 Subject: [PATCH 
005/900] x86, hw-branch-tracer: add selftest Add a selftest for the hw-branch-tracer. Signed-off-by: Markus Metzger LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_hw_branches.c | 5 ++- kernel/trace/trace_selftest.c | 53 ++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 56ce34d90b0..e7fbc826f1e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index a99a04c5e9c..4ca82700c04 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly = .start = bts_trace_start, .stop = bts_trace_stop, .open = trace_bts_prepare, - .close = trace_bts_close + .close = trace_bts_close, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_hw_branches, +#endif /* CONFIG_FTRACE_SELFTEST */ }; __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index f907a2b2902..3c7b797d0d2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: + case TRACE_HW_BRANCHES: return 1; } return 0; @@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_BRANCH_TRACER 
*/ + +#ifdef CONFIG_HW_BRANCH_TRACER +int +trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr) +{ + unsigned long count; + int ret; + struct trace_iterator iter; + struct tracer tracer; + + if (!trace->open) { + printk(KERN_CONT "missing open function..."); + return -1; + } + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* + * The hw-branch tracer needs to collect the trace from the various + * cpu trace buffers - before tracing is stopped. + */ + memset(&iter, 0, sizeof(iter)); + memcpy(&tracer, trace, sizeof(tracer)); + + iter.trace = &tracer; + iter.tr = tr; + iter.pos = -1; + mutex_init(&iter.mutex); + + trace->open(&iter); + + mutex_destroy(&iter.mutex); + + tracing_stop(); + + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT "no entries found.."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_HW_BRANCH_TRACER */ From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:54:40 +0100 Subject: [PATCH 006/900] x86, bts: cleanups Impact: cleanup, no code changed Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 144 +++++++++++++++++-------------- arch/x86/kernel/ds_selftest.h | 2 +- kernel/trace/trace_hw_branches.c | 6 +- kernel/trace/trace_selftest.c | 5 +- 4 files changed, 88 insertions(+), 69 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d9cab716805..7363e01ba08 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,43 +19,52 @@ * Markus Metzger , 2007-2009 */ +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include -#include -#include - #include "ds_selftest.h" /* - * The configuration for a particular DS hardware implementation. 
+ * The configuration for a particular DS hardware implementation: */ struct ds_configuration { - /* The name of the configuration. */ - const char *name; - /* The size of pointer-typed fields in DS, BTS, and PEBS. */ - unsigned char sizeof_ptr_field; - /* The size of a BTS/PEBS record in bytes. */ - unsigned char sizeof_rec[2]; - /* Control bit-masks indexed by enum ds_feature. */ - unsigned long ctl[dsf_ctl_max]; + /* The name of the configuration: */ + const char *name; + + /* The size of pointer-typed fields in DS, BTS, and PEBS: */ + unsigned char sizeof_ptr_field; + + /* The size of a BTS/PEBS record in bytes: */ + unsigned char sizeof_rec[2]; + + /* Control bit-masks indexed by enum ds_feature: */ + unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ -#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS (12 * 8) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ - ds_cfg.ctl[dsf_bts_overflow]) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS (3 * 8) +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT (1 << 3) + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL \ + ( ds_cfg.ctl[dsf_bts] | \ + ds_cfg.ctl[dsf_bts_kernel] | \ + ds_cfg.ctl[dsf_bts_user] | \ + ds_cfg.ctl[dsf_bts_overflow] ) /* * A BTS or PEBS tracer. @@ -65,28 +74,32 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); */ struct ds_tracer { /* The DS context (partially) owned by this tracer. */ - struct ds_context *context; + struct ds_context *context; /* The buffer provided on ds_request() and its size in bytes. 
*/ - void *buffer; - size_t size; + void *buffer; + size_t size; }; struct bts_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct bts_trace trace; - /* Buffer overflow notification function. */ - bts_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct bts_trace trace; + + /* Buffer overflow notification function: */ + bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct pebs_trace trace; - /* Buffer overflow notification function. */ - pebs_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct pebs_trace trace; + + /* Buffer overflow notification function: */ + pebs_ovfl_callback_t ovfl; }; /* @@ -95,6 +108,7 @@ struct pebs_tracer { * * The DS configuration consists of the following fields; different * architetures vary in the size of those fields. 
+ * * - double-word aligned base linear address of the BTS buffer * - write pointer into the BTS buffer * - end linear address of the BTS buffer (one byte beyond the end of @@ -133,19 +147,20 @@ enum ds_field { }; enum ds_qualifier { - ds_bts = 0, + ds_bts = 0, ds_pebs }; -static inline unsigned long ds_get(const unsigned char *base, - enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, - enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, + unsigned long value) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; @@ -157,7 +172,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, */ static DEFINE_SPINLOCK(ds_lock); - /* * We either support (system-wide) per-cpu or per-thread allocation. * We distinguish the two based on the task_struct pointer, where a @@ -211,17 +225,21 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* The DS configuration; goes into MSR_IA32_DS_AREA. */ - unsigned char ds[MAX_SIZEOF_DS]; - /* The owner of the BTS and PEBS configuration, respectively. */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - /* Use count. */ + /* The DS configuration; goes into MSR_IA32_DS_AREA: */ + unsigned char ds[MAX_SIZEOF_DS]; + + /* The owner of the BTS and PEBS configuration, respectively: */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + + /* Use count: */ unsigned long count; - /* Pointer to the context pointer field. */ - struct ds_context **this; - /* The traced task; NULL for current cpu. 
*/ - struct task_struct *task; + + /* Pointer to the context pointer field: */ + struct ds_context **this; + + /* The traced task; NULL for current cpu: */ + struct task_struct *task; }; static DEFINE_PER_CPU(struct ds_context *, system_context_array); @@ -328,9 +346,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) * The remainder of any partially written record is zeroed out. * * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data + * qual: the buffer type + * record: the data to write + * size: the size of the data */ static int ds_write(struct ds_context *context, enum ds_qualifier qual, const void *record, size_t size) @@ -429,12 +447,12 @@ enum bts_field { bts_to, bts_flags, - bts_qual = bts_from, - bts_jiffies = bts_to, - bts_pid = bts_flags, + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; static inline unsigned long bts_get(const char *base, enum bts_field field) @@ -461,8 +479,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val) * * return: bytes read/written on success; -Eerrno, otherwise */ -static int bts_read(struct bts_tracer *tracer, const void *at, - struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) { if (!tracer) return -EINVAL; diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h index 0e6e19d4c7d..2ba8745c666 100644 --- a/arch/x86/kernel/ds_selftest.h +++ b/arch/x86/kernel/ds_selftest.h @@ -12,4 +12,4 @@ extern int ds_selftest_pebs(void); #else static inline int ds_selftest_bts(void) { return 0; } static inline int ds_selftest_pebs(void) { return 0; } -#endif /* CONFIG_X86_DS_SELFTEST */ +#endif diff --git a/kernel/trace/trace_hw_branches.c 
b/kernel/trace/trace_hw_branches.c index 4ca82700c04..8b2109a6c61 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,5 +1,5 @@ /* - * h/w branch tracer for x86 based on bts + * h/w branch tracer for x86 based on BTS * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 @@ -15,8 +15,8 @@ #include -#include "trace.h" #include "trace_output.h" +#include "trace.h" #define BTS_BUFFER_SIZE (1 << 13) @@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m) static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) { + unsigned long symflags = TRACE_ITER_SYM_OFFSET; struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; - unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3c7b797d0d2..b9109126706 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -698,10 +699,10 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; struct trace_iterator iter; struct tracer tracer; + unsigned long count; + int ret; if (!trace->open) { printk(KERN_CONT "missing open function..."); From 79258a354e0c69be94ae2871809a195bf4a647b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 12:02:08 +0100 Subject: [PATCH 007/900] x86, bts: detect size of DS fields, fix Impact: build fix One usage site was missed in the sizeof_field -> sizeof_ptr_field rename. 
Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 7363e01ba08..5fd53333c1d 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -983,7 +983,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) From c78a3956b982418186e40978a51636a2b43221bc Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 18 Mar 2009 19:27:00 +0100 Subject: [PATCH 008/900] x86, bts: use atomic memory allocation Ds_request_bts() needs to allocate memory. It uses GFP_KERNEL. Hw-branch-tracer calls ds_request_bts() within on_each_cpu(). Use atomic memory allocation to allow it to be used in that context. Signed-off-by: Markus Metzger LKML-Reference: <20090318192700.A6038@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5fd53333c1d..b1d6e1f502f 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -255,8 +255,13 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) struct ds_context *new_context = NULL; unsigned long irq; - /* Chances are small that we already have a context. */ - new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + /* + * Chances are small that we already have a context. + * + * Contexts for per-cpu tracing are allocated using + * smp_call_function(). We must not sleep. 
+ */ + new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); if (!new_context) return NULL; @@ -662,8 +667,12 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; @@ -722,8 +731,12 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 13:38:36 -0400 Subject: [PATCH 009/900] tracing: add handler to trace_stat Currently, if a trace_stat user wants a handle to some private data, the trace_stat infrastructure does not supply a way to do that. This patch passes the trace_stat structure to the start function of the trace_stat code. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 4 ++-- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_stat.h | 2 +- kernel/trace/trace_workqueue.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ad8c22efff4..e6e32912ffb 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v) return 0; } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace) { return __start_annotated_branch_profile; } @@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cf..f8f48d84b2c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; - stat = ts->stat_start(); + stat = ts->stat_start(ts); if (!stat) goto exit; diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274cf7f3..f3546a2cd82 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat { /* The name of your stat file */ const char *name; /* Iteration over statistic entries */ - void *(*stat_start)(void); + void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ int (*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf..ee533c2e161 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -152,7 +152,7 @@ static struct cpu_workqueue_stats 
*workqueue_stat_start_cpu(int cpu) return ret; } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace) { int cpu; void *ret = NULL; From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Mar 2009 12:50:56 -0400 Subject: [PATCH 010/900] tracing: add function profiler Impact: new profiling feature This patch adds a function profiler. In debugfs/tracing/ two new files are created. function_profile_enabled - to enable or disable profiling trace_stat/functions - the profiled functions. For example: echo 1 > /debugfs/tracing/function_profile_enabled ./hackbench 50 echo 0 > /debugfs/tracing/function_profile_enabled yields: cat /debugfs/tracing/trace_stat/functions Function Hit -------- --- _spin_lock 10106442 _spin_unlock 10097492 kfree 6013704 _spin_unlock_irqrestore 4423941 _spin_lock_irqsave 4406825 __phys_addr 4181686 __slab_free 4038222 dput 4030130 path_put 4023387 unroll_tree_refs 4019532 [...] The most hit functions are listed first. Functions that are not hit are not listed. This feature depends on and uses dynamic function tracing. When the function profiling is disabled, no overhead occurs. But it still takes up around 300KB to hold the data, thus it is not recommended to keep it enabled for systems low on memory. When a '1' is echoed into the function_profile_enabled file, the counters for each function are reset back to zero. Thus you can see what functions are hit most by different programs. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 + kernel/trace/Kconfig | 19 +++ kernel/trace/ftrace.c | 313 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 334 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf7..0456c3a51c6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,6 +153,10 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; +#ifdef CONFIG_FUNCTION_PROFILER + unsigned long counter; + struct hlist_node node; +#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8a4d7293104..95e9ad5735d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -376,6 +377,24 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on DYNAMIC_FTRACE + default n + help + This option enables the kernel function profiler. When the dynamic + function tracing is enabled, a counter is added into the function + records used by the dynamic function tracer. A file is created in + debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + This takes up around 320K more memory. 
+ + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf15..11f364c776d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include "trace.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -261,7 +262,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records; } \ } +#ifdef CONFIG_FUNCTION_PROFILER +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static void * +function_stat_next(void *v, int idx) +{ + struct dyn_ftrace *rec = v; + struct ftrace_page *pg; + + pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + } + + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED || + !(rec->flags & FTRACE_FL_CONVERTED) || + /* ignore non hit functions */ + !rec->counter) + goto again; + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&ftrace_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct dyn_ftrace *a = p1; + struct dyn_ftrace *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, 
rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_init(int nr_funcs) +{ + unsigned long addr; + int order; + int size; + + /* + * We are profiling all functions, lets make it 1/4th of the + * number of functions that are in core kernel. So we have to + * iterate 4 times. + */ + order = (sizeof(struct hlist_head) * nr_funcs) / 4; + order = get_order(order); + size = 1 << (PAGE_SHIFT + order); + + pr_info("Allocating %d KB for profiler hash\n", size >> 10); + + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!addr) { + pr_warning("Could not allocate function profiler hash\n"); + return; + } + + ftrace_profile_hash = (void *)addr; + + /* + * struct hlist_head should be a pointer of 4 or 8 bytes. + * And a simple bit manipulation can be done, but if for + * some reason struct hlist_head is not a mulitple of 2, + * then we play it safe, and simply count. This function + * is done once at boot up, so it is not that critical in + * performance. 
+ */ + + size--; + size /= sizeof(struct hlist_head); + + for (; size; size >>= 1) + ftrace_profile_bits++; + + pr_info("Function profiler has %d hash buckets\n", + 1 << ftrace_profile_bits); + + return; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void ftrace_profile_reset(void) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + rec->counter = 0; + } while_for_each_ftrace_rec(); +} + +static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +{ + struct dyn_ftrace *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long flags; + unsigned long key; + + if (!ftrace_profile_hash) + return NULL; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + local_irq_save(flags); + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + goto out; + } + rec = NULL; + out: + local_irq_restore(flags); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) + goto out; + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (!ftrace_profile_hash) { + pr_info("Can not enable hash due to earlier problems\n"); + return -ENODEV; + } + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; 
+ + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ftrace_profile_reset(); + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ + unsigned long key; + + if (!ftrace_profile_hash) + return; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ + mutex_lock(&ftrace_profile_lock); + hlist_del(&rec->node); + mutex_unlock(&ftrace_profile_lock); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_init(int nr_funcs) +{ +} +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ +} +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); 
do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + !(rec->flags & FTRACE_FL_FREE)) { ftrace_free_rec(rec); + ftrace_profile_release(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; + ftrace_add_profile(rec); + return rec; } @@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + ftrace_profile_debugfs(d_tracer); + return 0; } @@ -2225,6 +2532,8 @@ void __init ftrace_init(void) if (ret) goto failed; + ftrace_profile_init(count); + last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 17:12:36 -0400 Subject: [PATCH 011/900] tracing: move function profiler data out of function struct Impact: reduce size of memory in function profiler The function profiler originally introduces its counters into the function records itself. There are 20 thousand different functions on a normal system, and that is adding 20 thousand counters for profiling even when not needed. A normal run of the profiler yields only a couple of thousand functions executed, depending on what is being profiled. This means we have around 18 thousand useless counters. This patch rectifies this by moving the data out of the function records used by dynamic ftrace. Data is preallocated to hold the functions when the profiling begins. Checks are made during profiling to see if more records should be allocated, and they are allocated if it is safe to do so. This also removes the dependency from using dynamic ftrace, and also removes the overhead by having it enabled. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 - kernel/trace/Kconfig | 10 +- kernel/trace/ftrace.c | 700 +++++++++++++++++++++++------------------ 3 files changed, 393 insertions(+), 321 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0456c3a51c6..015a3d22cf7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,10 +153,6 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; -#ifdef CONFIG_FUNCTION_PROFILER - unsigned long counter; - struct hlist_node node; -#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95e9ad5735d..8a4136096d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -379,20 +379,16 @@ config DYNAMIC_FTRACE config FUNCTION_PROFILER bool "Kernel function profiler" - depends on DYNAMIC_FTRACE + depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. When the dynamic - function tracing is enabled, a counter is added into the function - records used by the dynamic function tracer. A file is created in - debugfs called function_profile_enabled which defaults to zero. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A file in the trace_stats directory called functions, that show the list of functions that have been hit and their counters. - This takes up around 320K more memory. 
- If in doubt, say N config FTRACE_MCOUNT_RECORD diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11f364c776d..24dac448cdc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -241,6 +241,392 @@ static void ftrace_update_pid_func(void) #endif } +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +/* TODO: make these percpu, to prevent cache line bouncing */ +static struct ftrace_profile_page *profile_pages_start; +static struct ftrace_profile_page *profile_pages; + +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static raw_spinlock_t ftrace_profile_rec_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if (!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&profile_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > 
b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_reset(void) +{ + struct ftrace_profile_page *pg; + + pg = profile_pages = profile_pages_start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(ftrace_profile_hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(void) +{ + struct ftrace_profile_page *pg; + int i; + + /* If we already allocated, do nothing */ + if (profile_pages) + return 0; + + profile_pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!profile_pages) + return -ENOMEM; + + pg = profile_pages_start = profile_pages; + + /* allocate 10 more pages to start */ + for (i = 0; i < 10; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + /* + * We only care about allocating profile_pages, if + * we failed to allocate here, hopefully we will allocate + * later. + */ + if (!pg->next) + break; + pg = pg->next; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int size; + + if (ftrace_profile_hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. 
+ */ + size = FTRACE_PROFILE_HASH_SIZE; + + ftrace_profile_hash = + kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!ftrace_profile_hash) + return -ENOMEM; + + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + + /* Preallocate a few pages */ + if (ftrace_profile_pages_init() < 0) { + kfree(ftrace_profile_hash); + ftrace_profile_hash = NULL; + return -ENOMEM; + } + + return 0; +} + +/* interrupts must be disabled */ +static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +/* Interrupts must be disabled calling this */ +static struct ftrace_profile * +ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion */ + if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + goto out; + + __raw_spin_lock(&ftrace_profile_rec_lock); + + /* Try to always keep another page available */ + if (!profile_pages->next && alloc_safe) + profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + + /* + * Try to find the function again since another + * task on another CPU could have added it + */ + rec = ftrace_find_profiled_func(ip); + if (rec) + goto out_unlock; + + if (profile_pages->index == PROFILES_PER_PAGE) { + if (!profile_pages->next) + goto out_unlock; + profile_pages = profile_pages->next; + } + + rec = &profile_pages->records[profile_pages->index++]; + rec->ip = ip; + ftrace_add_profile(rec); + + out_unlock: + 
__raw_spin_unlock(&ftrace_profile_rec_lock); + out: + atomic_dec(&__get_cpu_var(ftrace_profile_disable)); + + return rec; +} + +/* + * If we are not in an interrupt, or softirq and + * and interrupts are disabled and preemption is not enabled + * (not in a spinlock) then it should be safe to allocate memory. + */ +static bool ftrace_safe_to_allocate(void) +{ + return !in_interrupt() && irqs_disabled() && !preempt_count(); +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_profile *rec; + unsigned long flags; + bool alloc_safe; + + if (!ftrace_profile_enabled) + return; + + alloc_safe = ftrace_safe_to_allocate(); + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) { + rec = ftrace_profile_alloc(ip, alloc_safe); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return 
simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + /* set when tracing only a pid */ struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; @@ -309,307 +695,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_FUNCTION_PROFILER -static struct hlist_head *ftrace_profile_hash; -static int ftrace_profile_bits; -static int ftrace_profile_enabled; -static DEFINE_MUTEX(ftrace_profile_lock); - -static void * -function_stat_next(void *v, int idx) -{ - struct dyn_ftrace *rec = v; - struct ftrace_page *pg; - - pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); - - again: - rec++; - if ((void *)rec >= (void *)&pg->records[pg->index]) { - pg = pg->next; - if (!pg) - return NULL; - rec = &pg->records[0]; - } - - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || - !(rec->flags & FTRACE_FL_CONVERTED) || - /* ignore non hit functions */ - !rec->counter) - goto again; - - return rec; -} - -static void *function_stat_start(struct tracer_stat *trace) -{ - return function_stat_next(&ftrace_pages_start->records[0], 0); -} - -static int function_stat_cmp(void *p1, void *p2) -{ - struct dyn_ftrace *a = p1; - struct dyn_ftrace *b = p2; - - if 
(a->counter < b->counter) - return -1; - if (a->counter > b->counter) - return 1; - else - return 0; -} - -static int function_stat_headers(struct seq_file *m) -{ - seq_printf(m, " Function Hit\n" - " -------- ---\n"); - return 0; -} - -static int function_stat_show(struct seq_file *m, void *v) -{ - struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; - - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); - return 0; -} - -static struct tracer_stat function_stats = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static void ftrace_profile_init(int nr_funcs) -{ - unsigned long addr; - int order; - int size; - - /* - * We are profiling all functions, lets make it 1/4th of the - * number of functions that are in core kernel. So we have to - * iterate 4 times. - */ - order = (sizeof(struct hlist_head) * nr_funcs) / 4; - order = get_order(order); - size = 1 << (PAGE_SHIFT + order); - - pr_info("Allocating %d KB for profiler hash\n", size >> 10); - - addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!addr) { - pr_warning("Could not allocate function profiler hash\n"); - return; - } - - ftrace_profile_hash = (void *)addr; - - /* - * struct hlist_head should be a pointer of 4 or 8 bytes. - * And a simple bit manipulation can be done, but if for - * some reason struct hlist_head is not a mulitple of 2, - * then we play it safe, and simply count. This function - * is done once at boot up, so it is not that critical in - * performance. 
- */ - - size--; - size /= sizeof(struct hlist_head); - - for (; size; size >>= 1) - ftrace_profile_bits++; - - pr_info("Function profiler has %d hash buckets\n", - 1 << ftrace_profile_bits); - - return; -} - -static ssize_t -ftrace_profile_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - r = sprintf(buf, "%u\n", ftrace_profile_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static void ftrace_profile_reset(void) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - do_for_each_ftrace_rec(pg, rec) { - rec->counter = 0; - } while_for_each_ftrace_rec(); -} - -static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) -{ - struct dyn_ftrace *rec; - struct hlist_head *hhd; - struct hlist_node *n; - unsigned long flags; - unsigned long key; - - if (!ftrace_profile_hash) - return NULL; - - key = hash_long(ip, ftrace_profile_bits); - hhd = &ftrace_profile_hash[key]; - - if (hlist_empty(hhd)) - return NULL; - - local_irq_save(flags); - hlist_for_each_entry_rcu(rec, n, hhd, node) { - if (rec->ip == ip) - goto out; - } - rec = NULL; - out: - local_irq_restore(flags); - - return rec; -} - -static void -function_profile_call(unsigned long ip, unsigned long parent_ip) -{ - struct dyn_ftrace *rec; - unsigned long flags; - - if (!ftrace_profile_enabled) - return; - - local_irq_save(flags); - rec = ftrace_find_profiled_func(ip); - if (!rec) - goto out; - - rec->counter++; - out: - local_irq_restore(flags); -} - -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ - .func = function_profile_call, -}; - -static ssize_t -ftrace_profile_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - char buf[64]; - int ret; - - if (!ftrace_profile_hash) { - pr_info("Can not enable hash due to earlier problems\n"); - return -ENODEV; - } - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; 
- - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - val = !!val; - - mutex_lock(&ftrace_profile_lock); - if (ftrace_profile_enabled ^ val) { - if (val) { - ftrace_profile_reset(); - register_ftrace_function(&ftrace_profile_ops); - ftrace_profile_enabled = 1; - } else { - ftrace_profile_enabled = 0; - unregister_ftrace_function(&ftrace_profile_ops); - } - } - mutex_unlock(&ftrace_profile_lock); - - filp->f_pos += cnt; - - return cnt; -} - -static const struct file_operations ftrace_profile_fops = { - .open = tracing_open_generic, - .read = ftrace_profile_read, - .write = ftrace_profile_write, -}; - -static void ftrace_profile_debugfs(struct dentry *d_tracer) -{ - struct dentry *entry; - int ret; - - ret = register_stat_tracer(&function_stats); - if (ret) { - pr_warning("Warning: could not register " - "function stats\n"); - return; - } - - entry = debugfs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'function_profile_enabled' entry\n"); -} - -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ - unsigned long key; - - if (!ftrace_profile_hash) - return; - - key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); -} - -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ - mutex_lock(&ftrace_profile_lock); - hlist_del(&rec->node); - mutex_unlock(&ftrace_profile_lock); -} - -#else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_init(int nr_funcs) -{ -} -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ -} -static void ftrace_profile_debugfs(struct dentry *d_tracer) -{ -} -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ -} -#endif /* CONFIG_FUNCTION_PROFILER */ - #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); 
do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) { + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); - ftrace_profile_release(rec); - } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; - ftrace_add_profile(rec); - return rec; } @@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - ftrace_profile_debugfs(d_tracer); - return 0; } @@ -2532,8 +2611,6 @@ void __init ftrace_init(void) if (ret) goto failed; - ftrace_profile_init(count); - last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, @@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_pid' entry\n"); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:12:58 -0400 Subject: [PATCH 012/900] tracing: adding function timings to function profiler If the function graph trace is enabled, the function profiler will use it to take the timing of the functions. cat /debug/tracing/trace_stat/functions Function Hit Time -------- --- ---- mwait_idle 127 183028.4 us schedule 26 151997.7 us __schedule 31 151975.1 us sys_wait4 2 74080.53 us do_wait 2 74077.80 us sys_newlstat 138 39929.16 us do_path_lookup 179 39845.79 us vfs_lstat_fd 138 39761.97 us user_path_at 153 39469.58 us path_walk 179 39435.76 us __link_path_walk 189 39143.73 us [...] Note the times are skewed due to the function graph tracer not taking into account schedules. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 93 ++++++++++++++++++++++++++-- kernel/trace/trace.c | 11 ---- kernel/trace/trace.h | 3 +- kernel/trace/trace_functions_graph.c | 17 ++++- kernel/trace/trace_output.c | 10 +++ kernel/trace/trace_output.h | 2 + 6 files changed, 117 insertions(+), 19 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 24dac448cdc..a9ccd71fc92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,7 +33,7 @@ #include -#include "trace.h" +#include "trace_output.h" #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ @@ -246,6 +246,9 @@ struct ftrace_profile { struct hlist_node node; unsigned long ip; unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; +#endif }; struct ftrace_profile_page { @@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace) return function_stat_next(&profile_pages_start->records[0], 0); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ static int function_stat_cmp(void *p1, void *p2) { struct ftrace_profile *a = p1; @@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2) else return 0; } +#endif static int function_stat_headers(struct seq_file *m) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function Hit Time\n" + " -------- --- ----\n"); +#else seq_printf(m, " Function Hit\n" " -------- ---\n"); +#endif return 0; } @@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + 
trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); +#endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + trace_print_seq(m, &s); + mutex_unlock(&mutex); +#endif + seq_putc(m, '\n'); - seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); return 0; } @@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + unsigned long flags; + struct ftrace_profile *rec; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(trace->func); + if (rec) + rec->time += trace->rettime - trace->calltime; + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static ssize_t ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, goto out; } - register_ftrace_function(&ftrace_profile_ops); + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; - unregister_ftrace_function(&ftrace_profile_ops); + 
unregister_ftrace_profiler(); } } out: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 67c6a21dd42..821bf49771d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_init(s); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d7410bbb9a8..c66ca3b6605 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -605,6 +605,8 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ - #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..85bba0f018b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_HANDLED; } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) 
+ '\0' */ @@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) + return ret; ret = trace_seq_printf(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; } /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19261fdd245..a3b6e3fd704 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_init(s); +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 35c422fb51a..1eac2973374 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); extern int From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 20:50:39 -0400 Subject: [PATCH 013/900] tracing: make the function profiler per cpu Impact: speed enhancement By making the function profiler record in per cpu data we not only get better readings, avoid races, we also do not have to take any locks. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 199 +++++++++++++++++++++++++++--------------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a9ccd71fc92..ed1fc5021d4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -257,28 +257,28 @@ struct ftrace_profile_page { struct ftrace_profile records[]; }; +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + #define PROFILE_RECORDS_SIZE \ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -/* TODO: make these percpu, to prevent cache line bouncing */ -static struct ftrace_profile_page *profile_pages_start; -static struct ftrace_profile_page *profile_pages; - -static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); -static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ -static raw_spinlock_t ftrace_profile_rec_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - static void * function_stat_next(void *v, int idx) { @@ -303,7 +303,13 @@ function_stat_next(void *v, int idx) static void *function_stat_start(struct tracer_stat *trace) { - return 
function_stat_next(&profile_pages_start->records[0], 0); + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v) return 0; } -static struct tracer_stat function_stats = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static void ftrace_profile_reset(void) +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; - pg = profile_pages = profile_pages_start; + pg = stat->pages = stat->start; while (pg) { memset(pg->records, 0, PROFILE_RECORDS_SIZE); @@ -395,24 +392,24 @@ static void ftrace_profile_reset(void) pg = pg->next; } - memset(ftrace_profile_hash, 0, + memset(stat->hash, 0, FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(void) +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int i; /* If we already allocated, do nothing */ - if (profile_pages) + if (stat->pages) return 0; - profile_pages = (void *)get_zeroed_page(GFP_KERNEL); - if (!profile_pages) + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) return -ENOMEM; - pg = profile_pages_start = profile_pages; + pg = stat->start = stat->pages; /* allocate 10 more pages to start */ for (i = 0; i < 10; i++) { @@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void) return 0; } -static int ftrace_profile_init(void) +static int ftrace_profile_init_cpu(int cpu) { + struct ftrace_profile_stat *stat; int size; - if (ftrace_profile_hash) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { /* If the profile is already created, 
simply reset it */ - ftrace_profile_reset(); + ftrace_profile_reset(stat); return 0; } @@ -446,29 +446,45 @@ static int ftrace_profile_init(void) */ size = FTRACE_PROFILE_HASH_SIZE; - ftrace_profile_hash = - kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); - if (!ftrace_profile_hash) + if (!stat->hash) return -ENOMEM; - size--; + if (!ftrace_profile_bits) { + size--; - for (; size; size >>= 1) - ftrace_profile_bits++; + for (; size; size >>= 1) + ftrace_profile_bits++; + } /* Preallocate a few pages */ - if (ftrace_profile_pages_init() < 0) { - kfree(ftrace_profile_hash); - ftrace_profile_hash = NULL; + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; return -ENOMEM; } return 0; } +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + /* interrupts must be disabled */ -static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; @@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) unsigned long key; key = hash_long(ip, ftrace_profile_bits); - hhd = &ftrace_profile_hash[key]; + hhd = &stat->hash[key]; if (hlist_empty(hhd)) return NULL; @@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) return NULL; } -static void ftrace_add_profile(struct ftrace_profile *rec) +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) { unsigned long key; key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); } /* Interrupts must be disabled calling 
this */ static struct ftrace_profile * -ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, + unsigned long ip, bool alloc_safe) { struct ftrace_profile *rec = NULL; /* prevent recursion */ - if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + if (atomic_inc_return(&stat->disabled) != 1) goto out; - __raw_spin_lock(&ftrace_profile_rec_lock); - /* Try to always keep another page available */ - if (!profile_pages->next && alloc_safe) - profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + if (!stat->pages->next && alloc_safe) + stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); /* * Try to find the function again since another * task on another CPU could have added it */ - rec = ftrace_find_profiled_func(ip); + rec = ftrace_find_profiled_func(stat, ip); if (rec) - goto out_unlock; + goto out; - if (profile_pages->index == PROFILES_PER_PAGE) { - if (!profile_pages->next) - goto out_unlock; - profile_pages = profile_pages->next; + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; } - rec = &profile_pages->records[profile_pages->index++]; + rec = &stat->pages->records[stat->pages->index++]; rec->ip = ip; - ftrace_add_profile(rec); + ftrace_add_profile(stat, rec); - out_unlock: - __raw_spin_unlock(&ftrace_profile_rec_lock); out: - atomic_dec(&__get_cpu_var(ftrace_profile_disable)); + atomic_dec(&stat->disabled); return rec; } @@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void) static void function_profile_call(unsigned long ip, unsigned long parent_ip) { + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; bool alloc_safe; @@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) alloc_safe = ftrace_safe_to_allocate(); local_irq_save(flags); - rec = ftrace_find_profiled_func(ip); + + stat = &__get_cpu_var(ftrace_profile_stats); + if 
(!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip, alloc_safe); if (!rec) goto out; } @@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { - unsigned long flags; + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; + unsigned long flags; local_irq_save(flags); - rec = ftrace_find_profiled_func(trace->func); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) rec->time += trace->rettime - trace->calltime; + out: local_irq_restore(flags); } @@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = { .write = ftrace_profile_write, }; +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + static void ftrace_profile_debugfs(struct dentry *d_tracer) { + struct ftrace_profile_stat *stat; struct dentry *entry; + char *name; int ret; + int cpu; - ret = register_stat_tracer(&function_stats); - if (ret) { - pr_warning("Warning: could not register " - "function stats\n"); - return; + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. 
+ */ + kfree(stat); + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } } entry = debugfs_create_file("function_profile_enabled", 0644, From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 23:17:58 -0400 Subject: [PATCH 014/900] function-graph: add option to calculate graph time or not graph time is the time that a function is executing another function. Thus if function A calls B, if graph-time is set, then the time for A includes B. This is the default behavior. But if graph-time is off, then the time spent executing B is subtracted from A. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 21 ++++++++++++++++++++- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 8 ++++---- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf7..9e0a8d245e5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -365,6 +365,7 @@ struct ftrace_ret_stack { unsigned long ret; unsigned long func; unsigned long long calltime; + unsigned long long subtime; }; /* @@ -376,8 +377,6 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); -extern void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ed1fc5021d4..71e5faef12a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent 
*trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { struct ftrace_profile_stat *stat; + unsigned long long calltime; struct ftrace_profile *rec; unsigned long flags; @@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash) goto out; + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) - rec->time += trace->rettime - trace->calltime; + rec->time += calltime; + out: local_irq_restore(flags); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 821bf49771d..5d1a16cae37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +318,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + "graph-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c66ca3b6605..e3429a8ab05 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -685,6 +685,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* diff --git a/kernel/trace/trace_functions_graph.c 
b/kernel/trace/trace_functions_graph.c index 85bba0f018b..10f6ad7d85f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(&current->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 13:26:41 -0400 Subject: [PATCH 015/900] tracing: clean up tracing profiler Ingo Molnar suggested clean ups for the profiling code. This patch makes those updates. 
Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71e5faef12a..a141d8499ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, + .func = ftrace_stub, }; static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -271,8 +271,10 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits; -static int ftrace_profile_enabled; +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); @@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { - .func = function_profile_call, + .func = function_profile_call, }; static int register_ftrace_profiler(void) @@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; + char buf[64]; /* big enough to hold a number */ int ret; if (cnt >= sizeof(buf)) @@ -719,7 +721,7 @@ static ssize_t ftrace_profile_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64]; + char buf[64]; /* big enough to hold a number */ int r; r = sprintf(buf, "%u\n", ftrace_profile_enabled); @@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = { /* used to initialize the real stat files */ static struct tracer_stat function_stats __initdata = { - .name = "functions", - .stat_start = 
function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show }; static void ftrace_profile_debugfs(struct dentry *d_tracer) @@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_probe_call, + .func = function_trace_probe_call, }; static int ftrace_probe_registered; From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 20:06:34 -0400 Subject: [PATCH 016/900] tracing: remove on the fly allocator from function profiler Impact: safer code The on the fly allocator for the function profiler was to save memory. But at the expense of stability. Although it survived several tests, allocating from the function tracer is just too risky, just to save space. This patch removes the allocator and simply allocates enough entries at start up. Each function gets a profiling structure of 40 bytes. With an average of 20K functions, and this is for each CPU, we have 800K per online CPU. This is not too bad, at least for non-embedded. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 76 ++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a141d8499ab..4d90c916b2b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; + int functions; + int pages; int i; /* If we already allocated, do nothing */ @@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) if (!stat->pages) return -ENOMEM; +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. + */ + functions = 20000; +#endif + pg = stat->start = stat->pages; - /* allocate 10 more pages to start */ - for (i = 0; i < 10; i++) { + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); - /* - * We only care about allocating profile_pages, if - * we failed to allocate here, hopefully we will allocate - * later. 
- */ if (!pg->next) - break; + goto out_free; pg = pg->next; } return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + free_page((unsigned long)stat->pages); + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; } static int ftrace_profile_init_cpu(int cpu) @@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu) ftrace_profile_bits++; } - /* Preallocate a few pages */ + /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); stat->hash = NULL; @@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, hlist_add_head_rcu(&rec->node, &stat->hash[key]); } -/* Interrupts must be disabled calling this */ +/* + * The memory is already allocated, this simply finds a new record to use. + */ static struct ftrace_profile * -ftrace_profile_alloc(struct ftrace_profile_stat *stat, - unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec = NULL; - /* prevent recursion */ + /* prevent recursion (from NMIs) */ if (atomic_inc_return(&stat->disabled) != 1) goto out; - /* Try to always keep another page available */ - if (!stat->pages->next && alloc_safe) - stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); - /* - * Try to find the function again since another - * task on another CPU could have added it + * Try to find the function again since an NMI + * could have added it */ rec = ftrace_find_profiled_func(stat, ip); if (rec) @@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, return rec; } -/* - * If we are not in an interrupt, or softirq and - * and interrupts are disabled and preemption is not enabled - * (not in a spinlock) then it should be safe to allocate memory. 
- */ -static bool ftrace_safe_to_allocate(void) -{ - return !in_interrupt() && irqs_disabled() && !preempt_count(); -} - static void function_profile_call(unsigned long ip, unsigned long parent_ip) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; - bool alloc_safe; if (!ftrace_profile_enabled) return; - alloc_safe = ftrace_safe_to_allocate(); - local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); @@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(stat, ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip); if (!rec) goto out; } From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 21:00:47 -0400 Subject: [PATCH 017/900] tracing: add average time in function to function profiler Show the average time in the function (Time / Hit) Function Hit Time Avg -------- --- ---- --- mwait_idle 51 140326.6 us 2751.503 us smp_apic_timer_interrupt 47 3517.735 us 74.845 us schedule 10 2738.754 us 273.875 us __schedule 10 2732.857 us 273.285 us hrtimer_interrupt 47 1896.104 us 40.342 us irq_exit 56 1711.833 us 30.568 us __run_hrtimer 47 1315.589 us 27.991 us tick_sched_timer 47 1138.690 us 24.227 us do_softirq 56 1116.829 us 19.943 us __do_softirq 56 1066.932 us 19.052 us do_IRQ 9 926.153 us 102.905 us Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d90c916b2b..c7f4a4be05d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function Hit Time\n" - " -------- --- ----\n"); + seq_printf(m, " Function " + "Hit Time Avg\n" + 
" -------- " + "--- ---- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v) struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static struct trace_seq s; static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - trace_seq_init(&s); - trace_print_graph_duration(rec->time, &s); + static struct trace_seq s; + unsigned long long avg; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); trace_print_seq(m, &s); mutex_unlock(&mutex); #endif From a8a93f3f03b7a8008d720e8d91798efe599d416c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 13:45:34 -0800 Subject: [PATCH 018/900] mm: disable preemption in apply_to_pte_range Impact: bugfix Lazy mmu mode needs preemption disabled, so if we're apply to init_mm (which doesn't require any pte locks), then explicitly disable preemption. (Do it unconditionally after checking we've successfully done the allocation to simplify the error handling.) 
Signed-off-by: Jeremy Fitzhardinge --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd..b80cc31292b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1718,6 +1718,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); + preempt_disable(); arch_enter_lazy_mmu_mode(); token = pmd_pgtable(*pmd); @@ -1729,6 +1730,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); + preempt_enable(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); From b8bcfe997e46150fedcc3f5b26b846400122fdd9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:05:19 -0800 Subject: [PATCH 019/900] x86/paravirt: remove lazy mode in interrupts Impact: simplification, robustness Make paravirt_lazy_mode() always return PARAVIRT_LAZY_NONE when in an interrupt. This prevents interrupt code from accidentally inheriting an outer lazy state, and instead does everything synchronously. Outer batched operations are left deferred. 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra Cc: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 3 +++ arch/x86/mm/fault.c | 6 ++---- arch/x86/mm/highmem_32.c | 2 -- arch/x86/mm/iomap_32.c | 1 - arch/x86/mm/pageattr.c | 14 -------------- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 63dd358d8ee..8ab250ac498 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -282,6 +282,9 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + return __get_cpu_var(paravirt_lazy_mode); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..cfbb4a73801 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -225,12 +225,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pmd_present(*pmd_k)) return NULL; - if (!pmd_present(*pmd)) { + if (!pmd_present(*pmd)) set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { + else BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } return pmd_k; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0..e81dfa40815 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -87,7 +87,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); return (void *)vaddr; } @@ -117,7 +116,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } - arch_flush_lazy_mmu_mode(); pagefault_enable(); } diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 04102d42ff4..b6a61f3d7ef 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -74,7 +74,6 @@ iounmap_atomic(void *kvaddr, enum km_type type) if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 
kpte_clear_flush(kmap_pte-idx, vaddr); - arch_flush_lazy_mmu_mode(); pagefault_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 9c4294986af..9015e5e412b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -824,13 +824,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, vm_unmap_aliases(); - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. - */ - arch_flush_lazy_mmu_mode(); - cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; @@ -873,13 +866,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else cpa_flush_all(cache); - /* - * If we've been called with lazy mmu updates enabled, then - * make sure that everything gets flushed out before we - * return. - */ - arch_flush_lazy_mmu_mode(); - out: return ret; } From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:24:03 -0800 Subject: [PATCH 020/900] x86/pvops: replace arch_enter_lazy_cpu_mode with arch_start_context_switch Impact: simplification, prepare for later changes Make lazy cpu mode more specific to context switching, so that it makes sense to do more context-switch specific things in the callbacks. 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/include/asm/paravirt.h | 8 +++----- arch/x86/kernel/paravirt.c | 13 ------------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/xen/mmu.c | 5 +---- include/asm-frv/pgtable.h | 4 ++-- include/asm-generic/pgtable.h | 21 +++++++++++---------- kernel/sched.c | 2 +- 8 files changed, 20 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 0617d5cc971..7b28abac323 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1420,19 +1420,17 @@ void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); void paravirt_leave_lazy(enum paravirt_lazy_mode mode); -#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE -static inline void arch_enter_lazy_cpu_mode(void) +#define __HAVE_ARCH_START_CONTEXT_SWITCH +static inline void arch_start_context_switch(void) { PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); } -static inline void arch_leave_lazy_cpu_mode(void) +static inline void arch_end_context_switch(void) { PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); } -void arch_flush_lazy_cpu_mode(void); - #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8ab250ac498..5eea9548216 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -301,19 +301,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766ca..57e49a8278a 100644 --- a/arch/x86/kernel/process_32.c +++ 
b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c..7115e608532 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* * Switch FS and GS. diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index cb6afa4ec95..6b98f87232a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1119,10 +1119,8 @@ static void drop_other_mm_ref(void *info) /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { + if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) load_cr3(swapper_pg_dir); - arch_flush_lazy_cpu_mode(); - } } static void xen_drop_mm_ref(struct mm_struct *mm) @@ -1135,7 +1133,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); - arch_flush_lazy_cpu_mode(); } /* Get the "official" set of cpus referring to our pagetable. 
*/ diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index e16fdb1f4f4..235e34a7a34 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -73,8 +73,8 @@ static inline int pte_file(pte_t pte) { return 0; } #define pgtable_cache_init() do {} while (0) #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_enter_lazy_cpu_mode() do {} while (0) -#define arch_leave_lazy_cpu_mode() do {} while (0) + +#define arch_start_context_switch() do {} while (0) #else /* !CONFIG_MMU */ /*****************************************************************************/ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8e6d0ca70ab..922f03671dd 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* - * A facility to provide batching of the reload of page tables with the - * actual context switch code for paravirtualized guests. By convention, - * only one of the lazy modes (CPU, MMU) should be active at any given - * time, entry should never be nested, and entry and exits should always - * be paired. This is for sanity of maintaining and reasoning about the - * kernel code. + * A facility to provide batching of the reload of page tables and + * other process state with the actual context switch code for + * paravirtualized guests. By convention, only one of the batched + * update (lazy) modes (CPU, MMU) should be active at any given time, + * entry should never be nested, and entry and exits should always be + * paired. This is for sanity of maintaining and reasoning about the + * kernel code. In this case, the exit (end of the context switch) is + * in architecture-specific code, and so doesn't need a generic + * definition. 
*/ -#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE -#define arch_enter_lazy_cpu_mode() do {} while (0) -#define arch_leave_lazy_cpu_mode() do {} while (0) -#define arch_flush_lazy_cpu_mode() do {} while (0) +#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH +#define arch_start_context_switch() do {} while (0) #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/kernel/sched.c b/kernel/sched.c index 5757e03cfac..7530fdd7c98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(); if (unlikely(!mm)) { next->active_mm = oldmm; From b407fc57b815b2016186220baabc76cc8264206e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:46:21 -0800 Subject: [PATCH 021/900] x86/paravirt: flush pending mmu updates on context switch Impact: allow preemption during lazy mmu updates If we're in lazy mmu mode when context switching, leave lazy mmu mode, but remember the task's state in TIF_LAZY_MMU_UPDATES. When we resume the task, check this flag and re-enter lazy mmu mode if it's set. This sets things up for allowing lazy mmu mode while preemptible, though that won't actually be active until the next change. 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/thread_info.h | 2 ++ arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/paravirt.c | 13 ++++++++++--- arch/x86/kernel/vmi_32.c | 14 ++++++++++---- arch/x86/lguest/boot.c | 14 ++++++++++---- arch/x86/xen/enlighten.c | 6 +++--- arch/x86/xen/mmu.c | 7 ++++++- arch/x86/xen/xen-ops.h | 1 - 9 files changed, 42 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7b28abac323..58d2481b01a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1418,7 +1418,6 @@ void paravirt_enter_lazy_cpu(void); void paravirt_leave_lazy_cpu(void); void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); -void paravirt_leave_lazy(enum paravirt_lazy_mode mode); #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index df9d5f78385..2f34d643b56 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -94,6 +94,7 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,6 +116,7 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 478bca986ec..5d7f6e76b5d 100644 --- 
a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -201,7 +201,7 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5eea9548216..430a0e30577 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -252,7 +252,7 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) __get_cpu_var(paravirt_lazy_mode) = mode; } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); @@ -267,17 +267,24 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } void paravirt_enter_lazy_cpu(void) { + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_thread_flag(TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } void paravirt_leave_lazy_cpu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 2cc4a90e2cb..950929c607d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -473,16 +473,22 @@ static void vmi_enter_lazy_cpu(void) vmi_ops.set_lazy_mode(2); } +static void vmi_leave_lazy_cpu(void) +{ + vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_cpu(); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); 
vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -718,12 +724,12 @@ static inline int __init activate_vmi(void) para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9fe4ddaa8f6..41a5562e710 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -147,10 +147,16 @@ static void lazy_hcall(unsigned long call, /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. 
*/ -static void lguest_leave_lazy_mode(void) +static void lguest_leave_lazy_mmu_mode(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); + paravirt_leave_lazy_mmu(); +} + +static void lguest_leave_lazy_cpu_mode(void) +{ + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); + paravirt_leave_lazy_cpu(); } /*G:033 @@ -1026,7 +1032,7 @@ __init void lguest_init(void) pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; - pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode; /* pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; @@ -1039,7 +1045,7 @@ __init void lguest_init(void) pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; #ifdef CONFIG_X86_LOCAL_APIC /* apic read/write intercepts */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 82cd39a6cbd..f586e63b9a6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -void xen_leave_lazy(void) +static void xen_leave_lazy_cpu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); + paravirt_leave_lazy_cpu(); } static unsigned long xen_store_tr(void) @@ -819,7 +819,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .lazy_mode = { .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy, + .leave = xen_leave_lazy_cpu, }, }; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6b98f87232a..f5f8faa4f76 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1816,6 +1816,11 @@ __init void xen_post_allocator_init(void) xen_mark_init_mm_pinned(); } +static void 
xen_leave_lazy_mmu(void) +{ + xen_mc_flush(); + paravirt_leave_lazy_mmu(); +} const struct pv_mmu_ops xen_mmu_ops __initdata = { .pagetable_setup_start = xen_pagetable_setup_start, @@ -1891,7 +1896,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .lazy_mode = { .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, + .leave = xen_leave_lazy_mmu, }, .set_fixmap = xen_set_fixmap, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2f5ef2632ea..f897cdffccb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_leave_lazy(void); void xen_post_allocator_init(void); char * __init xen_memory_setup(void); From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 11:18:57 -0800 Subject: [PATCH 022/900] x86/paravirt: finish change from lazy cpu to context switch start/end Impact: fix lazy context switch API Pass the previous and next tasks into the context switch start end calls, so that the called functions can properly access the task state (esp in end_context_switch, in which the next task is not yet completely current). 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/include/asm/paravirt.h | 17 ++++++++++------- arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/kernel/paravirt.c | 14 ++++++-------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vmi_32.c | 12 ++++++------ arch/x86/lguest/boot.c | 8 ++++---- arch/x86/xen/enlighten.c | 10 ++++------ include/asm-frv/pgtable.h | 2 +- include/asm-generic/pgtable.h | 2 +- kernel/sched.c | 2 +- 11 files changed, 37 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 58d2481b01a..dfdee0ca57d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -56,6 +56,7 @@ struct desc_ptr; struct tss_struct; struct mm_struct; struct desc_struct; +struct task_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -203,7 +204,8 @@ struct pv_cpu_ops { void (*swapgs)(void); - struct pv_lazy_ops lazy_mode; + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); }; struct pv_irq_ops { @@ -1414,20 +1416,21 @@ enum paravirt_lazy_mode { }; enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_enter_lazy_cpu(void); -void paravirt_leave_lazy_cpu(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); #define __HAVE_ARCH_START_CONTEXT_SWITCH -static inline void arch_start_context_switch(void) +static inline void arch_start_context_switch(struct task_struct *prev) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); + PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev); } -static inline void arch_end_context_switch(void) +static inline void arch_end_context_switch(struct task_struct *next) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); + PVOP_VCALL1(pv_cpu_ops.end_context_switch, 
next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d0812e155f1..24e42836e92 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -83,6 +83,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base) #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) +#define arch_end_context_switch(prev) do {} while(0) + #endif /* CONFIG_PARAVIRT */ /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 430a0e30577..cf1437503ba 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,20 +270,20 @@ void paravirt_leave_lazy_mmu(void) leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); - set_thread_flag(TIF_LAZY_MMU_UPDATES); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { leave_lazy(PARAVIRT_LAZY_CPU); - if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } @@ -399,10 +399,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 57e49a8278a..d766c7616fd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the 
TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7115e608532..e8a9aaf9df8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* * Switch FS and GS. diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 950929c607d..55a5d6938e5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -467,16 +467,16 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } -static void vmi_leave_lazy_cpu(void) +static void vmi_end_context_switch(struct task_struct *next) { vmi_ops.set_lazy_mode(0); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } static void vmi_enter_lazy_mmu(void) @@ -722,9 +722,9 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 41a5562e710..5287081b356 100644 --- a/arch/x86/lguest/boot.c +++ 
b/arch/x86/lguest/boot.c @@ -153,10 +153,10 @@ static void lguest_leave_lazy_mmu_mode(void) paravirt_leave_lazy_mmu(); } -static void lguest_leave_lazy_cpu_mode(void) +static void lguest_end_context_switch(struct task_struct *next) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } /*G:033 @@ -1031,8 +1031,8 @@ __init void lguest_init(void) pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; - pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode; + pv_cpu_ops.start_context_switch = paravirt_start_context_switch; + pv_cpu_ops.end_context_switch = lguest_end_context_switch; /* pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f586e63b9a6..70b355d3a86 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static void xen_leave_lazy_cpu(void) +static void xen_end_context_switch(struct task_struct *next) { xen_mc_flush(); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } static unsigned long xen_store_tr(void) @@ -817,10 +817,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { /* Xen takes care of %gs when switching to usermode for us */ .swapgs = paravirt_nop, - .lazy_mode = { - .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy_cpu, - }, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, }; static const struct pv_apic_ops xen_apic_ops __initdata = { diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 235e34a7a34..09887045d03 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -74,7 +74,7 @@ static inline int pte_file(pte_t pte) { 
return 0; } #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_start_context_switch() do {} while (0) +#define arch_start_context_switch(prev) do {} while (0) #else /* !CONFIG_MMU */ /*****************************************************************************/ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 922f03671dd..e410f602cab 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -291,7 +291,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH -#define arch_start_context_switch() do {} while (0) +#define arch_start_context_switch(prev) do {} while (0) #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/kernel/sched.c b/kernel/sched.c index 7530fdd7c98..133762aece5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_start_context_switch(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; From 2829b449276aed45f3d649efb21e3418e39dd5d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:53:19 -0800 Subject: [PATCH 023/900] x86/paravirt: allow preemption with lazy mmu mode Impact: remove obsolete checks, simplification Lift restrictions on preemption with lazy mmu mode, as it is now allowed. 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 7 ++++--- arch/x86/xen/mmu.c | 8 +------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cf1437503ba..bf2e86eee80 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -247,7 +247,6 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = mode; } @@ -255,7 +254,6 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } @@ -272,6 +270,8 @@ void paravirt_leave_lazy_mmu(void) void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); @@ -281,6 +281,8 @@ void paravirt_start_context_switch(struct task_struct *prev) void paravirt_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + leave_lazy(PARAVIRT_LAZY_CPU); if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) @@ -300,7 +302,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f5f8faa4f76..3f2d0fe5e6a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -419,10 +419,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, 
unsigned long addr, pte_t *ptep, pte_t pteval) { - /* updates to init_mm may be done without lock */ - if (mm == &init_mm) - preempt_disable(); - ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -443,9 +439,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, } xen_set_pte(ptep, pteval); -out: - if (mm == &init_mm) - preempt_enable(); +out: return; } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, From 252a6bf2a3a7e7add56b17d48aecf3f3ef213103 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 00:11:28 -0800 Subject: [PATCH 024/900] mm: allow preemption in apply_to_pte_range Impact: allow preemption in apply_to_pte_range updates to init_mm Preemption is now allowed for lazy mmu mode, so don't disable it for the inner loop of apply_to_pte_range. This only applies when doing updates to init_mm; user pagetables are still modified under the pte lock, so preemption is disabled anyway. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b80cc31292b..baa999e87cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1718,7 +1718,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); - preempt_disable(); arch_enter_lazy_mmu_mode(); token = pmd_pgtable(*pmd); @@ -1730,7 +1729,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - preempt_enable(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); From ab2f75f0b760d2b0c9a875b669a1b51dce02c85a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 00:18:50 -0800 Subject: [PATCH 025/900] x86/paravirt: use percpu_ rather than __get_cpu_var Impact: minor optimisation percpu_read/write is a slightly more direct way of getting to percpu data. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/paravirt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bf2e86eee80..254e8aa8bfd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -246,16 +246,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -294,7 +294,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return __get_cpu_var(paravirt_lazy_mode); + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) From 5caecb9432428241d0c641897f07ff4003f1b55f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 20 Feb 2009 23:01:26 -0800 Subject: [PATCH 026/900] xen: disable preempt for leave_lazy_mmu xen_mc_flush() requires preemption to be disabled for its own sanity, so disable it while we're flushing. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f2d0fe5e6a..0e572380413 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1812,8 +1812,10 @@ __init void xen_post_allocator_init(void) static void xen_leave_lazy_mmu(void) { + preempt_disable(); xen_mc_flush(); paravirt_leave_lazy_mmu(); + preempt_enable(); } const struct pv_mmu_ops xen_mmu_ops __initdata = { From 59d7187142bbe9b404a403ed0f874d3227305f26 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Feb 2009 15:48:33 -0800 Subject: [PATCH 027/900] xen: separate p2m allocation from setting When doing very early p2m setting, we need to separate setting from allocation, so split things up accordingly. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 71 ++++++++++++++++++++++++++++++++-------------- arch/x86/xen/mmu.h | 3 ++ 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0e572380413..e0a55b7a6ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -233,47 +233,74 @@ unsigned long get_phys_to_machine(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_phys_to_machine); -static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) +/* install a new p2m_top page */ +bool install_p2mtop_page(unsigned long pfn, unsigned long *p) { - unsigned long *p; + unsigned topidx = p2m_top_index(pfn); + unsigned long **pfnp, *mfnp; unsigned i; - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + pfnp = &p2m_top[topidx]; + mfnp = &p2m_top_mfn[topidx]; for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; - if (cmpxchg(pp, p2m_missing, p) != p2m_missing) - free_page((unsigned long)p); - else + if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { *mfnp = virt_to_mfn(p); + return true; + } + + return false; +} + +static void alloc_p2m(unsigned long pfn) +{ + unsigned long *p; + + p = (void 
*)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(p == NULL); + + if (!install_p2mtop_page(pfn, p)) + free_page((unsigned long)p); +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, idx; + + if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == p2m_missing) { + if (mfn == INVALID_P2M_ENTRY) + return true; + return false; + } + + idx = p2m_index(pfn); + p2m_top[topidx][idx] = mfn; + + return true; } void set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); return; } - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return; - } + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + alloc_p2m(pfn); - topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - /* no need to allocate a page to store an invalid entry */ - if (mfn == INVALID_P2M_ENTRY) - return; - alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]); + if (!__set_phys_to_machine(pfn, mfn)) + BUG(); } - - idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; } unsigned long arbitrary_virt_to_mfn(void *vaddr) diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 24d1b44a337..da730262489 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -11,6 +11,9 @@ enum pt_level { }; +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +bool install_p2mtop_page(unsigned long pfn, unsigned long *p); + void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); From a2bcd4731f77cb77ae4b5e4a3d7f5471cf346c33 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Mar 2009 23:47:48 +0200 Subject: [PATCH 028/900] x86/mm: further cleanups of fault.c's include file section Impact: cleanup Eliminate 
more than 20 unnecessary #include lines in fault.c Also fix include file dependency bug in asm/traps.h. (this was masked before, by implicit inclusion) Signed-off-by: Ingo Molnar LKML-Reference: Acked-by: H. Peter Anvin --- arch/x86/include/asm/traps.h | 1 + arch/x86/mm/fault.c | 42 ++++++++---------------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..37fb07a9cda 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -2,6 +2,7 @@ #define _ASM_X86_TRAPS_H #include +#include /* TRAP_TRACE, ... */ #ifdef CONFIG_X86_32 #define dotraplinkage diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..24a36a6426a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,40 +3,16 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include /* STACK_END_MAGIC */ +#include /* test_thread_flag(), ... */ +#include /* oops_begin/end, ... */ +#include /* search_exception_table */ +#include /* max_low_pfn */ +#include /* __kprobes, ... */ +#include /* kmmio_handler, ... */ -#include - -#include -#include -#include -#include -#include -#include -#include +#include /* dotraplinkage, ... */ +#include /* pgd_*(), ... */ /* * Page fault error code bits: From 7571a60446030d2576d881438447e86a0755a83b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 15:34:59 -0800 Subject: [PATCH 029/900] xen: split construction of p2m mfn tables from registration Build the p2m_mfn_list_list early with the rest of the p2m table, but register it later when the real shared_info structure is in place. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e0a55b7a6ce..67d2ab45cd9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -184,7 +184,7 @@ static inline unsigned p2m_index(unsigned long pfn) } /* Build the parallel p2m_top_mfn structures */ -void xen_setup_mfn_list_list(void) +static void __init xen_build_mfn_list_list(void) { unsigned pfn, idx; @@ -198,7 +198,10 @@ void xen_setup_mfn_list_list(void) unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } +} +void xen_setup_mfn_list_list(void) +{ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = @@ -218,6 +221,8 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx] = &mfn_list[pfn]; } + + xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) From 6ed6bf428aff64fe37cdc54b239d598fee6016f1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 13:02:18 -0800 Subject: [PATCH 030/900] xen: clean up xen_load_gdt Makes the logic a bit clearer. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 70b355d3a86..5776dc27029 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -301,10 +301,21 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = arbitrary_virt_to_mfn((void *)va); + int level; + pte_t *ptep = lookup_address(va, &level); + unsigned long pfn, mfn; + void *virt; + + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + mfn = pfn_to_mfn(pfn); + virt = __va(PFN_PHYS(pfn)); + + frames[f] = mfn; make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(mfn_to_virt(frames[f])); + make_lowmem_page_readonly(virt); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); From 3ce5fa7ebff74b6a4dc5fdcdc22e6979f5a4ff85 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 15:26:00 -0800 Subject: [PATCH 031/900] xen: make xen_load_gdt simpler Remove use of multicall machinery which is unused (gdt loading is never performance critical). This removes the implicit use of percpu variables, which simplifies understanding how the percpu code's use of load_gdt interacts with this code. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5776dc27029..48b399bd6e0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -284,12 +284,11 @@ static void xen_set_ldt(const void *addr, unsigned entries) static void xen_load_gdt(const struct desc_ptr *dtr) { - unsigned long *frames; unsigned long va = dtr->address; unsigned int size = dtr->size + 1; unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long frames[pages]; int f; - struct multicall_space mcs; /* A GDT can be up to 64k in size, which corresponds to 8192 8-byte entries, or 16 4k pages.. */ @@ -297,9 +296,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr) BUG_ON(size > 65536); BUG_ON(va & ~PAGE_MASK); - mcs = xen_mc_entry(sizeof(*frames) * pages); - frames = mcs.args; - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { int level; pte_t *ptep = lookup_address(va, &level); @@ -314,13 +310,15 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames[f] = mfn; + printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n", + f, (void *)va, mfn, pfn, virt); + make_lowmem_page_readonly((void *)va); make_lowmem_page_readonly(virt); } - MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); - - xen_mc_issue(PARAVIRT_LAZY_CPU); + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); } static void load_TLS_descriptor(struct thread_struct *t, From b4b7e58590d0e94ed78bd6be1aa163caba7b6c74 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:34:27 -0800 Subject: [PATCH 032/900] xen: remove xen_load_gdt debug Don't need the noise. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 48b399bd6e0..75b7a0f9038 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -310,9 +310,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames[f] = mfn; - printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n", - f, (void *)va, mfn, pfn, virt); - make_lowmem_page_readonly((void *)va); make_lowmem_page_readonly(virt); } From e9e2d1ffcfdb38bed11a3064aa74bea9ee38ed80 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Thu, 5 Mar 2009 20:13:57 +0100 Subject: [PATCH 033/900] NULL noise: arch/x86/xen/smp.c Fix these sparse warnings: arch/x86/xen/smp.c:316:52: warning: Using plain integer as NULL pointer arch/x86/xen/smp.c:421:60: warning: Using plain integer as NULL pointer Signed-off-by: Hannes Eder Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8d470562ffc..304d832710c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -317,7 +317,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) BUG_ON(rc); while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); barrier(); } @@ -422,7 +422,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) /* Make sure other vcpus get a chance to run if they need to. */ for_each_cpu(cpu, mask) { if (xen_vcpu_stolen(cpu)) { - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); break; } } From e826fe1ba1563a9272345da8e3279a930ac160a7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 7 Mar 2009 17:09:27 -0800 Subject: [PATCH 034/900] xen: mask XSAVE from cpuid Xen leaves XSAVE set in cpuid, but doesn't allow cr4.OSXSAVE to be set.
This confuses the kernel and it ends up crashing on an xsetbv instruction. At boot time, try to set cr4.OSXSAVE, and mask XSAVE out of cpuid if we can't. This will produce a spurious error from Xen, but allows us to support XSAVE if/when Xen does. This also factors out the cpuid mask decisions to boot time. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 50 +++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 75b7a0f9038..da33e0c5870 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -168,21 +168,23 @@ static void __init xen_banner(void) xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; + static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { + unsigned maskecx = ~0; unsigned maskedx = ~0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible.
*/ - if (*ax == 1) - maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ - (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + if (*ax == 1) { + maskecx = cpuid_leaf1_ecx_mask; + maskedx = cpuid_leaf1_edx_mask; + } asm(XEN_EMULATE_PREFIX "cpuid" : "=a" (*ax), @@ -190,9 +192,43 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, "=c" (*cx), "=d" (*dx) : "0" (*ax), "2" (*cx)); + + *cx &= maskecx; *dx &= maskedx; } +static __init void xen_init_cpuid_mask(void) +{ + unsigned int ax, bx, cx, dx; + + cpuid_leaf1_edx_mask = + ~((1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + if (!xen_initial_domain()) + cpuid_leaf1_edx_mask &= + ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ + (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + + ax = 1; + xen_cpuid(&ax, &bx, &cx, &dx); + + /* cpuid claims we support xsave; try enabling it to see what happens */ + if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { + unsigned long cr4; + + set_in_cr4(X86_CR4_OSXSAVE); + + cr4 = read_cr4(); + + if ((cr4 & X86_CR4_OSXSAVE) == 0) + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); + + clear_in_cr4(X86_CR4_OSXSAVE); + } +} + static void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); @@ -901,6 +937,8 @@ asmlinkage void __init xen_start_kernel(void) xen_init_irq_ops(); + xen_init_cpuid_mask(); + #ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. From 68509cdcde6583ee1a9542899d1270449c7d5903 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 8 Mar 2009 03:59:04 -0700 Subject: [PATCH 035/900] x86-64: remove PGE from must-have feature list PGE may not be available when running paravirtualized, so test the cpuid bit before using it. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/required-features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index d5cd6c58688..a4737dddfd5 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -50,7 +50,7 @@ #ifdef CONFIG_X86_64 #define NEED_PSE 0 #define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_PGE (1<<(X86_FEATURE_PGE & 31)) +#define NEED_PGE 0 #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) #define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) #define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) From 1e7449730853e7c9ae9a2458b2ced7ba12559a0e Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: [PATCH 036/900] Xen: Add virt_to_pfn helper function Signed-off-by: Alex Nixon --- arch/x86/include/asm/xen/page.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 1a918dde46b..018a0a40079 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -124,7 +124,8 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) -#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) +#define virt_to_pfn(v) (PFN_DOWN(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) static inline unsigned long pte_mfn(pte_t pte) From 5f241e65f2be4661a33e1937e1c829252a80b2b8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Mar 2009 17:08:48 -0700 Subject: [PATCH 037/900] x86-64: non-paravirt systems always has PSE and PGE A paravirtualized system may not have PSE or PGE available to guests, so they are not required features. 
However, without paravirt we can assume that any x86-64 implementation will have them available. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/required-features.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index a4737dddfd5..64cf2d24fad 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -48,9 +48,15 @@ #endif #ifdef CONFIG_X86_64 +#ifdef CONFIG_PARAVIRT +/* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_PGE 0 +#else +#define NEED_PSE (1<<(X86_FEATURE_PSE & 31)) +#define NEED_PGE (1<<(X86_FEATURE_PGE & 31)) +#endif +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) #define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) #define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) From 4185f35404dc96f8525298c7c548aee419f3b3f4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Mar 2009 13:30:55 -0700 Subject: [PATCH 038/900] xen/mmu: some early pagetable cleanups 1. make sure early-allocated ptes are pinned, so they can be later unpinned 2. don't pin pmd+pud, just make them RO 3.
scatter some __inits around Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 40 ++++++++++++++++++++++++++++------------ arch/x86/xen/xen-ops.h | 2 -- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67d2ab45cd9..df87c803cec 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1013,7 +1013,7 @@ static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, return 0; } -void __init xen_mark_init_mm_pinned(void) +static void __init xen_mark_init_mm_pinned(void) { xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1461,10 +1461,29 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) } #endif +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +} + +/* Used for pmd and pud */ +static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +{ #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ #endif @@ -1473,18 +1492,15 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. 
*/ -static void xen_release_pte_init(unsigned long pfn) +static __init void xen_release_pte_init(unsigned long pfn) { + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +static __init void xen_release_pmd_init(unsigned long pfn) { - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } /* This needs to make sure the new pte page is pinned iff its being @@ -1873,9 +1889,9 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd = xen_alloc_pmd_init, .alloc_pmd_clone = paravirt_nop, - .release_pmd = xen_release_pte_init, + .release_pmd = xen_release_pmd_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, @@ -1914,8 +1930,8 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, - .alloc_pud = xen_alloc_pte_init, - .release_pud = xen_release_pte_init, + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, #endif /* PAGETABLE_LEVELS == 4 */ .activate_mm = xen_activate_mm, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f897cdffccb..5c50a1017a3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -56,8 +56,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); -void xen_mark_init_mm_pinned(void); - void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP From 8de07bbdede03598801cf33ab23dcbcd28a918d2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 17:36:57 -0800 Subject: [PATCH 039/900] xen/mmu: weaken flush_tlb_other test Impact: fixes crashing bug There's no particular problem with getting an empty cpu mask, so just 
shortcut-return if we get one. Avoids crash reported by Christophe Saout Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index df87c803cec..e425a32e0a9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1293,8 +1293,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - BUG_ON(cpumask_empty(cpus)); - BUG_ON(!mm); + if (cpumask_empty(cpus)) + return; /* nothing to do */ mcs = xen_mc_entry(sizeof(*args)); args = mcs.args; From 1e6fcf840e11ceff8a656a678c6e4b0560a98e08 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 25 Mar 2009 17:46:42 +0000 Subject: [PATCH 040/900] xen: resume interrupts before system devices. Impact: bugfix Xen domain restore Otherwise the first timer interrupt after resume is missed and we never get another. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/manage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 3ccd348d112..b703dd2c9f1 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -68,15 +68,15 @@ static int xen_suspend(void *data) gnttab_resume(); xen_mm_unpin_all(); - sysdev_resume(); - device_power_up(PMSG_RESUME); - if (!*cancelled) { xen_irq_resume(); xen_console_resume(); xen_timer_resume(); } + sysdev_resume(); + device_power_up(PMSG_RESUME); + return 0; } From 707ebbc81c61eb480d8a51ca61e355e240df1d32 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Mar 2009 11:29:02 -0700 Subject: [PATCH 041/900] xen: set _PAGE_NX in __supported_pte_mask before pagetable construction Some 64-bit machines don't support the NX flag in ptes. Check for NX before constructing the kernel pagetables. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index da33e0c5870..80f4c534349 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -912,7 +913,6 @@ static const struct machine_ops __initdata xen_machine_ops = { .emergency_restart = xen_emergency_restart, }; - /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -980,6 +980,11 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); +#ifdef CONFIG_X86_64 + /* Work out if we support NX */ + check_efer(); +#endif + /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; From 6d02c42698f99eccb290ac53d4f10ca883b9f90c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 29 Mar 2009 22:57:15 -0700 Subject: [PATCH 042/900] xen: clean up gate trap/interrupt constants Use GATE_INTERRUPT/TRAP rather than 0xe/f. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 80f4c534349..12a3159333b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -428,7 +428,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - if (val->type != 0xf && val->type != 0xe) + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; @@ -436,8 +436,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, info->cs = gate_segment(*val); info->flags = val->dpl; /* interrupt gates clear IF */ - if (val->type == 0xe) - info->flags |= 4; + if (val->type == GATE_INTERRUPT) + info->flags |= 1 << 2; return 1; } From d4c045364d3107603187f21a56ec231e74d26441 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:20:31 -0800 Subject: [PATCH 043/900] xen: add irq_from_evtchn Given an evtchn, return the corresponding irq. 
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/events.c | 6 ++++++ include/xen/events.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 30963af5dba..1cd2a0e15ae 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq) return info_for_irq(irq)->evtchn; } +unsigned irq_from_evtchn(unsigned int evtchn) +{ + return evtchn_to_irq[evtchn]; +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); diff --git a/include/xen/events.h b/include/xen/events.h index 0d5f1adc036..e68d59a90ca 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq); irq will be disabled so it won't deliver an interrupt. */ void xen_poll_irq(int irq); +/* Determine the IRQ which is bound to an event channel */ +unsigned irq_from_evtchn(unsigned int evtchn); + #endif /* _XEN_EVENTS_H */ From f7116284c734f3a47180cd9c907944a1837ccb3c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:21:19 -0800 Subject: [PATCH 044/900] xen: add /dev/xen/evtchn driver This driver is used by application which wish to receive notifications from the hypervisor or other guests via Xen's event channel mechanism. In particular it is used by the xenstore daemon in domain 0. 
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/Kconfig | 10 + drivers/xen/Makefile | 3 +- drivers/xen/evtchn.c | 494 +++++++++++++++++++++++++++++++++++++++++++ include/xen/evtchn.h | 88 ++++++++ 4 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/evtchn.c create mode 100644 include/xen/evtchn.h diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 526187c8a12..1bbb9108f31 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES secure, but slightly less efficient. If in doubt, say yes. +config XEN_DEV_EVTCHN + tristate "Xen /dev/xen/evtchn device" + depends on XEN + default y + help + The evtchn driver allows a userspace process to trigger event + channels and to receive notification of an event channel + firing. + If in doubt, say yes. + config XENFS tristate "Xen filesystem" depends on XEN diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ff8accc9e10..1567639847e 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -4,4 +4,5 @@ obj-y += xenbus/ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o -obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file +obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +obj-$(CONFIG_XENFS) += xenfs/ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c new file mode 100644 index 00000000000..517b9ee63e1 --- /dev/null +++ b/drivers/xen/evtchn.c @@ -0,0 +1,494 @@ +/****************************************************************************** + * evtchn.c + * + * Driver for receiving and demuxing event-channel signals.
+ * + * Copyright (c) 2004-2005, K A Fraser + * Multi-process extensions Copyright (c) 2004, Steven Smith + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct per_user_data { + /* Notification ring, accessed via /dev/xen/evtchn. 
*/ +#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + evtchn_port_t *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + struct mutex ring_cons_mutex; /* protect against concurrent readers */ + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; + const char *name; +}; + +/* Who's bound to each port? */ +static struct per_user_data *port_user[NR_EVENT_CHANNELS]; +static DEFINE_SPINLOCK(port_user_lock); + +irqreturn_t evtchn_interrupt(int irq, void *data) +{ + unsigned int port = (unsigned long)data; + struct per_user_data *u; + + spin_lock(&port_user_lock); + + u = port_user[port]; + + disable_irq_nosync(irq); + + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + wmb(); /* Ensure ring contents visible */ + if (u->ring_cons == u->ring_prod++) { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, + SIGIO, POLL_IN); + } + } else { + u->ring_overflow = 1; + } + + spin_unlock(&port_user_lock); + + return IRQ_HANDLED; +} + +static ssize_t evtchn_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + struct per_user_data *u = file->private_data; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + if (count == 0) + return 0; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + for (;;) { + mutex_lock(&u->ring_cons_mutex); + + rc = -EFBIG; + if (u->ring_overflow) + goto unlock_out; + + c = u->ring_cons; + p = u->ring_prod; + if (c != p) + break; + + mutex_unlock(&u->ring_cons_mutex); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible(u->evtchn_wait, + u->ring_cons != u->ring_prod); + if (rc) + return rc; + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. 
*/ + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + sizeof(evtchn_port_t); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + } else { + bytes1 = (p - c) * sizeof(evtchn_port_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if (bytes1 > count) { + bytes1 = count; + bytes2 = 0; + } else if ((bytes1 + bytes2) > count) { + bytes2 = count - bytes1; + } + + rc = -EFAULT; + rmb(); /* Ensure that we see the port before we copy it. */ + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + ((bytes2 != 0) && + copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) + goto unlock_out; + + u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + rc = bytes1 + bytes2; + + unlock_out: + mutex_unlock(&u->ring_cons_mutex); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if (kbuf == NULL) + return -ENOMEM; + + /* Whole number of ports. 
*/ + count &= ~(sizeof(evtchn_port_t)-1); + + rc = 0; + if (count == 0) + goto out; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + rc = -EFAULT; + if (copy_from_user(kbuf, buf, count) != 0) + goto out; + + spin_lock_irq(&port_user_lock); + for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) + if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) + enable_irq(irq_from_evtchn(kbuf[i])); + spin_unlock_irq(&port_user_lock); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_bind_to_user(struct per_user_data *u, int port) +{ + int irq; + int rc = 0; + + spin_lock_irq(&port_user_lock); + + BUG_ON(port_user[port] != NULL); + + irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + u->name, (void *)(unsigned long)port); + if (rc < 0) + goto fail; + + port_user[port] = u; + +fail: + spin_unlock_irq(&port_user_lock); + return rc; +} + +static void evtchn_unbind_from_user(struct per_user_data *u, int port) +{ + int irq = irq_from_evtchn(port); + + unbind_from_irqhandler(irq, (void *)(unsigned long)port); + port_user[port] = NULL; +} + +static long evtchn_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc; + struct per_user_data *u = file->private_data; + void __user *uarg = (void __user *) arg; + + switch (cmd) { + case IOCTL_EVTCHN_BIND_VIRQ: { + struct ioctl_evtchn_bind_virq bind; + struct evtchn_bind_virq bind_virq; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_virq.virq = bind.virq; + bind_virq.vcpu = 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_virq.port); + if (rc == 0) + rc = bind_virq.port; + break; + } + + case IOCTL_EVTCHN_BIND_INTERDOMAIN: { + struct ioctl_evtchn_bind_interdomain bind; + struct evtchn_bind_interdomain bind_interdomain; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + 
bind_interdomain.remote_dom = bind.remote_domain; + bind_interdomain.remote_port = bind.remote_port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_interdomain.local_port); + if (rc == 0) + rc = bind_interdomain.local_port; + break; + } + + case IOCTL_EVTCHN_BIND_UNBOUND_PORT: { + struct ioctl_evtchn_bind_unbound_port bind; + struct evtchn_alloc_unbound alloc_unbound; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = bind.remote_domain; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, alloc_unbound.port); + if (rc == 0) + rc = alloc_unbound.port; + break; + } + + case IOCTL_EVTCHN_UNBIND: { + struct ioctl_evtchn_unbind unbind; + + rc = -EFAULT; + if (copy_from_user(&unbind, uarg, sizeof(unbind))) + break; + + rc = -EINVAL; + if (unbind.port >= NR_EVENT_CHANNELS) + break; + + spin_lock_irq(&port_user_lock); + + rc = -ENOTCONN; + if (port_user[unbind.port] != u) { + spin_unlock_irq(&port_user_lock); + break; + } + + evtchn_unbind_from_user(u, unbind.port); + + spin_unlock_irq(&port_user_lock); + + rc = 0; + break; + } + + case IOCTL_EVTCHN_NOTIFY: { + struct ioctl_evtchn_notify notify; + + rc = -EFAULT; + if (copy_from_user(&notify, uarg, sizeof(notify))) + break; + + if (notify.port >= NR_EVENT_CHANNELS) { + rc = -EINVAL; + } else if (port_user[notify.port] != u) { + rc = -ENOTCONN; + } else { + notify_remote_via_evtchn(notify.port); + rc = 0; + } + break; + } + + case IOCTL_EVTCHN_RESET: { + /* Initialise the ring to empty. Clear errors.
*/ + mutex_lock(&u->ring_cons_mutex); + spin_lock_irq(&port_user_lock); + u->ring_cons = u->ring_prod = u->ring_overflow = 0; + spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->ring_cons_mutex); + rc = 0; + break; + } + + default: + rc = -ENOSYS; + break; + } + + return rc; +} + +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + struct per_user_data *u = file->private_data; + + poll_wait(file, &u->evtchn_wait, wait); + if (u->ring_cons != u->ring_prod) + mask |= POLLIN | POLLRDNORM; + if (u->ring_overflow) + mask = POLLERR; + return mask; +} + +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + struct per_user_data *u = filp->private_data; + return fasync_helper(fd, filp, on, &u->evtchn_async_queue); +} + +static int evtchn_open(struct inode *inode, struct file *filp) +{ + struct per_user_data *u; + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (u == NULL) + return -ENOMEM; + + u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm); + if (u->name == NULL) { + kfree(u); + return -ENOMEM; + } + + init_waitqueue_head(&u->evtchn_wait); + + u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + if (u->ring == NULL) { + kfree(u->name); + kfree(u); + return -ENOMEM; + } + + mutex_init(&u->ring_cons_mutex); + + filp->private_data = u; + + return 0; +} + +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + struct per_user_data *u = filp->private_data; + + spin_lock_irq(&port_user_lock); + + free_page((unsigned long)u->ring); + + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + if (port_user[i] != u) + continue; + + evtchn_unbind_from_user(port_user[i], i); + } + + spin_unlock_irq(&port_user_lock); + + kfree(u->name); + kfree(u); + + return 0; +} + +static const struct file_operations evtchn_fops = { + .owner = THIS_MODULE, + .read = evtchn_read, + .write = evtchn_write, + .unlocked_ioctl = evtchn_ioctl, + .poll = evtchn_poll, + .fasync = evtchn_fasync, + .open 
= evtchn_open, + .release = evtchn_release, +}; + +static struct miscdevice evtchn_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "evtchn", + .fops = &evtchn_fops, +}; +static int __init evtchn_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + spin_lock_init(&port_user_lock); + memset(port_user, 0, sizeof(port_user)); + + /* Create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if (err != 0) { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + printk(KERN_INFO "Event-channel device installed.\n"); + + return 0; +} + +static void __exit evtchn_cleanup(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(evtchn_init); +module_exit(evtchn_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h new file mode 100644 index 00000000000..14e833ee4e0 --- /dev/null +++ b/include/xen/evtchn.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * evtchn.h + * + * Interface to /dev/xen/evtchn. 
+ * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_EVTCHN_H__ +#define __LINUX_PUBLIC_EVTCHN_H__ + +/* + * Bind a fresh port to VIRQ @virq. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_VIRQ \ + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq)) +struct ioctl_evtchn_bind_virq { + unsigned int virq; +}; + +/* + * Bind a fresh port to remote <@remote_domain, @remote_port>. + * Return allocated port. 
+ */ +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain)) +struct ioctl_evtchn_bind_interdomain { + unsigned int remote_domain, remote_port; +}; + +/* + * Allocate a fresh port for binding to @remote_domain. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port)) +struct ioctl_evtchn_bind_unbound_port { + unsigned int remote_domain; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_UNBIND \ + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind)) +struct ioctl_evtchn_unbind { + unsigned int port; +}; + +/* + * Send a notification via previously allocated @port. + */ +#define IOCTL_EVTCHN_NOTIFY \ + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify)) +struct ioctl_evtchn_notify { + unsigned int port; +}; + +/* Clear and reinitialise the event buffer. Clear error condition. */ +#define IOCTL_EVTCHN_RESET \ + _IOC(_IOC_NONE, 'E', 5, 0) + +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */ From c5cfef0f79cacc3aa438fc28f4747f0d10c54d0d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:21:19 -0800 Subject: [PATCH 045/900] xen: export ioctl headers to userspace Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- include/Kbuild | 1 + include/xen/Kbuild | 1 + 2 files changed, 2 insertions(+) create mode 100644 include/xen/Kbuild diff --git a/include/Kbuild b/include/Kbuild index d8c3e3cbf41..fe36accd432 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -8,3 +8,4 @@ header-y += mtd/ header-y += rdma/ header-y += video/ header-y += drm/ +header-y += xen/ diff --git a/include/xen/Kbuild b/include/xen/Kbuild new file mode 100644 index 00000000000..4e65c16a445 --- /dev/null +++ b/include/xen/Kbuild @@ -0,0 +1 @@ +header-y += evtchn.h From 0a4666b539a0e896ec4e8396a034a479e3573125 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 13:03:24 -0800 Subject: [PATCH
046/900] xen/dev-evtchn: clean up locking in evtchn Define a new per_user_data mutex to serialize bind/unbind operations to prevent them from racing with each other. Fix error returns and don't do a bind while holding a spinlock. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/evtchn.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 517b9ee63e1..af031950f9b 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -54,6 +54,8 @@ #include struct per_user_data { + struct mutex bind_mutex; /* serialize bind/unbind operations */ + /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) #define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) @@ -69,7 +71,7 @@ struct per_user_data { /* Who's bound to each port? */ static struct per_user_data *port_user[NR_EVENT_CHANNELS]; -static DEFINE_SPINLOCK(port_user_lock); +static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ irqreturn_t evtchn_interrupt(int irq, void *data) { @@ -210,22 +212,24 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { - int irq; int rc = 0; - spin_lock_irq(&port_user_lock); - + /* + * Ports are never reused, so every caller should pass in a + * unique port. + * + * (Locking not necessary because we haven't registered the + * interrupt handler yet, and our caller has already + * serialized bind operations.) 
+ */ BUG_ON(port_user[port] != NULL); - - irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc < 0) - goto fail; - port_user[port] = u; -fail: - spin_unlock_irq(&port_user_lock); + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + u->name, (void *)(unsigned long)port); + if (rc >= 0) + rc = 0; + return rc; } @@ -234,6 +238,10 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port) int irq = irq_from_evtchn(port); unbind_from_irqhandler(irq, (void *)(unsigned long)port); + + /* make sure we unbind the irq handler before clearing the port */ + barrier(); + port_user[port] = NULL; } @@ -244,6 +252,9 @@ static long evtchn_ioctl(struct file *file, struct per_user_data *u = file->private_data; void __user *uarg = (void __user *) arg; + /* Prevent bind from racing with unbind */ + mutex_lock(&u->bind_mutex); + switch (cmd) { case IOCTL_EVTCHN_BIND_VIRQ: { struct ioctl_evtchn_bind_virq bind; @@ -368,6 +379,7 @@ static long evtchn_ioctl(struct file *file, rc = -ENOSYS; break; } + mutex_unlock(&u->bind_mutex); return rc; } @@ -414,6 +426,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) return -ENOMEM; } + mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); filp->private_data = u; From a1ce1be578365a4da7e7d7db4812539d2d5da763 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 9 Feb 2009 12:05:50 -0800 Subject: [PATCH 047/900] xen: remove suspend_cancel hook Remove suspend_cancel hook from xenbus_driver, in preparation for using the device model for suspending. 
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenbus/xenbus_probe.c | 23 ----------------------- include/xen/xenbus.h | 1 - 2 files changed, 24 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 773d1cf2328..bd20361fb09 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -689,27 +689,6 @@ static int suspend_dev(struct device *dev, void *data) return 0; } -static int suspend_cancel_dev(struct device *dev, void *data) -{ - int err = 0; - struct xenbus_driver *drv; - struct xenbus_device *xdev; - - DPRINTK(""); - - if (dev->driver == NULL) - return 0; - drv = to_xenbus_driver(dev->driver); - xdev = container_of(dev, struct xenbus_device, dev); - if (drv->suspend_cancel) - err = drv->suspend_cancel(xdev); - if (err) - printk(KERN_WARNING - "xenbus: suspend_cancel %s failed: %i\n", - dev_name(dev), err); - return 0; -} - static int resume_dev(struct device *dev, void *data) { int err; @@ -777,8 +756,6 @@ EXPORT_SYMBOL_GPL(xenbus_resume); void xenbus_suspend_cancel(void) { xs_suspend_cancel(); - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev); - xenbus_backend_resume(suspend_cancel_dev); } EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index f87f9614844..0836772b968 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -92,7 +92,6 @@ struct xenbus_driver { enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); - int (*suspend_cancel)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, char **, int, char *, int); struct device_driver driver; From de5b31bd47de7e6f41be2e271318dbc8f1af354d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 9 Feb 2009 12:05:50 -0800 Subject: [PATCH 048/900] xen: use device model for suspending xenbus devices 
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/manage.c | 9 ++++---- drivers/xen/xenbus/xenbus_probe.c | 37 ++++++++----------------------- drivers/xen/xenbus/xenbus_xs.c | 2 ++ include/xen/xenbus.h | 2 +- 4 files changed, 16 insertions(+), 34 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index b703dd2c9f1..5269bb4d249 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -104,9 +104,8 @@ static void do_suspend(void) goto out; } - printk("suspending xenbus...\n"); - /* XXX use normal device tree? */ - xenbus_suspend(); + printk(KERN_DEBUG "suspending xenstore...\n"); + xs_suspend(); err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); if (err) { @@ -116,9 +115,9 @@ static void do_suspend(void) if (!cancelled) { xen_arch_resume(); - xenbus_resume(); + xs_resume(); } else - xenbus_suspend_cancel(); + xs_suspend_cancel(); device_resume(PMSG_RESUME); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index bd20361fb09..4649213bed9 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name); static void xenbus_dev_shutdown(struct device *_dev); +static int xenbus_dev_suspend(struct device *dev, pm_message_t state); +static int xenbus_dev_resume(struct device *dev); + /* If something in array of ids matches this device, return it. 
*/ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = { .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, .dev_attrs = xenbus_dev_attrs, + + .suspend = xenbus_dev_suspend, + .resume = xenbus_dev_resume, }, }; @@ -669,7 +675,7 @@ static struct xenbus_watch fe_watch = { .callback = frontend_changed, }; -static int suspend_dev(struct device *dev, void *data) +static int xenbus_dev_suspend(struct device *dev, pm_message_t state) { int err = 0; struct xenbus_driver *drv; @@ -682,14 +688,14 @@ static int suspend_dev(struct device *dev, void *data) drv = to_xenbus_driver(dev->driver); xdev = container_of(dev, struct xenbus_device, dev); if (drv->suspend) - err = drv->suspend(xdev); + err = drv->suspend(xdev, state); if (err) printk(KERN_WARNING "xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } -static int resume_dev(struct device *dev, void *data) +static int xenbus_dev_resume(struct device *dev) { int err; struct xenbus_driver *drv; @@ -734,31 +740,6 @@ static int resume_dev(struct device *dev, void *data) return 0; } -void xenbus_suspend(void) -{ - DPRINTK(""); - - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); - xenbus_backend_suspend(suspend_dev); - xs_suspend(); -} -EXPORT_SYMBOL_GPL(xenbus_suspend); - -void xenbus_resume(void) -{ - xb_init_comms(); - xs_resume(); - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); - xenbus_backend_resume(resume_dev); -} -EXPORT_SYMBOL_GPL(xenbus_resume); - -void xenbus_suspend_cancel(void) -{ - xs_suspend_cancel(); -} -EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); - /* A flag to determine if xenstored is 'ready' (i.e. 
has started) */ int xenstored_ready = 0; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index e325eab4724..eab33f1dbdf 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -673,6 +673,8 @@ void xs_resume(void) struct xenbus_watch *watch; char token[sizeof(watch) * 2 + 1]; + xb_init_comms(); + mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); up_write(&xs_state.transaction_mutex); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 0836772b968..b9763badbd7 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -91,7 +91,7 @@ struct xenbus_driver { void (*otherend_changed)(struct xenbus_device *dev, enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); - int (*suspend)(struct xenbus_device *dev); + int (*suspend)(struct xenbus_device *dev, pm_message_t state); int (*resume)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, char **, int, char *, int); struct device_driver driver; From c6a960ce8858f20036cc3afc3b9422670d0d9021 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:53 -0800 Subject: [PATCH 049/900] xen/xenbus: export xenbus_dev_changed Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenbus/xenbus_probe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 4649213bed9..d42e25d5968 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -660,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) kfree(root); } +EXPORT_SYMBOL_GPL(xenbus_dev_changed); static void frontend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) From cff7e81b3dd7c25cd2248cd7a04c5764552d5d55 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 10 Mar 2009 14:39:59 -0700 Subject: [PATCH 050/900] xen: add /sys/hypervisor support Adds support for Xen 
info under /sys/hypervisor. Taken from Novell 2.6.27 backport tree. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/Kconfig | 10 + drivers/xen/Makefile | 3 +- drivers/xen/sys-hypervisor.c | 475 ++++++++++++++++++++++++++++++++ include/xen/interface/version.h | 3 + 4 files changed, 490 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/sys-hypervisor.c diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 526187c8a12..88bca1c42db 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -41,3 +41,13 @@ config XEN_COMPAT_XENFS a xen platform. If in doubt, say yes. +config XEN_SYS_HYPERVISOR + bool "Create xen entries under /sys/hypervisor" + depends on XEN && SYSFS + select SYS_HYPERVISOR + default y + help + Create entries under /sys/hypervisor describing the Xen + hypervisor environment. When running native or in another + virtual environment, /sys/hypervisor will still be present, + but will have no xen contents. \ No newline at end of file diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ff8accc9e10..f3603a39db5 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -4,4 +4,5 @@ obj-y += xenbus/ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o -obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file +obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c new file mode 100644 index 00000000000..cb29d1ccf0d --- /dev/null +++ b/drivers/xen/sys-hypervisor.c @@ -0,0 +1,475 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define HYPERVISOR_ATTR_RO(_name) \ +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) + +#define HYPERVISOR_ATTR_RW(_name) \ +static struct hyp_sysfs_attr _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +struct hyp_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct hyp_sysfs_attr *, char *); + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); + void *hyp_attr_data; +}; + +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return sprintf(buffer, "xen\n"); +} + +HYPERVISOR_ATTR_RO(type); + +static int __init xen_sysfs_type_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &type_attr.attr); +} + +static void xen_sysfs_type_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); +} + +/* xen version attributes */ +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version >> 16); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(major); + +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version & 0xff); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(minor); + +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *extra; + + extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL); + if (extra) { + ret = HYPERVISOR_xen_version(XENVER_extraversion, extra); + if (!ret) + ret = sprintf(buffer, "%s\n", extra); + kfree(extra); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(extra); + +static struct attribute *version_attrs[] = { + &major_attr.attr, + &minor_attr.attr, + &extra_attr.attr, + NULL +}; + +static struct attribute_group version_group = { + .name = "version", + .attrs = 
version_attrs, +}; + +static int __init xen_sysfs_version_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &version_group); +} + +static void xen_sysfs_version_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &version_group); +} + +/* UUID */ + +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + char *vm, *val; + int ret; + extern int xenstored_ready; + + if (!xenstored_ready) + return -EBUSY; + + vm = xenbus_read(XBT_NIL, "vm", "", NULL); + if (IS_ERR(vm)) + return PTR_ERR(vm); + val = xenbus_read(XBT_NIL, vm, "uuid", NULL); + kfree(vm); + if (IS_ERR(val)) + return PTR_ERR(val); + ret = sprintf(buffer, "%s\n", val); + kfree(val); + return ret; +} + +HYPERVISOR_ATTR_RO(uuid); + +static int __init xen_sysfs_uuid_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); +} + +static void xen_sysfs_uuid_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); +} + +/* xen compilation attributes */ + +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compiler); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiler); + +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_by); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiled_by); + +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if 
(info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_date); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compile_date); + +static struct attribute *xen_compile_attrs[] = { + &compiler_attr.attr, + &compiled_by_attr.attr, + &compile_date_attr.attr, + NULL +}; + +static struct attribute_group xen_compilation_group = { + .name = "compilation", + .attrs = xen_compile_attrs, +}; + +int __init static xen_compilation_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); +} + +static void xen_compilation_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); +} + +/* xen properties info */ + +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *caps; + + caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL); + if (caps) { + ret = HYPERVISOR_xen_version(XENVER_capabilities, caps); + if (!ret) + ret = sprintf(buffer, "%s\n", caps); + kfree(caps); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(capabilities); + +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *cset; + + cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL); + if (cset) { + ret = HYPERVISOR_xen_version(XENVER_changeset, cset); + if (!ret) + ret = sprintf(buffer, "%s\n", cset); + kfree(cset); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(changeset); + +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_platform_parameters *parms; + + parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL); + if (parms) { + ret = HYPERVISOR_xen_version(XENVER_platform_parameters, + parms); + if (!ret) + ret = sprintf(buffer, "%lx\n", parms->virt_start); + kfree(parms); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(virtual_start); + +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int 
ret; + + ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL); + if (ret > 0) + ret = sprintf(buffer, "%x\n", ret); + + return ret; +} + +HYPERVISOR_ATTR_RO(pagesize); + +/* eventually there will be several more features to export */ +static ssize_t xen_feature_show(int index, char *buffer) +{ + int ret = -ENOMEM; + struct xen_feature_info *info; + + info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL); + if (info) { + info->submap_idx = index; + ret = HYPERVISOR_xen_version(XENVER_get_features, info); + if (!ret) + ret = sprintf(buffer, "%d\n", info->submap); + kfree(info); + } + + return ret; +} + +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return xen_feature_show(XENFEAT_writable_page_tables, buffer); +} + +HYPERVISOR_ATTR_RO(writable_pt); + +static struct attribute *xen_properties_attrs[] = { + &capabilities_attr.attr, + &changeset_attr.attr, + &virtual_start_attr.attr, + &pagesize_attr.attr, + &writable_pt_attr.attr, + NULL +}; + +static struct attribute_group xen_properties_group = { + .name = "properties", + .attrs = xen_properties_attrs, +}; + +static int __init xen_properties_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_properties_group); +} + +static void xen_properties_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); +} + +#ifdef CONFIG_KEXEC + +extern size_t vmcoreinfo_size_xen; +extern unsigned long paddr_vmcoreinfo_xen; + +static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page) +{ + return sprintf(page, "%lx %zx\n", + paddr_vmcoreinfo_xen, vmcoreinfo_size_xen); +} + +HYPERVISOR_ATTR_RO(vmcoreinfo); + +static int __init xen_sysfs_vmcoreinfo_init(void) +{ + return sysfs_create_file(hypervisor_kobj, + &vmcoreinfo_attr.attr); +} + +static void xen_sysfs_vmcoreinfo_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr); +} + +#endif + +static int __init hyper_sysfs_init(void) +{ + int ret; + + if (!xen_domain()) + return 
-ENODEV; + + ret = xen_sysfs_type_init(); + if (ret) + goto out; + ret = xen_sysfs_version_init(); + if (ret) + goto version_out; + ret = xen_compilation_init(); + if (ret) + goto comp_out; + ret = xen_sysfs_uuid_init(); + if (ret) + goto uuid_out; + ret = xen_properties_init(); + if (ret) + goto prop_out; +#ifdef CONFIG_KEXEC + if (vmcoreinfo_size_xen != 0) { + ret = xen_sysfs_vmcoreinfo_init(); + if (ret) + goto vmcoreinfo_out; + } +#endif + + goto out; + +#ifdef CONFIG_KEXEC +vmcoreinfo_out: +#endif + xen_properties_destroy(); +prop_out: + xen_sysfs_uuid_destroy(); +uuid_out: + xen_compilation_destroy(); +comp_out: + xen_sysfs_version_destroy(); +version_out: + xen_sysfs_type_destroy(); +out: + return ret; +} + +static void __exit hyper_sysfs_exit(void) +{ +#ifdef CONFIG_KEXEC + if (vmcoreinfo_size_xen != 0) + xen_sysfs_vmcoreinfo_destroy(); +#endif + xen_properties_destroy(); + xen_compilation_destroy(); + xen_sysfs_uuid_destroy(); + xen_sysfs_version_destroy(); + xen_sysfs_type_destroy(); + +} +module_init(hyper_sysfs_init); +module_exit(hyper_sysfs_exit); + +static ssize_t hyp_sysfs_show(struct kobject *kobj, + struct attribute *attr, + char *buffer) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->show) + return hyp_attr->show(hyp_attr, buffer); + return 0; +} + +static ssize_t hyp_sysfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t len) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->store) + return hyp_attr->store(hyp_attr, buffer, len); + return 0; +} + +static struct sysfs_ops hyp_sysfs_ops = { + .show = hyp_sysfs_show, + .store = hyp_sysfs_store, +}; + +static struct kobj_type hyp_sysfs_kobj_type = { + .sysfs_ops = &hyp_sysfs_ops, +}; + +static int __init hypervisor_subsys_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type; + 
return 0; +} +device_initcall(hypervisor_subsys_init); diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h index 453235e923f..e8b6519d47e 100644 --- a/include/xen/interface/version.h +++ b/include/xen/interface/version.h @@ -57,4 +57,7 @@ struct xen_feature_info { /* Declares the features reported by XENVER_get_features. */ #include "features.h" +/* arg == NULL; returns host memory page size. */ +#define XENVER_pagesize 7 + #endif /* __XEN_PUBLIC_VERSION_H__ */ From a649b720614d5675dc402bef75a92576143fede7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 10 Mar 2009 17:17:41 -0700 Subject: [PATCH 051/900] xen/sys/hypervisor: change writable_pt to features /sys/hypervisor/properties/writable_pt was misnamed. Rename to features, expressed as a bit array in hex. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/sys-hypervisor.c | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index cb29d1ccf0d..1267d6fcc0c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -293,37 +293,48 @@ static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) HYPERVISOR_ATTR_RO(pagesize); -/* eventually there will be several more features to export */ static ssize_t xen_feature_show(int index, char *buffer) { - int ret = -ENOMEM; - struct xen_feature_info *info; + ssize_t ret; + struct xen_feature_info info; - info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL); - if (info) { - info->submap_idx = index; - ret = HYPERVISOR_xen_version(XENVER_get_features, info); - if (!ret) - ret = sprintf(buffer, "%d\n", info->submap); - kfree(info); - } + info.submap_idx = index; + ret = HYPERVISOR_xen_version(XENVER_get_features, &info); + if (!ret) + ret = sprintf(buffer, "%08x", info.submap); return ret; } -static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer) +static ssize_t 
features_show(struct hyp_sysfs_attr *attr, char *buffer) { - return xen_feature_show(XENFEAT_writable_page_tables, buffer); + ssize_t len; + int i; + + len = 0; + for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) { + int ret = xen_feature_show(i, buffer + len); + if (ret < 0) { + if (len == 0) + len = ret; + break; + } + len += ret; + } + if (len > 0) + buffer[len++] = '\n'; + + return len; } -HYPERVISOR_ATTR_RO(writable_pt); +HYPERVISOR_ATTR_RO(features); static struct attribute *xen_properties_attrs[] = { &capabilities_attr.attr, &changeset_attr.attr, &virtual_start_attr.attr, &pagesize_attr.attr, - &writable_pt_attr.attr, + &features_attr.attr, NULL }; From f0783708bf63a2827863cf2be57c08a24843e6bd Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 11 Mar 2009 10:19:54 +0000 Subject: [PATCH 052/900] xen: drop kexec bits from /sys/hypervisor since kexec isn't implemented yet I needed this to compile since there is no kexec yet in pvops kernel CC drivers/xen/sys-hypervisor.o drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_init': drivers/xen/sys-hypervisor.c:405: error: 'vmcoreinfo_size_xen' undeclared (first use in this function) drivers/xen/sys-hypervisor.c:405: error: (Each undeclared identifier is reported only once drivers/xen/sys-hypervisor.c:405: error: for each function it appears in.) 
drivers/xen/sys-hypervisor.c:406: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_init' drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_exit': drivers/xen/sys-hypervisor.c:433: error: 'vmcoreinfo_size_xen' undeclared (first use in this function) drivers/xen/sys-hypervisor.c:434: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_destroy' Signed-off-by: Ian Campbell --- drivers/xen/sys-hypervisor.c | 41 ------------------------------------ 1 file changed, 41 deletions(-) diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 1267d6fcc0c..88a60e03ccf 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -353,32 +353,6 @@ static void xen_properties_destroy(void) sysfs_remove_group(hypervisor_kobj, &xen_properties_group); } -#ifdef CONFIG_KEXEC - -extern size_t vmcoreinfo_size_xen; -extern unsigned long paddr_vmcoreinfo_xen; - -static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page) -{ - return sprintf(page, "%lx %zx\n", - paddr_vmcoreinfo_xen, vmcoreinfo_size_xen); -} - -HYPERVISOR_ATTR_RO(vmcoreinfo); - -static int __init xen_sysfs_vmcoreinfo_init(void) -{ - return sysfs_create_file(hypervisor_kobj, - &vmcoreinfo_attr.attr); -} - -static void xen_sysfs_vmcoreinfo_destroy(void) -{ - sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr); -} - -#endif - static int __init hyper_sysfs_init(void) { int ret; @@ -401,20 +375,9 @@ static int __init hyper_sysfs_init(void) ret = xen_properties_init(); if (ret) goto prop_out; -#ifdef CONFIG_KEXEC - if (vmcoreinfo_size_xen != 0) { - ret = xen_sysfs_vmcoreinfo_init(); - if (ret) - goto vmcoreinfo_out; - } -#endif goto out; -#ifdef CONFIG_KEXEC -vmcoreinfo_out: -#endif - xen_properties_destroy(); prop_out: xen_sysfs_uuid_destroy(); uuid_out: @@ -429,10 +392,6 @@ out: static void __exit hyper_sysfs_exit(void) { -#ifdef CONFIG_KEXEC - if (vmcoreinfo_size_xen != 0) - xen_sysfs_vmcoreinfo_destroy(); -#endif xen_properties_destroy(); 
xen_compilation_destroy(); xen_sysfs_uuid_destroy(); From 818fd20673df82031e604bb784d836f1fc2e2451 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 6 Feb 2009 18:46:47 -0800 Subject: [PATCH 053/900] xen: add "capabilities" file The xenfs capabilities file allows usermode to determine what capabilities the domain has. The only one at present is "control_d" in a privileged domain. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/super.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 515741a8e6b..6559e0c752c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -20,10 +20,27 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static ssize_t capabilities_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + char *tmp = ""; + + if (xen_initial_domain()) + tmp = "control_d\n"; + + return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp)); +} + +static const struct file_operations capabilities_file_ops = { + .read = capabilities_read, +}; + static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, + [1] = {}, + { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, {""}, }; From 53152f957d4a5dfd537d17c823afeb1a2c03753e Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 2 Apr 2009 13:24:28 +0100 Subject: [PATCH 054/900] xen: honour VCPU availability on boot If a VM is booted with offline VCPUs then unplug them during boot. Determining the availability of a VCPU requires access to XenStore which is not available at the point smp_prepare_cpus() is called, therefore we bring up all VCPUS initially and unplug the offline ones as soon as XenStore becomes available. 
Signed-off-by: Ian Campbell --- drivers/xen/cpu_hotplug.c | 40 +++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 974f56d1ebe..411cb1fc927 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -21,29 +21,41 @@ static void disable_hotplug_cpu(int cpu) cpu_clear(cpu, cpu_present_map); } -static void vcpu_hotplug(unsigned int cpu) +static int vcpu_online(unsigned int cpu) { int err; char dir[32], state[32]; - if (!cpu_possible(cpu)) - return; - sprintf(dir, "cpu/%u", cpu); err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); if (err != 1) { printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); - return; + return err; } - if (strcmp(state, "online") == 0) { + if (strcmp(state, "online") == 0) + return 1; + else if (strcmp(state, "offline") == 0) + return 0; + + printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); + return -EINVAL; +} +static void vcpu_hotplug(unsigned int cpu) +{ + if (!cpu_possible(cpu)) + return; + + switch (vcpu_online(cpu)) { + case 1: enable_hotplug_cpu(cpu); - } else if (strcmp(state, "offline") == 0) { + break; + case 0: (void)cpu_down(cpu); disable_hotplug_cpu(cpu); - } else { - printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", - state, cpu); + break; + default: + break; } } @@ -64,12 +76,20 @@ static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, static int setup_cpu_watcher(struct notifier_block *notifier, unsigned long event, void *data) { + int cpu; static struct xenbus_watch cpu_watch = { .node = "cpu", .callback = handle_vcpu_hotplug_event}; (void)register_xenbus_watch(&cpu_watch); + for_each_possible_cpu(cpu) { + if (vcpu_online(cpu) == 0) { + (void)cpu_down(cpu); + cpu_clear(cpu, cpu_present_map); + } + } + return NOTIFY_DONE; } From 31c9a24ec82926fcae49483e53566d231e705057 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 2 Apr 2009 21:06:25 -0700 Subject: [PATCH 055/900] RCU: make treercu be default Impact: switch default config from CLASSIC_RCU to TREE_RCU Given that I have not gotten any complaints or bug reports on treercu recently, this patch makes it be the default. There are a number of other defconfig files that explicitly call out CLASSIC_RCU, but which have comment headers saying not to edit them. Probably holdovers from one of the flavors of "make config", but... Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: niv@us.ibm.com Cc: manfred@colorfullife.com Cc: peterz@infradead.org LKML-Reference: <20090403040625.GA9473@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 14c483d2b7c..26ac8772464 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -302,7 +302,7 @@ menu "RCU Subsystem" choice prompt "RCU Implementation" - default CLASSIC_RCU + default TREE_RCU config CLASSIC_RCU bool "Classic RCU" From ca5f9524d61f54b1f618293ab92fc6b49cac864d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:33 -0700 Subject: [PATCH 056/900] futex: separate futex_wait_queue_me() logic from futex_wait() Refactor futex_wait() in preparation for futex_wait_requeue_pi(). In order to reuse a good chunk of the futex_wait() code for the upcoming futex_wait_requeue_pi() function, this patch breaks out the queue-to-wakeup section of futex_wait() into futex_wait_queue_me(). 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 138 +++++++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6b50a024bca..ebb48d6d1a8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1115,24 +1115,87 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + * @wait: the wait_queue to add to the futex_q after queueing in the hb + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout, + wait_queue_t *wait) +{ + queue_me(q, hb); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + + /* add_wait_queue is the barrier after __set_current_state. */ + __set_current_state(TASK_INTERRUPTIBLE); + + /* + * Add current as the futex_q waiter. We don't remove ourselves from + * the wait_queue because we are the only user of it. + */ + add_wait_queue(&q->waiter, wait); + + /* Arm the timer */ + if (timeout) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + /* + * !plist_node_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 
+ */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { - struct task_struct *curr = current; + struct hrtimer_sleeper timeout, *to = NULL; + DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; - DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; u32 uval; int ret; - struct hrtimer_sleeper t; - int rem = 0; if (!bitset) return -EINVAL; q.pi_state = NULL; q.bitset = bitset; + + if (abs_time) { + to = &timeout; + + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1178,75 +1241,22 @@ retry_private: goto retry; } ret = -EWOULDBLOCK; + + /* Only actually queue if *uaddr contained val. */ if (unlikely(uval != val)) { queue_unlock(&q, hb); goto out_put_key; } - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); - - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiter, &wait); - /* - * !plist_node_empty() is safe here without any lock. 
- * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, - clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, - current->timer_slack_ns); - - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; - - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. - */ - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - - /* Flag if a timeout occured */ - rem = (t.task == NULL); - - destroy_hrtimer_on_stack(&t.timer); - } - } - __set_current_state(TASK_RUNNING); - - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to, &wait); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; if (!unqueue_me(&q)) goto out_put_key; ret = -ETIMEDOUT; - if (rem) + if (to && !to->task) goto out_put_key; /* @@ -1275,6 +1285,10 @@ retry_private: out_put_key: put_futex_key(fshared, &q.key); out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } return ret; } From 4b1c486b3587d2abf50bee4a05eb488cd4045f2c Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:42 -0700 Subject: [PATCH 057/900] futex: add helper to find the top prio waiter of a futex Improve legibility by wrapping finding the top waiter in a function. This will be used by the follow-on patches for enabling requeue pi. 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index ebb48d6d1a8..421fb5e42a1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -276,6 +276,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { u32 curval; From 1a52084d0919c2799258737c21fb328a9de159b5 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:52 -0700 Subject: [PATCH 058/900] futex: split out atomic logic from futex_lock_pi() Refactor the atomic portion of futex_lock_pi() into futex_lock_pi_atomic(). This logic will be needed by requeue_pi, so modularize it to reduce code duplication. The only significant change is passing of the task to try and take the lock for. This simplifies the -EDEADLK test as if the lock is owned by task t, it's a deadlock, regardless of if we are doing requeue pi or not. This patch updates the corresponding comment accordingly. 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 224 ++++++++++++++++++++++++++++--------------------- 1 file changed, 130 insertions(+), 94 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 421fb5e42a1..986b16e4453 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -556,6 +556,127 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the lookup + * @task: the task to perform the atomic lock work for. This will be + * "current" except in the case of requeue pi. + * + * Returns: + * 0 - ready to wait + * 1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task) +{ + int lock_taken, ret, ownerdied = 0; + u32 uval, newval, curval; + +retry: + ret = lock_taken = 0; + + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = task_pid_vnr(task); + + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + return -EDEADLK; + + /* + * Surprise - we got the lock. Just return to userspace: + */ + if (unlikely(!curval)) + return 1; + + uval = curval; + + /* + * Set the FUTEX_WAITERS flag, so the owner will know it has someone + * to wake at the next unlock. 
+ */ + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED. We take over the futex in this + * case. We also do an unconditional take over, when the owner + * of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + ownerdied = 0; + lock_taken = 1; + } + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + + /* + * We took the lock due to owner died take over. + */ + if (unlikely(lock_taken)) + return 1; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, key, ps); + + if (unlikely(ret)) { + switch (ret) { + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; + goto retry; + } + default: + break; + } + } + + return ret; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. 
@@ -1340,9 +1461,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; - u32 uval, newval, curval; + u32 uval; struct futex_q q; - int ret, lock_taken, ownerdied = 0; + int ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1365,81 +1486,15 @@ retry: retry_private: hb = queue_lock(&q); -retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_put_key; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_put_key; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! 
- */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_put_key; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current); if (unlikely(ret)) { switch (ret) { - + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; case -EAGAIN: /* * Task is exiting and we just wait for the @@ -1449,25 +1504,6 @@ retry_locked: put_futex_key(fshared, &q.key); cond_resched(); goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } default: goto out_unlock_put_key; } From dd9739980b50c8cde33e1f8eb08b7e0140bcd61e Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:02 -0700 Subject: [PATCH 059/900] futex: split out fixup owner logic from futex_lock_pi() Refactor the post lock acquisition logic from futex_lock_pi(). This code will be reused in futex_wait_requeue_pi(). 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 158 ++++++++++++++++++++++++++++--------------------- 1 file changed, 89 insertions(+), 69 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 986b16e4453..af831fbb7fb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1255,6 +1255,79 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @fshared: whether the futex is shared (1) or not (0) + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + * 1 - success, lock taken + * 0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, + int locked) +{ + struct task_struct *owner; + int ret = 0; + + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current, fshared); + goto out; + } + + /* + * Catch the rare case, where the lock was released when we were on the + * way back before we locked the hash bucket. + */ + if (q->pi_state->owner == current) { + /* + * Try to get the rt_mutex now. This might fail as some other + * task acquired the rt_mutex after we removed ourself from the + * rt_mutex waiters list. 
+ */ + if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { + locked = 1; + goto out; + } + + /* + * pi_state is incorrect, some other task did a lock steal and + * we returned due to timeout or signal without taking the + * rt_mutex. Too late. We can access the rt_mutex_owner without + * locking, as the other task is now blocked on the hash bucket + * lock. Fix the state up. + */ + owner = rt_mutex_owner(&q->pi_state->pi_mutex); + ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + goto out; + } + + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner, nor the pending owner, of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + +out: + return ret ? ret : locked; +} + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller @@ -1459,11 +1532,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval; struct futex_q q; - int ret; + int res, ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1527,71 +1599,21 @@ retry_private: } spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. 
- */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock it and return the fault to userspace. + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. */ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1599,9 +1621,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? 
ret : -ERESTARTNOINTR; + goto out; out_unlock_put_key: queue_unlock(&q, hb); @@ -1611,7 +1631,7 @@ out_put_key: out: if (to) destroy_hrtimer_on_stack(&to->timer); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: /* From 8dac456a681bd94272ff50ecb31be6b669382c2b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:12 -0700 Subject: [PATCH 060/900] rt_mutex: add proxy lock routines This patch is a prerequisite for futex requeue_pi. It basically splits rt_mutex_slowlock() right down the middle, just before the first call to schedule(). It further adds helper functions which make use of the split and provide the rt-mutex preliminaries for futex requeue_pi. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/rtmutex.c | 288 +++++++++++++++++++++++++++++----------- kernel/rtmutex_common.h | 8 ++ 2 files changed, 219 insertions(+), 77 deletions(-) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..fec77e7e056 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, + struct task_struct *task) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) if (!rt_mutex_owner_pending(lock)) return 0; - if (pendowner == current) + if (pendowner == task) return 1; spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { + if (task->prio >= pendowner->prio) { spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * We are going to steal the lock and a waiter was * enqueued on the 
pending owners pi_waiters queue. So * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another + * task->pi_waiters list. This covers the case, + * where task is boosted because it holds another * lock and gets unboosted because the booster is * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. + * priority as task->normal_prio. * * Note: in the rare case of a SCHED_OTHER task changing * its priority and thus stealing the lock, next->task - * might be current: + * might be task: */ - if (likely(next->task != current)) { - spin_lock_irqsave(&current->pi_lock, flags); - plist_add(&next->pi_list_entry, &current->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(&current->pi_lock, flags); + if (likely(next->task != task)) { + spin_lock_irqsave(&task->pi_lock, flags); + plist_add(&next->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) return 0; /* We got the lock. 
*/ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(&current->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; + task->pi_blocked_on = waiter; - spin_unlock_irqrestore(&current->pi_lock, flags); + spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + task); spin_lock(&lock->wait_lock); @@ -605,6 +607,85 @@ void rt_mutex_adjust_pi(struct task_struct *task) rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: passed to task_blocks_on_rt_mutex + * + * 
lock->wait_lock must be held by the caller. + */ +static int __sched +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret = 0; + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter->task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter->task) { + ret = task_blocks_on_rt_mutex(lock, waiter, current, + detect_deadlock); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter->task)) { + /* + * Reset the return value. We might + * have returned with -EDEADLK and the + * owner released the lock while we + * were walking the pi chain. + */ + ret = 0; + continue; + } + if (unlikely(ret)) + break; + } + + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + if (waiter->task) + schedule_rt_mutex(lock); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + return ret; +} + /* * Slow path lock function: */ @@ -636,62 +717,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? 
*/ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - /* - * waiter.task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter.task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - - spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(&waiter); - - if (waiter.task) - schedule_rt_mutex(lock); - - spin_lock(&lock->wait_lock); - set_current_state(state); - } + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + detect_deadlock); set_current_state(TASK_RUNNING); @@ -985,6 +1012,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_deadlock_account_unlock(proxy_owner); } +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + mark_rt_mutex_waiters(lock); + + if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + /* We got the lock for task. 
*/ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task, 0); + + rt_mutex_deadlock_account_lock(lock, task); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + + if (ret && !waiter->task) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + /** * rt_mutex_next_owner - return the next owner of the lock * @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) return rt_mutex_top_waiter(lock)->task; } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, + detect_deadlock); + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter->task)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Readjust priority, when we did not get the lock. We might have been + * the pending owner and boosted. 
Since we did not take the lock, the + * PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800e..97a2f81866a 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" From a72188d8a64ebe74722f1cf7ffac41b41ffdba21 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:22 -0700 Subject: [PATCH 061/900] futex: add FUTEX_HAS_TIMEOUT flag to restart.futex.flags Currently restart is only used if there is a timeout. The requeue_pi functionality requires restarting to futex_lock_pi() on signal after wakeup in futex_wait_requeue_pi() regardless of if there was a timeout or not. Using 0 for the timeout value is confusing as that could indicate an expired timer. The flag makes this explicit. While the check is not technically needed in futex_wait_restart(), doing so makes the code consistent with and will avoid confusion should the need arise to restart wait without a timeout. 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index af831fbb7fb..6b597cf33b0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1252,6 +1252,7 @@ handle_fault: */ #define FLAGS_SHARED 0x01 #define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); @@ -1486,7 +1487,7 @@ retry_private: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = 0; + restart->futex.flags = FLAGS_HAS_TIMEOUT; if (fshared) restart->futex.flags |= FLAGS_SHARED; @@ -1510,13 +1511,16 @@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; - ktime_t t; + ktime_t t, *tp = NULL; - t.tv64 = restart->futex.time; + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } From 9121e4783cd5c7e2a407763f3b61c2d573891133 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:31 -0700 Subject: [PATCH 062/900] futex: distangle futex_requeue() futex_requeue() is getting a bit long-winded, and will be getting more so after the requeue_pi patch. Factor out the actual requeueing into a nicely contained inline function to reduce function length and improve legibility. 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6b597cf33b0..e76942e2a79 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -940,6 +940,34 @@ out: return ret; } +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + /* * Requeue all waiters hashed on one physical page to another * physical page. @@ -999,20 +1027,7 @@ retry_private: if (++ret <= nr_wake) { wake_futex(this); } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); + requeue_futex(this, hb1, hb2, &key2); drop_count++; if (ret - nr_wake >= nr_requeue) From f801073f87aa22ddf0e9146355fec3993163790f Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:40 -0700 Subject: [PATCH 063/900] futex: split out futex value validation code Refactor the code to validate the expected futex value in order to reuse it with the requeue_pi code. 
Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 130 ++++++++++++++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 51 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index e76942e2a79..dbe857aa438 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1398,6 +1398,82 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, __set_current_state(TASK_RUNNING); } +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; + + /* + * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we queued after testing *uaddr, that would open + * a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. 
+ * + * A consequence is that futex_wait() can return zero and absorb + * a wakeup when *uaddr != val on entry to the syscall. This is + * rare, but normal. + */ +retry: + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key); + if (unlikely(ret != 0)) + goto out; + +retry_private: + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + if (ret) { + queue_unlock(q, *hb); + + ret = get_user(uval, uaddr); + if (ret) + goto out; + + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &q->key); + goto retry; + } + + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } + +out: + if (ret) + put_futex_key(fshared, &q->key); + return ret; +} + static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { @@ -1406,7 +1482,6 @@ static int futex_wait(u32 __user *uaddr, int fshared, struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q; - u32 uval; int ret; if (!bitset) @@ -1425,58 +1500,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } -retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) goto out; -retry_private: - hb = queue_lock(&q); - - /* - * Access the page AFTER the hash-bucket is locked. - * Order is important: - * - * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); - * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } - * - * The basic logical guarantee of a futex is that it blocks ONLY - * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with - * cond(var) false, which would violate the guarantee. 
- * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. - * - * For shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. - */ - ret = get_futex_value_locked(&uval, uaddr); - - if (unlikely(ret)) { - queue_unlock(&q, hb); - - ret = get_user(uval, uaddr); - if (ret) - goto out_put_key; - - if (!fshared) - goto retry_private; - - put_futex_key(fshared, &q.key); - goto retry; - } - ret = -EWOULDBLOCK; - - /* Only actually queue if *uaddr contained val. */ - if (unlikely(uval != val)) { - queue_unlock(&q, hb); - goto out_put_key; - } - /* queue_me and wait for wakeup, timeout, or a signal. */ futex_wait_queue_me(hb, &q, to, &wait); From 52400ba946759af28442dee6265c5c0180ac7122 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:49 -0700 Subject: [PATCH 064/900] futex: add requeue_pi functionality PI Futexes and their underlying rt_mutex cannot be left ownerless if there are pending waiters as this will break the PI boosting logic, so the standard requeue commands aren't sufficient. The new commands properly manage pi futex ownership by ensuring a futex with waiters has an owner at all times. This will allow glibc to properly handle pi mutexes with pthread_condvars. The approach taken here is to create two new futex op codes: FUTEX_WAIT_REQUEUE_PI: Tasks will use this op code to wait on a futex (such as a non-pi waitqueue) and wake after they have been requeued to a pi futex. Prior to returning to userspace, they will acquire this pi futex (and the underlying rt_mutex). futex_wait_requeue_pi() is the result of a high speed collision between futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being done by futex_proxy_trylock_atomic() on behalf of the top_waiter). 
FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI): This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI, regardless of how many tasks the caller intends to wake or requeue. pthread_cond_broadcast() should call this with nr_wake=1 and nr_requeue=INT_MAX. pthread_cond_signal() should call this with nr_wake=1 and nr_requeue=0. The reason being we need both callers to get the benefit of the futex_proxy_trylock_atomic() routine. futex_requeue() also enqueues the top_waiter on the rt_mutex via rt_mutex_start_proxy_lock(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- include/linux/futex.h | 8 + include/linux/thread_info.h | 3 +- kernel/futex.c | 523 ++++++++++++++++++++++++++++++++++-- 3 files changed, 512 insertions(+), 22 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index 3bf5bb5a34f..b05519ca9e5 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -23,6 +23,9 @@ union ktime; #define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_REQUEUE_PI 12 +#define FUTEX_CMP_REQUEUE_PI 13 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -38,6 +41,11 @@ union ktime; #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) /* * Support for robust futexes: the kernel cleans up held futexes at diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e6b820f8b56..a8cc4e13434 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -21,13 +21,14 @@ struct 
restart_block { struct { unsigned long arg0, arg1, arg2, arg3; }; - /* For futex_wait */ + /* For futex_wait and futex_wait_requeue_pi */ struct { u32 *uaddr; u32 val; u32 flags; u32 bitset; u64 time; + u32 *uaddr2; } futex; /* For nanosleep */ struct { diff --git a/kernel/futex.c b/kernel/futex.c index dbe857aa438..185c981d89e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet * + * Requeue-PI support by Darren Hart + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -109,6 +113,9 @@ struct futex_q { struct futex_pi_state *pi_state; struct task_struct *task; + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; + /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. 
Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up(&q->waiter); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomically + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps) +{ + struct futex_q *top_waiter; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. + * The pi_state is returned in ps in contended cases.
+ */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. 
+ */ + if (nr_wake != 1) + return -EINVAL; + } retry: + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -1020,19 +1145,94 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* Attempt to acquire uaddr2 and wake the top_waiter. */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) - continue; - if (++ret <= nr_wake) { - wake_futex(this); - } else { - requeue_futex(this, hb1, hb2, &key2); - drop_count++; + if (task_count - nr_wake >= nr_requeue) + break; - if (ret - nr_wake >= nr_requeue) - break; + if (!match_futex(&this->key, &key1)) + continue; + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. 
If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { + wake_futex(this); + continue; } + + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } + } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -1047,7 +1247,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. 
*/ @@ -1270,6 +1472,7 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); +static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; + q.rt_waiter = NULL; if (abs_time) { to = &timeout; @@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1701,6 +1906,20 @@ uaddr_faulted: goto retry; } +static long futex_lock_pi_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + ktime_t t, *tp = NULL; + int fshared = restart->futex.flags & FLAGS_SHARED; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); +} /* * Userspace attempted a TID -> 0 atomic transition, and failed. @@ -1803,6 +2022,253 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) 
+ */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else { + /* + * We expect signal_pending(current), but another + * thread may have handled it for us already. + */ + /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if + * the user specified SA_RESTART or not? */ + ret = -ERESTARTSYS; + } + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + DECLARE_WAITQUEUE(wait, current); + struct restart_block *restart; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + u32 uval; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. 
+ */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) { + put_futex_key(fshared, &key2); + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to, &wait); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquisition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, propagate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+ */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + ret = -EFAULT; + if (get_user(uval, uaddr2)) + goto out_put_keys; + + /* + * We've already been requeued, so restart by calling + * futex_lock_pi() directly, rather than returning to this + * function. + */ + ret = -ERESTART_RESTARTBLOCK; + restart = &current_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->futex.uaddr = (u32 *)uaddr2; + restart->futex.val = uval; + restart->futex.flags = 0; + if (abs_time) { + restart->futex.flags |= FLAGS_HAS_TIMEOUT; + restart->futex.time = abs_time->tv64; + } + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + } + +out_put_keys: + put_futex_key(fshared, &q.key); + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time.
@@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. 
* number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; From cac94f979326212831c0ea44ed9ea1622b4f4e93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:33 +0200 Subject: [PATCH 065/900] x86, bts: fix race when bts tracer is removed When the bts tracer is removed while the traced task is running, the write to clear the bts tracer pointer races with context switch code. Read the tracer once during a context switch. When a new tracer is installed, the bts tracer is set in the ds context before the tracer is initialized in order to claim the context for that tracer. This may result in write accesses using an uninitialized trace configuration when scheduling timestamps have been requested. Store active tracing flags separately and only set active flags after the tracing configuration has been initialized. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144548.881338000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 58 ++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index b1d6e1f502f..c730155bf54 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -89,6 +89,9 @@ struct bts_tracer { /* Buffer overflow notification function: */ bts_ovfl_callback_t ovfl; + + /* Active flags affecting trace collection. 
*/ + unsigned int flags; }; struct pebs_tracer { @@ -799,6 +802,8 @@ void ds_suspend_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = 0; + task = tracer->ds.context->task; if (!task || (task == current)) @@ -820,6 +825,8 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = tracer->trace.ds.flags; + task = tracer->ds.context->task; control = ds_cfg.ctl[dsf_bts]; @@ -1037,43 +1044,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } +static inline void ds_take_timestamp(struct ds_context *context, + enum bts_qualifier qualifier, + struct task_struct *task) +{ + struct bts_tracer *tracer = context->bts_master; + struct bts_struct ts; + + /* Prevent compilers from reading the tracer pointer twice. */ + barrier(); + + if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) + return; + + memset(&ts, 0, sizeof(ts)); + ts.qualifier = qualifier; + ts.variant.timestamp.jiffies = jiffies_64; + ts.variant.timestamp.pid = task->pid; + + bts_write(tracer, &ts); +} + /* * Change the DS configuration from tracing prev to tracing next. */ void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + unsigned long debugctlmsr = next->thread.debugctlmsr; + + /* Make sure all data is read before we start. 
*/ + barrier(); if (prev_ctx) { update_debugctlmsr(0); - if (prev_ctx->bts_master && - (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_departs, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = prev->pid - }; - bts_write(prev_ctx->bts_master, &ts); - } + ds_take_timestamp(prev_ctx, bts_task_departs, prev); } if (next_ctx) { - if (next_ctx->bts_master && - (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_arrives, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = next->pid - }; - bts_write(next_ctx->bts_master, &ts); - } + ds_take_timestamp(next_ctx, bts_task_arrives, next); wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } - update_debugctlmsr(next->thread.debugctlmsr); + update_debugctlmsr(debugctlmsr); } void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: [PATCH 066/900] sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. 
Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b94f3541f67..a5b9a83065f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d507..f91bc8141dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } +/* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. 
+ */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + /* * wait_task_inactive - wait for a thread to unschedule. * From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:35 +0200 Subject: [PATCH 067/900] mm, x86, ptrace, bts: defer branch trace stopping When a ptraced task is unlinked, we need to stop branch tracing for that task. Since the unlink is called with interrupts disabled, and we need interrupts enabled to stop branch tracing, we defer the work. Collect all branch tracing related stuff in a branch tracing context. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: Andrew Morton Cc: Peter Zijlstra Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144550.712401000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 - arch/x86/kernel/ptrace.c | 254 ++++++++++++++++++++----------- include/linux/mm.h | 3 +- include/linux/sched.h | 9 +- mm/mlock.c | 13 +- 5 files changed, 179 insertions(+), 104 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 34c52370f2f..2483807e06e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -458,10 +458,6 @@ struct thread_struct { /* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ struct ds_context *ds_ctx; #endif /* CONFIG_X86_DS */ -#ifdef CONFIG_X86_PTRACE_BTS -/* the signal to send on a bts buffer overflow */ - unsigned int bts_ovfl_signal; -#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long 
native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fe9345c967d..7c21d1e8cae 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. 
*/ + struct work_struct work; +}; + +static inline void alloc_bts_buffer(struct bts_context *context, + unsigned int size) +{ + void *buffer; + + buffer = alloc_locked_buffer(size); + if (buffer) { + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + } +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_buffer_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = 
trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = 
alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + free_bts_buffer(context); + if (!cfg.size) + return 0; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + alloc_bts_buffer(context, cfg.size); + if (!context->buffer) + return -ENOMEM; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = ds_request_bts(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = 
sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) +static inline void ptrace_bts_fork(struct task_struct *tsk) { tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; } -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +static inline void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. 
*/ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ @@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..64d8ed2538a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,6 +13,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1321,6 +1322,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); -extern void release_locked_buffer(void *buffer, size_t size); +extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a5b9a83065f..52b8cd049c2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,8 +96,8 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; -struct bts_tracer; struct fs_struct; +struct bts_context; /* * List of flags we want to share for kernel threads, @@ -1210,12 +1210,7 @@ struct 
task_struct { * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ - struct bts_tracer *bts; - /* - * The buffer to hold the BTS data. - */ - void *bts_buffer; - size_t bts_size; + struct bts_context *bts; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ diff --git a/mm/mlock.c b/mm/mlock.c index cbe9e0581b7..749383b442c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -660,21 +660,20 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void release_locked_buffer(void *buffer, size_t size) +void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); - current->mm->total_vm -= pgsz; - current->mm->locked_vm -= pgsz; + mm->total_vm -= pgsz; + mm->locked_vm -= pgsz; - up_write(&current->mm->mmap_sem); + up_write(&mm->mmap_sem); } void free_locked_buffer(void *buffer, size_t size) { - release_locked_buffer(buffer, size); - + refund_locked_buffer_memory(current->mm, size); kfree(buffer); } From 8d99b3ac2726e5edd97ad147fa5c1f2acb63a745 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:36 +0200 Subject: [PATCH 068/900] x86, bts: wait until traced task has been scheduled out In order to stop branch tracing for a running task, we need to first clear the branch tracing control bits before we may free the tracing buffer. If the traced task is running, the cpu might still trace that task after the branch trace control bits have cleared. Wait until the traced task has been scheduled out before proceeding. A similar problem affects the task debug store context. We first remove the context, then we need to wait until the task has been scheduled out before we can free the context memory. 
Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144551.919636000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index c730155bf54..5cd137ab267 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -299,6 +299,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) static inline void ds_put_context(struct ds_context *context) { + struct task_struct *task; unsigned long irq; if (!context) @@ -313,14 +314,20 @@ static inline void ds_put_context(struct ds_context *context) *(context->this) = NULL; - if (context->task) - clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + task = context->task; - if (!context->task || (context->task == current)) + if (task) + clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); + + if (!task || (task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); spin_unlock_irqrestore(&ds_lock, irq); + /* The context might still be in use for context switching. */ + if (task && (task != current)) + wait_task_context_switch(task); + kfree(context); } @@ -781,15 +788,23 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, void ds_release_bts(struct bts_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_bts(tracer); WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; - put_tracer(tracer->ds.context->task); + /* Make sure tracing stopped and the tracer is not in use. 
*/ + if (task && (task != current)) + wait_task_context_switch(task); + + put_tracer(task); ds_put_context(tracer->ds.context); kfree(tracer); From 38f801129ad07b9afa7f9bd3779f61b805416d8c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:37 +0200 Subject: [PATCH 069/900] x86, bts: fix race between per-task and per-cpu branch tracing Per-task branch tracing installs a debug store context with the traced task. This immediately results in the branch trace control bits to be cleared for the next context switch of that task, if not set before. Either per-cpu or per-task tracing are allowed at the same time. An active per-cpu tracing would be disabled even if the per-task tracing request is rejected and the task debug store context removed. Check the tracing type (per-cpu or per-task) before installing a task debug store context. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144552.856000000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 72 +++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5cd137ab267..f03f117eff8 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -193,12 +193,28 @@ static DEFINE_SPINLOCK(ds_lock); */ static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task) { - if (task) + int error; + + spin_lock_irq(&ds_lock); + + if (task) { + error = -EPERM; + if (atomic_read(&tracers) < 0) + goto out; atomic_inc(&tracers); - else + } else { + error = -EPERM; + if (atomic_read(&tracers) > 0) + goto out; atomic_dec(&tracers); + } + + error = 0; +out: + spin_unlock_irq(&ds_lock); + return error; } static inline void put_tracer(struct task_struct *task) @@ -209,14 +225,6 @@ static 
inline void put_tracer(struct task_struct *task) atomic_inc(&tracers); } -static inline int check_tracer(struct task_struct *task) -{ - return task ? - (atomic_read(&tracers) >= 0) : - (atomic_read(&tracers) <= 0); -} - - /* * The DS context is either attached to a thread or to a cpu: * - in the former case, the thread_struct contains a pointer to the @@ -677,6 +685,10 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. @@ -684,7 +696,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -695,14 +707,9 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - error = -EPERM; if (tracer->ds.context->bts_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->bts_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -716,13 +723,13 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -741,6 +748,10 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. 
@@ -748,7 +759,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -758,14 +769,9 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - error = -EPERM; if (tracer->ds.context->pebs_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->pebs_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -775,13 +781,13 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -804,8 +810,8 @@ void ds_release_bts(struct bts_tracer *tracer) if (task && (task != current)) wait_task_context_switch(task); - put_tracer(task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } @@ -861,16 +867,20 @@ void ds_resume_bts(struct bts_tracer *tracer) void ds_release_pebs(struct pebs_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_pebs(tracer); WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; - put_tracer(tracer->ds.context->task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } From 15879d042164650b93d83281ad5f87ad323bfbfe Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:38 +0200 Subject: [PATCH 070/900] x86, bts: use trace_clock_global() for timestamps Rename the bts_struct timestamp field to event. Use trace_clock_global() for time measurement. 
Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144553.773216000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 4 ++-- arch/x86/kernel/ds.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a8f672ba100..772f141afb9 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -170,9 +170,9 @@ struct bts_struct { } lbr; /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ struct { - __u64 jiffies; + __u64 clock; pid_t pid; - } timestamp; + } event; } variant; }; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f03f117eff8..2071b992c35 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -471,7 +472,7 @@ enum bts_field { bts_flags, bts_qual = bts_from, - bts_jiffies = bts_to, + bts_clock = bts_to, bts_pid = bts_flags, bts_qual_mask = (bts_qual_max - 1), @@ -517,8 +518,8 @@ bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) memset(out, 0, sizeof(*out)); if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); - out->variant.timestamp.pid = bts_get(at, bts_pid); + out->variant.event.clock = bts_get(at, bts_clock); + out->variant.event.pid = bts_get(at, bts_pid); } else { out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); @@ -555,8 +556,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) case bts_task_arrives: case bts_task_departs: bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); - bts_set(raw, bts_pid, in->variant.timestamp.pid); + bts_set(raw, 
bts_clock, in->variant.event.clock); + bts_set(raw, bts_pid, in->variant.event.pid); break; default: return -EINVAL; @@ -1083,9 +1084,9 @@ static inline void ds_take_timestamp(struct ds_context *context, return; memset(&ts, 0, sizeof(ts)); - ts.qualifier = qualifier; - ts.variant.timestamp.jiffies = jiffies_64; - ts.variant.timestamp.pid = task->pid; + ts.qualifier = qualifier; + ts.variant.event.clock = trace_clock_global(); + ts.variant.event.pid = task->pid; bts_write(tracer, &ts); } From 35bb7600c17762bb129588c1877d2717fe325289 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:39 +0200 Subject: [PATCH 071/900] x86, debugctlmsr: add _on_cpu variants to debugctlmsr functions Add functions to get and set the debugctlmsr on different cpus. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144554.738772000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2483807e06e..1efeb497f1f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -785,6 +785,21 @@ static inline unsigned long get_debugctlmsr(void) return debugctlmsr; } +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR @@ -794,6 +809,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +static inline void update_debugctlmsr_on_cpu(int cpu, 
+ unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:40 +0200 Subject: [PATCH 072/900] x86, bts, hw-branch-tracer: add _noirq variants to the debug store interface The hw-branch-tracer uses debug store functions from an on_each_cpu() context, which is simply wrong since the functions may sleep. Add _noirq variants for most functions, which may be called with interrupts disabled. Separate per-cpu and per-task tracing and allow per-cpu tracing to be controlled from any cpu. Make the hw-branch-tracer use the new debug store interface, synchronize with hotplug cpu event using get/put_online_cpus(), and remove the unnecessary spinlock. Make the ptrace bts and the ds selftest code use the new interface. Defer the ds selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144555.658136000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 57 +++- arch/x86/kernel/ds.c | 476 ++++++++++++++++++++++++------- arch/x86/kernel/ds_selftest.c | 9 +- arch/x86/kernel/ptrace.c | 5 +- kernel/trace/trace_hw_branches.c | 191 +++++-------- 5 files changed, 492 insertions(+), 246 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 772f141afb9..413e127e567 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -15,8 +15,8 @@ * - buffer allocation (memory accounting) * * - * Copyright (C) 2007-2008 Intel Corporation. - * Markus Metzger , 2007-2008 + * Copyright (C) 2007-2009 Intel Corporation. 
+ * Markus Metzger , 2007-2009 */ #ifndef _ASM_X86_DS_H @@ -83,8 +83,10 @@ enum ds_feature { * The interrupt threshold is independent from the overflow callback * to allow users to use their own overflow interrupt handling mechanism. * - * task: the task to request recording for; - * NULL for per-cpu recording on the current cpu + * The function might sleep. + * + * task: the task to request recording for + * cpu: the cpu to request recording for * base: the base pointer for the (non-pageable) buffer; * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; @@ -93,19 +95,28 @@ enum ds_feature { * -1 if no interrupt threshold is requested. * flags: a bit-mask of the above flags */ -extern struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_cpu(int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); /* * Release BTS or PEBS resources * Suspend and resume BTS or PEBS tracing * + * Must be called with irq's enabled. 
+ * * tracer: the tracer handle returned from ds_request_~() */ extern void ds_release_bts(struct bts_tracer *tracer); @@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer); extern void ds_suspend_pebs(struct pebs_tracer *tracer); extern void ds_resume_pebs(struct pebs_tracer *tracer); +/* + * Release BTS or PEBS resources + * Suspend and resume BTS or PEBS tracing + * + * Cpu tracers must call this on the traced cpu. + * Task tracers must call ds_release_~_noirq() for themselves. + * + * May be called with irq's disabled. + * + * Returns 0 if successful; + * -EPERM if the cpu tracer does not trace the current cpu. + * -EPERM if the task tracer does not trace itself. + * + * tracer: the tracer handle returned from ds_request_~() + */ +extern int ds_release_bts_noirq(struct bts_tracer *tracer); +extern int ds_suspend_bts_noirq(struct bts_tracer *tracer); +extern int ds_resume_bts_noirq(struct bts_tracer *tracer); +extern int ds_release_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer); + /* * The raw DS buffer state as it is used for BTS and PEBS recording. 
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 2071b992c35..21a3852abf6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -245,60 +245,50 @@ struct ds_context { struct pebs_tracer *pebs_master; /* Use count: */ - unsigned long count; + unsigned long count; /* Pointer to the context pointer field: */ struct ds_context **this; - /* The traced task; NULL for current cpu: */ + /* The traced task; NULL for cpu tracing: */ struct task_struct *task; + + /* The traced cpu; only valid if task is NULL: */ + int cpu; }; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); - -#define system_context per_cpu(system_context_array, smp_processor_id()) +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &system_context); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; - unsigned long irq; - /* - * Chances are small that we already have a context. - * - * Contexts for per-cpu tracing are allocated using - * smp_call_function(). We must not sleep. - */ - new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); + /* Chances are small that we already have a context. 
*/ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); if (!new_context) return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irq(&ds_lock); context = *p_context; - if (!context) { + if (likely(!context)) { context = new_context; context->this = p_context; context->task = task; + context->cpu = cpu; context->count = 0; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - *p_context = context; } context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); if (context != new_context) kfree(new_context); @@ -306,7 +296,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) return context; } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context) { struct task_struct *task; unsigned long irq; @@ -328,8 +318,15 @@ static inline void ds_put_context(struct ds_context *context) if (task) clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, 0); + /* + * We leave the (now dangling) pointer to the DS configuration in + * the DS_AREA msr. This is as good or as bad as replacing it with + * NULL - the hardware would crash if we enabled tracing. + * + * This saves us some problems with having to write an msr on a + * different cpu while preventing others from doing the same for the + * next context for that same cpu. + */ spin_unlock_irqrestore(&ds_lock, irq); @@ -340,6 +337,31 @@ static inline void ds_put_context(struct ds_context *context) kfree(context); } +static void ds_install_ds_area(struct ds_context *context) +{ + unsigned long ds; + + ds = (unsigned long)context->ds; + + /* + * There is a race between the bts master and the pebs master. + * + * The thread/cpu access is synchronized via get/put_cpu() for + * task tracing and via wrmsr_on_cpu for cpu tracing. 
+ * + * If bts and pebs are collected for the same task or same cpu, + * the same confiuration is written twice. + */ + if (context->task) { + get_cpu(); + if (context->task == current) + wrmsrl(MSR_IA32_DS_AREA, ds); + set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + put_cpu(); + } else + wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, + (u32)((u64)ds), (u32)((u64)ds >> 32)); +} /* * Call the tracer's callback on a buffer overflow. @@ -622,6 +644,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ + ith *= ds_cfg.sizeof_rec[qual]; trace->ith = (void *)(buffer + size - ith); trace->flags = flags; @@ -630,7 +653,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, enum ds_qualifier qual, struct task_struct *task, - void *base, size_t size, size_t th, unsigned int flags) + int cpu, void *base, size_t size, size_t th) { struct ds_context *context; int error; @@ -643,7 +666,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We require some space to do alignment adjustments below. */ + /* We need space for alignment adjustments in ds_init_ds_trace(). */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -660,25 +683,27 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, tracer->size = size; error = -ENOMEM; - context = ds_get_context(task); + context = ds_get_context(task, cpu); if (!context) goto out; tracer->context = context; - ds_init_ds_trace(trace, qual, base, size, th, flags); + /* + * Defer any tracer-specific initialization work for the context until + * context ownership has been clarified. 
+ */ error = 0; out: return error; } -struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -690,42 +715,46 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, base, size, th, flags); + ds_bts, task, cpu, base, size, th); if (error < 0) goto out_tracer; - - spin_lock_irqsave(&ds_lock, irq); + /* Claim the bts part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->bts_master) goto out_unlock; tracer->ds.context->bts_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the bts part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_install_ds_area(tracer->ds.context); tracer->trace.read = bts_read; tracer->trace.write = bts_write; - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + /* Start tracing. 
*/ ds_resume_bts(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -735,13 +764,27 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return ERR_PTR(error); } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -753,37 +796,43 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, base, size, th, flags); + ds_pebs, task, cpu, base, size, th); if (error < 0) goto out_tracer; - spin_lock_irqsave(&ds_lock, irq); + /* Claim the pebs part of the tracing context we acquired above. 
*/ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->pebs_master) goto out_unlock; tracer->ds.context->pebs_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the pebs part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + ds_install_ds_area(tracer->ds.context); + + /* Start tracing. */ ds_resume_pebs(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -793,17 +842,27 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return ERR_PTR(error); } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} + +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} + +static void ds_free_bts(struct bts_tracer *tracer) { struct task_struct *task; - if (!tracer) - return; - task = tracer->ds.context->task; - ds_suspend_bts(tracer); - WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; @@ -817,9 +876,69 @@ void ds_release_bts(struct bts_tracer *tracer) kfree(tracer); } +void ds_release_bts(struct bts_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_bts(tracer); + ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + 
local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_bts_noirq(tracer); + ds_free_bts(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, + unsigned long debugctlmsr) +{ + task->thread.debugctlmsr = debugctlmsr; + + get_cpu(); + if (task == current) + update_debugctlmsr(debugctlmsr); + + if (task->thread.debugctlmsr) + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + put_cpu(); +} + void ds_suspend_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; @@ -827,29 +946,60 @@ void ds_suspend_bts(struct bts_tracer *tracer) tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + WARN_ON(!task && irqs_disabled()); - if (task) { - task->thread.debugctlmsr &= ~BTS_CONTROL; + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr &= ~BTS_CONTROL; - if (!task->thread.debugctlmsr) - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer) { struct task_struct *task; - unsigned long control; + unsigned long debugctlmsr, irq; + int cpu, error = 0; if (!tracer) - return; + return 0; - tracer->flags = tracer->trace.ds.flags; + tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? 
+ task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr &= ~BTS_CONTROL; + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ + unsigned long control; control = ds_cfg.ctl[dsf_bts]; if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -857,25 +1007,77 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!(tracer->trace.ds.flags & BTS_USER)) control |= ds_cfg.ctl[dsf_bts_user]; - if (task) { - task->thread.debugctlmsr |= control; - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } - - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() | control); + return control; } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; - task = tracer->ds.context->task; + tracer->flags = tracer->trace.ds.flags; - ds_suspend_pebs(tracer); + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + WARN_ON(!task && irqs_disabled()); + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long debugctlmsr, irq; + int cpu, error = 0; + + if (!tracer) + return 0; + + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? 
+ task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; @@ -886,16 +1088,68 @@ void ds_release_pebs(struct pebs_tracer *tracer) kfree(tracer); } +void ds_release_pebs(struct pebs_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_pebs(tracer); + ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_pebs_noirq(tracer); + ds_free_pebs(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + void ds_suspend_pebs(struct pebs_tracer *tracer) { } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + void ds_resume_pebs(struct pebs_tracer *tracer) { } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) @@ -1004,26 +1258,6 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } - if (ds_cfg.sizeof_rec[ds_bts]) { - int error; - - error = ds_selftest_bts(); - if (error) { - WARN(1, "[ds] selftest failed. disabling bts.\n"); - ds_cfg.sizeof_rec[ds_bts] = 0; - } - } - - if (ds_cfg.sizeof_rec[ds_pebs]) { - int error; - - error = ds_selftest_pebs(); - if (error) { - WARN(1, "[ds] selftest failed. 
disabling pebs.\n"); - ds_cfg.sizeof_rec[ds_pebs] = 0; - } - } - printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", @@ -1127,3 +1361,29 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { } + +static __init int ds_selftest(void) +{ + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + + return 0; +} +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 8c46fbf38c4..e5a263c8a14 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -10,11 +10,12 @@ #include #include +#include #include -#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. 
*/ static int ds_selftest_bts_consistency(const struct bts_trace *trace) @@ -125,12 +126,12 @@ int ds_selftest_bts(void) struct bts_tracer *tracer; int error = 0; void *top; - unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE]; printk(KERN_INFO "[ds] bts selftest..."); - tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(tracer)) { error = PTR_ERR(tracer); tracer = NULL; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7c21d1e8cae..adbb24322d8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -800,8 +800,9 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - context->tracer = ds_request_bts(child, context->buffer, context->size, - NULL, (size_t)-1, flags); + context->tracer = + ds_request_bts_task(child, context->buffer, context->size, + NULL, (size_t)-1, flags); if (unlikely(IS_ERR(context->tracer))) { int error = PTR_ERR(context->tracer); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 8b2109a6c61..50565d8cd2e 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -4,7 +4,6 @@ * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 */ -#include #include #include #include @@ -21,168 +20,113 @@ #define BTS_BUFFER_SIZE (1 << 13) -/* - * The tracer lock protects the below per-cpu tracer array. 
- * It needs to be held to: - * - start tracing on all cpus - * - stop tracing on all cpus - * - start tracing on a single hotplug cpu - * - stop tracing on a single hotplug cpu - * - read the trace from all cpus - * - read the trace from a single cpu - */ -static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; -/* - * Initialize the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_init_cpu(void *arg) +static void bts_trace_init_cpu(int cpu) { - if (this_tracer) - ds_release_bts(this_tracer); + per_cpu(tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); - this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } + if (IS_ERR(per_cpu(tracer, cpu))) + per_cpu(tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) { - int cpu, avail; - - spin_lock(&bts_tracer_lock); + int cpu; hw_branch_trace = tr; + trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_init_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) { + bts_trace_init_cpu(cpu); - /* Check on how many cpus we could enable tracing */ - avail = 0; - for_each_online_cpu(cpu) - if (per_cpu(tracer, cpu)) - avail++; - - trace_hw_branches_enabled = (avail ? 
1 : 0); + if (likely(per_cpu(tracer, cpu))) + trace_hw_branches_enabled = 1; + } trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); - + put_online_cpus(); /* If we could not enable tracing on a single cpu, we fail. */ - return avail ? 0 : -EOPNOTSUPP; -} - -/* - * Release the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_release_cpu(void *arg) -{ - if (this_tracer) { - ds_release_bts(this_tracer); - this_tracer = NULL; - } + return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; } static void bts_trace_reset(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_release_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) { + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } + } trace_hw_branches_enabled = 0; trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Resume tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_resume_cpu(void *arg) -{ - if (this_tracer) - ds_resume_bts(this_tracer); + put_online_cpus(); } static void bts_trace_start(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_resume_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Suspend tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. 
- */ -static void bts_trace_suspend_cpu(void *arg) -{ - if (this_tracer) - ds_suspend_bts(this_tracer); + put_online_cpus(); } static void bts_trace_stop(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 1; - - spin_unlock(&bts_tracer_lock); + put_online_cpus(); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; - - spin_lock(&bts_tracer_lock); - - if (!trace_hw_branches_enabled) - goto out; + int cpu = (long)hcpu; switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); + /* The notification is sent with interrupts enabled. */ + if (trace_hw_branches_enabled) { + bts_trace_init_cpu(cpu); - if (trace_hw_branches_suspended) - smp_call_function_single(cpu, bts_trace_suspend_cpu, - NULL, 1); + if (trace_hw_branches_suspended && + likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + } break; + case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); - break; + /* The notification is sent with interrupts enabled. */ + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } } - out: - spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. 
* - * pre: bts_tracer_lock must be locked + * pre: tracing must be suspended on the current cpu */ static void trace_bts_cpu(void *arg) { @@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg) if (unlikely(!this_tracer)) return; - ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) - goto out; + return; for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) @@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) trace_bts_at(trace, at); - -out: - ds_resume_bts(this_tracer); } static void trace_bts_prepare(struct trace_iterator *iter) { - spin_lock(&bts_tracer_lock); + int cpu; + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + /* + * We need to collect the trace on the respective cpu since ftrace + * implicitly adds the record for the current cpu. + * Once that is more flexible, we could collect the data from any cpu. + */ on_each_cpu(trace_bts_cpu, iter->tr, 1); - spin_unlock(&bts_tracer_lock); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); + put_online_cpus(); } static void trace_bts_close(struct trace_iterator *iter) @@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - spin_lock(&bts_tracer_lock); - - if (trace_hw_branches_enabled) + if (this_tracer) { + ds_suspend_bts_noirq(this_tracer); trace_bts_cpu(hw_branch_trace); - - spin_unlock(&bts_tracer_lock); + ds_resume_bts_noirq(this_tracer); + } } struct tracer bts_tracer __read_mostly = From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:41 +0200 Subject: [PATCH 073/900] x86, hw-branch-tracer: allocate selftest iterator on heap Allocate the trace_iterator for the hw-branch-tracer selftest on the heap. 
Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144556.578777000@intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 499d01c44cd..00dd6485bdd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -757,7 +757,7 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - struct trace_iterator iter; + struct trace_iterator *iter; struct tracer tracer; unsigned long count; int ret; @@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace, * The hw-branch tracer needs to collect the trace from the various * cpu trace buffers - before tracing is stopped. */ - memset(&iter, 0, sizeof(iter)); + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + memcpy(&tracer, trace, sizeof(tracer)); - iter.trace = &tracer; - iter.tr = tr; - iter.pos = -1; - mutex_init(&iter.mutex); + iter->trace = &tracer; + iter->tr = tr; + iter->pos = -1; + mutex_init(&iter->mutex); - trace->open(&iter); + trace->open(iter); - mutex_destroy(&iter.mutex); + mutex_destroy(&iter->mutex); + kfree(iter); tracing_stop(); From 353afeea24cc51aafc0ff21a72ec740b6f0af50c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:42 +0200 Subject: [PATCH 074/900] x86, ds: fix compiler warning size_t is defined differently on i386 and x86_64. Change the type of the index variable to unsigned long to avoid a compiler warning.
Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144557.523964000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e5a263c8a14..e1ba5101b57 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -87,7 +87,7 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, /* Now to the test itself. */ for (at = from; (void *)at < to; at += trace->ds.size) { struct bts_struct bts; - size_t index; + unsigned long index; int error; if (((void *)at - trace->ds.begin) % trace->ds.size) { From 84f201139245c30777ff858e71b8d7e134b8c3ed Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:43 +0200 Subject: [PATCH 075/900] x86, ds: fix bounds check in ds selftest Fix a bad bounds check in the debug store selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144558.450027000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e1ba5101b57..cccc19a38f6 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -47,8 +47,13 @@ static int ds_selftest_bts_consistency(const struct bts_trace *trace) printk(KERN_CONT "bad bts buffer setup..."); error = -1; } + /* + * We allow top in [begin; end], since it's not clear when the + * overflow adjustment happens: after the increment or before the + * write.
+ */ if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end <= trace->ds.top)) { + (trace->ds.end < trace->ds.top)) { printk(KERN_CONT "bts top out of bounds..."); error = -1; } From 01f6569ece6915616f6cae1d7d8b46ab8da9c1bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:44 +0200 Subject: [PATCH 076/900] x86, ds: selftest each cpu Perform debug store selftests on each cpu. Cover both the normal and the _noirq variant of the debug store interface. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144559.394583000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 182 +++++++++++++++++++++++++--------- 1 file changed, 135 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index cccc19a38f6..599a9630062 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -11,13 +11,21 @@ #include #include #include +#include #include -#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. 
*/ +struct ds_selftest_bts_conf { + struct bts_tracer *tracer; + int error; + int (*suspend)(struct bts_tracer *); + int (*resume)(struct bts_tracer *); +}; + static int ds_selftest_bts_consistency(const struct bts_trace *trace) { int error = 0; @@ -125,36 +133,32 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, return 0; } -int ds_selftest_bts(void) +static void ds_selftest_bts_cpu(void *arg) { + struct ds_selftest_bts_conf *conf = arg; const struct bts_trace *trace; - struct bts_tracer *tracer; - int error = 0; void *top; - unsigned char buffer[BUFFER_SIZE]; - printk(KERN_INFO "[ds] bts selftest..."); - - tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(tracer)) { - error = PTR_ERR(tracer); - tracer = NULL; + if (IS_ERR(conf->tracer)) { + conf->error = PTR_ERR(conf->tracer); + conf->tracer = NULL; printk(KERN_CONT - "initialization failed (err: %d)...", error); - goto out; + "initialization failed (err: %d)...", conf->error); + return; } - /* The return should already give us enough trace. */ - ds_suspend_bts(tracer); + /* We should meanwhile have enough trace. */ + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; /* Let's see if we can access the trace. */ - trace = ds_read_bts(tracer); + trace = ds_read_bts(conf->tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; /* If everything went well, we should have a few trace entries. */ if (trace->ds.top == trace->ds.begin) { @@ -168,10 +172,11 @@ int ds_selftest_bts(void) } /* Let's try to read the trace we collected. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.top); - if (error < 0) - goto out; + if (conf->error < 0) + return; /* * Let's read the trace again. 
@@ -179,26 +184,31 @@ int ds_selftest_bts(void) */ top = trace->ds.top; - trace = ds_read_bts(tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + trace = ds_read_bts(conf->tracer); + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (top != trace->ds.top) { printk(KERN_CONT "suspend not working..."); - error = -1; - goto out; + conf->error = -1; + return; } /* Let's collect some more trace - see if resume is working. */ - ds_resume_bts(tracer); - ds_suspend_bts(tracer); + conf->error = conf->resume(conf->tracer); + if (conf->error < 0) + return; - trace = ds_read_bts(tracer); + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + trace = ds_read_bts(conf->tracer); + + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (trace->ds.top == top) { /* @@ -210,35 +220,113 @@ int ds_selftest_bts(void) printk(KERN_CONT "no resume progress/overflow..."); - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else if (trace->ds.top < top) { /* * We had a buffer overflow - the entire buffer should * contain trace records. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else { /* * It is quite likely that the buffer did not overflow. * Let's just check the delta trace. */ - error = ds_selftest_bts_read(tracer, trace, - top, trace->ds.top); + conf->error = + ds_selftest_bts_read(conf->tracer, trace, top, + trace->ds.top); } - if (error < 0) - goto out; + if (conf->error < 0) + return; - error = 0; + conf->error = 0; +} - /* The final test: release the tracer while tracing is suspended. 
*/ +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ + ds_suspend_bts(tracer); + return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ + ds_resume_bts(tracer); + return 0; +} + +static void ds_release_bts_noirq_wrap(void *tracer) +{ + (void)ds_release_bts_noirq(tracer); +} + +static int ds_selftest_bts_bad_release_noirq(int cpu, + struct bts_tracer *tracer) +{ + int error = -EPERM; + + /* Try to release the tracer on the wrong cpu. */ + get_cpu(); + if (cpu != smp_processor_id()) { + error = ds_release_bts_noirq(tracer); + if (error != -EPERM) + printk(KERN_CONT "release on wrong cpu..."); + } + put_cpu(); + + return error ? 0 : -1; +} + +int ds_selftest_bts(void) +{ + struct ds_selftest_bts_conf conf; + unsigned char buffer[BUFFER_SIZE]; + int cpu; + + printk(KERN_INFO "[ds] bts selftest..."); + conf.error = 0; + + get_online_cpus(); + for_each_online_cpu(cpu) { + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); + if (conf.error >= 0) { + conf.error = + ds_selftest_bts_bad_release_noirq(cpu, + conf.tracer); + /* We must not release the tracer twice. */ + if (conf.error < 0) + conf.tracer = NULL; + } + smp_call_function_single(cpu, ds_release_bts_noirq_wrap, + conf.tracer, 1); + if (conf.error < 0) + goto out; + } + + conf.error = 0; out: - ds_release_bts(tracer); + put_online_cpus(); + printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); - printk(KERN_CONT "%s.\n", (error ? 
"failed" : "passed")); - - return error; + return conf.error; } int ds_selftest_pebs(void) From 3a68eef945b234f286406d96dc690fe17863c203 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:45 +0200 Subject: [PATCH 077/900] x86, ds: add task tracing selftest Add selftests to cover per-task branch tracing. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144600.329346000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 71 +++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 599a9630062..a40b2533c71 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -280,10 +280,51 @@ static int ds_selftest_bts_bad_release_noirq(int cpu, return error ? 0 : -1; } +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + ds_release_bts(tracer); + error = 0; + } + + if (error != -EPERM) + printk(KERN_CONT "cpu/task tracing overlap..."); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request task tracing while cpu tracing is active. */ + tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + error = 0; + ds_release_bts(tracer); + } + + if (error != -EPERM) + printk(KERN_CONT "task/cpu tracing overlap..."); + + return error ?
0 : -1; +} + int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; unsigned char buffer[BUFFER_SIZE]; + unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); @@ -297,6 +338,8 @@ int ds_selftest_bts(void) ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); ds_release_bts(conf.tracer); if (conf.error < 0) goto out; @@ -315,12 +358,40 @@ int ds_selftest_bts(void) if (conf.error < 0) conf.tracer = NULL; } + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); smp_call_function_single(cpu, ds_release_bts_noirq_wrap, conf.tracer, 1); if (conf.error < 0) goto out; } + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + local_irq_save(irq); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts_noirq(conf.tracer); + local_irq_restore(irq); + if (conf.error < 0) + goto out; + conf.error = 0; out: put_online_cpus(); From 2311f0de21c17b2a8b960677a9cccfbfa52beb35 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:46 +0200 Subject: [PATCH 078/900] x86, ds: add leakage warning Add a warning in case a debug store context is not removed before the task it is attached to is freed. Remove the old warning at thread exit. It is too early. Declare the debug store context field in thread_struct unconditionally. 
Remove ds_copy_thread() and ds_exit_thread() and do the work directly in process*.c. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144601.254472000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 --------- arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/ds.c | 10 ---------- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/process_32.c | 3 ++- arch/x86/kernel/process_64.c | 3 ++- 6 files changed, 8 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 413e127e567..149e5208e96 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -285,21 +285,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); -/* - * Task clone/init and cleanup work - */ -extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); -extern void ds_exit_thread(struct task_struct *tsk); - #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} -static inline void ds_copy_thread(struct task_struct *tsk, - struct task_struct *father) {} -static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1efeb497f1f..7c39de7e709 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -454,10 +454,8 @@ struct thread_struct { unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. 
*/ unsigned long debugctlmsr; -#ifdef CONFIG_X86_DS -/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + /* Debug Store context; see asm/ds.h */ struct ds_context *ds_ctx; -#endif /* CONFIG_X86_DS */ }; static inline unsigned long native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 21a3852abf6..71cab3b62dc 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1352,16 +1352,6 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(debugctlmsr); } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) -{ - clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); - tsk->thread.ds_ctx = NULL; -} - -void ds_exit_thread(struct task_struct *tsk) -{ -} - static __init int ds_selftest(void) { if (ds_cfg.sizeof_rec[ds_bts]) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..fb5dfb891f0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -45,6 +46,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -83,8 +86,6 @@ void exit_thread(void) put_cpu(); kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..b5e4bfef447 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,7 +290,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; diff --git 
a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..5a1a1de292e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -335,7 +335,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, goto out; } - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; From ee811517a5604aa63fae803b7c044712699e1303 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:47 +0200 Subject: [PATCH 079/900] x86, ds: use single debug store cpu configuration Use a single configuration for all cpus. Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144602.191165000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 71cab3b62dc..443f415441d 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -47,9 +47,8 @@ struct ds_configuration { /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) /* Maximal size of a DS configuration: */ #define MAX_SIZEOF_DS (12 * 8) @@ -1268,6 +1267,10 @@ ds_configure(const struct ds_configuration *cfg, void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) { + /* Only configure the first cpu. Others are identical. 
*/ + if (ds_cfg.name) + return; + switch (c->x86) { case 0x6: switch (c->x86_model) { From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: [PATCH 080/900] x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 9 ++++----- arch/x86/kernel/ptrace.c | 20 +------------------- include/linux/ptrace.h | 10 ---------- include/linux/sched.h | 2 -- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 ---------- 6 files changed, 7 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index e304b66abee..5cdd19f20b5 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,12 +235,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -extern void x86_ptrace_untrace(struct task_struct *); -extern void x86_ptrace_fork(struct task_struct *child, - unsigned long clone_flags); +#ifdef CONFIG_X86_PTRACE_BTS +extern void ptrace_bts_untrace(struct task_struct *tsk); -#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) -#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) +#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) +#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index adbb24322d8..b32a8ee5338 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ 
-887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static inline void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; -} - /* * Called from __ptrace_unlink() after the child has been moved back * to its original parent. */ -static inline void ptrace_bts_untrace(struct task_struct *child) +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { free_bts_context(child->bts); child->bts = NULL; } } -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 67c15653fc2..59e133d39d5 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_untrace(task) do { } while (0) #endif -#ifndef arch_ptrace_fork -/* - * Do machine-specific work to initialize a new task. - * - * This is called from copy_process(). 
- */ -#define arch_ptrace_fork(child, clone_flags) do { } while (0) -#endif - extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/include/linux/sched.h b/include/linux/sched.h index 52b8cd049c2..451186a22ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,13 +1205,11 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; -#ifdef CONFIG_X86_PTRACE_BTS /* * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ struct bts_context *bts; -#endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765b..69bde7a22e9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(current->ptrace)) - ptrace_fork(p, clone_flags); + + p->bts = NULL; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aaad0ec3419..321127d965c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -26,16 +26,6 @@ #include -/* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - arch_ptrace_fork(child, clone_flags); -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. From 6047550d3d26fed88b18a208b31f8b90b5ef3e9b Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:49 +0200 Subject: [PATCH 081/900] x86, ds: dont use TIF_DEBUGCTLMSR Debug store already uses TIF_DS_AREA_MSR to trigger debug store context switch handling. 
There is no need to use TIF_DEBUGCTLMSR as well. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144604.256645000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 443f415441d..cab28320dac 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -925,11 +925,6 @@ static void update_task_debugctlmsr(struct task_struct *task, get_cpu(); if (task == current) update_debugctlmsr(debugctlmsr); - - if (task->thread.debugctlmsr) - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); put_cpu(); } From 608780a9048efa3e85fbc4d8649b26805cc588aa Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:50 +0200 Subject: [PATCH 082/900] x86, ds: fix bad ds_reset_pebs() ds_reset_pebs() passed the wrong qualifier to a shared function, resulting in a reset of BTS rather than PEBS.
Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144605.206510000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index cab28320dac..ebfb0fde8e6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1186,7 +1186,7 @@ int ds_reset_pebs(struct pebs_tracer *tracer) tracer->trace.ds.top = tracer->trace.ds.begin; - ds_set(tracer->ds.context->ds, ds_bts, ds_index, + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, (unsigned long)tracer->trace.ds.top); return 0; From 150f5164c1258e05b7dea16f29e592f354c48f34 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:51 +0200 Subject: [PATCH 083/900] x86, ds: allow small debug store buffers Check the buffer size more precisely to allow buffers for exactly one element provided the base address is already properly aligned. Add a debug store selftest. 
Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144606.139137000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 9 +++++++-- arch/x86/kernel/ds_selftest.c | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ebfb0fde8e6..4e05157506a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -656,6 +656,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, { struct ds_context *context; int error; + size_t req_size; error = -EOPNOTSUPP; if (!ds_cfg.sizeof_rec[qual]) @@ -665,9 +666,13 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We need space for alignment adjustments in ds_init_ds_trace(). */ + req_size = ds_cfg.sizeof_rec[qual]; + /* We might need space for alignment adjustments. */ + if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) + req_size += DS_ALIGNMENT; + error = -EINVAL; - if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + if (size < req_size) goto out; if (th != (size_t)-1) { diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index a40b2533c71..5f104a0ace6 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -16,8 +16,8 @@ #include -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ - +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE 24 /* A single bts entry. 
*/ struct ds_selftest_bts_conf { struct bts_tracer *tracer; @@ -381,7 +381,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, + ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); From 017bc617657c928cb9a0c45a7a7e9f4e66695347 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:52 +0200 Subject: [PATCH 084/900] x86, ds: support Core i7 Add debug store support for Core i7. Core i7 adds a reset value for each performance counter and a new PEBS record format. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144607.088997000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 12 +++++-- arch/x86/kernel/ds.c | 69 +++++++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 149e5208e96..70dac199b09 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -234,8 +234,12 @@ struct bts_trace { struct pebs_trace { struct ds_trace ds; - /* the PEBS reset value */ - unsigned long long reset_value; + /* the number of valid counters in the below array */ + unsigned int counters; + +#define MAX_PEBS_COUNTERS 4 + /* the counter reset value */ + unsigned long long counter_reset[MAX_PEBS_COUNTERS]; }; @@ -270,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer); * Returns 0 on success; -Eerrno on error * * tracer: the tracer handle returned from ds_request_pebs() + * counter: the index of the counter * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, 
u64 value); /* * Initialization diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 4e05157506a..48bfe138603 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -44,6 +44,9 @@ struct ds_configuration { /* The size of a BTS/PEBS record in bytes: */ unsigned char sizeof_rec[2]; + /* The number of pebs counter reset values in the DS structure. */ + unsigned char nr_counter_reset; + /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; @@ -51,7 +54,7 @@ static struct ds_configuration ds_cfg __read_mostly; /* Maximal size of a DS configuration: */ -#define MAX_SIZEOF_DS (12 * 8) +#define MAX_SIZEOF_DS 0x80 /* Maximal size of a BTS record: */ #define MAX_SIZEOF_BTS (3 * 8) @@ -59,6 +62,12 @@ static struct ds_configuration ds_cfg __read_mostly; /* BTS and PEBS buffer alignment: */ #define DS_ALIGNMENT (1 << 3) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS 8 + +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE 8 + /* Mask of control bits in the DS MSR register: */ #define BTS_CONTROL \ ( ds_cfg.ctl[dsf_bts] | \ @@ -1164,9 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) return NULL; ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)); + + tracer->trace.counters = ds_cfg.nr_counter_reset; + memcpy(tracer->trace.counter_reset, + tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), + ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); return &tracer->trace; } @@ -1197,13 +1209,18 @@ int ds_reset_pebs(struct pebs_tracer *tracer) return 0; } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value) { if (!tracer) return -EINVAL; + if (ds_cfg.nr_counter_reset < counter) + return -EINVAL; + *(u64 *)(tracer->ds.context->ds + - 
(ds_cfg.sizeof_ptr_field * 8)) = value; + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + + (counter * PEBS_RESET_FIELD_SIZE)) = value; return 0; } @@ -1213,16 +1230,26 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { + .name = "Core i7", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 4, }; static void @@ -1239,6 +1266,32 @@ ds_configure(const struct ds_configuration *cfg, nr_pebs_fields = 18; #endif + /* + * Starting with version 2, architectural performance + * monitoring supports a format specifier. 
+ */ + if ((cpuid_eax(0xa) & 0xff) > 1) { + unsigned long perf_capabilities, format; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + format = (perf_capabilities >> 8) & 0xf; + + switch (format) { + case 0: + nr_pebs_fields = 18; + break; + case 1: + nr_pebs_fields = 22; + break; + default: + printk(KERN_INFO + "[ds] unknown PEBS format: %lu\n", format); + nr_pebs_fields = 0; + break; + } + } + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; @@ -1262,7 +1315,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); + WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -1284,6 +1337,8 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* Core i7 */ + ds_configure(&ds_cfg_core_i7, c); + break; default: /* Sorry, don't know about them. */ break; From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 14:55:44 +0800 Subject: [PATCH 085/900] tracing: use macros to denote usec and nsec per second Impact: cleanup Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000. 
Signed-off-by: Li Zefan LKML-Reference: <49CC7870.9000309@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_boot.c | 5 +++-- kernel/trace/trace_mmiotrace.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4c364..a29ef23ffb4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); call = &field->boot_call; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", (unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); init_ret = &field->boot_ret; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcddd8b..d53b45ed080 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@ #include #include #include +#include + #include #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long 
usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Mar 2009 00:25:38 +0100 Subject: [PATCH 086/900] tracing/ftrace: factorize the tracing files creation Impact: cleanup Most of the tracing files creation follow the same pattern: ret = debugfs_create_file(...) if (!ret) pr_warning("Couldn't create ... entry\n") Unify it! Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 ++------ kernel/trace/ring_buffer.c | 7 +- kernel/trace/trace.c | 143 +++++++++++------------------ kernel/trace/trace.h | 6 ++ kernel/trace/trace_event_profile.c | 1 - kernel/trace/trace_printk.c | 6 +- kernel/trace/trace_stack.c | 13 +-- kernel/trace/trace_sysprof.c | 6 +- 8 files changed, 78 insertions(+), 143 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 678e3d6caf8..6ea5a1ae6a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = { static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); + trace_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); 
- if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + trace_create_file("set_ftrace_notrace", 0644, d_tracer, NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = { static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void) ftrace_init_dyn_debugfs(d_tracer); - entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_pid' entry\n"); + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); ftrace_profile_debugfs(d_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c84..74a11808c28 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, 
&rb_simple_fops); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32653c8c6e2..0615751a3ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry, *d_cpu; + struct dentry *d_cpu; /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ char cpu_dir[7]; @@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - entry = debugfs_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe' entry\n"); + trace_create_file("trace_pipe", 0444, d_cpu, + (void *) cpu, &tracing_pipe_fops); /* per cpu trace */ - entry = debugfs_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("trace", 0644, d_cpu, + (void *) cpu, &tracing_fops); - entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); + trace_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = { .write = trace_options_core_write, }; +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + static struct dentry *trace_options_init_dentry(void) { struct dentry *d_tracer; @@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt, struct 
tracer_opt *opt) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) @@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt, topt->flags = flags; topt->opt = opt; - entry = debugfs_create_file(opt->name, 0644, t_options, topt, + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); - topt->entry = entry; - } static struct trace_option_dentry * @@ -3861,123 +3868,81 @@ static struct dentry * create_trace_option_core_file(const char *option, long index) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) return NULL; - entry = debugfs_create_file(option, 0644, t_options, (void *)index, + return trace_create_file(option, 0644, t_options, (void *)index, &trace_options_core_fops); - - return entry; } static __init void create_trace_options_dir(void) { struct dentry *t_options; - struct dentry *entry; int i; t_options = trace_options_init_dentry(); if (!t_options) return; - for (i = 0; trace_options[i]; i++) { - entry = create_trace_option_core_file(trace_options[i], i); - if (!entry) - pr_warning("Could not create debugfs %s entry\n", - trace_options[i]); - } + for (i = 0; trace_options[i]; i++) + create_trace_option_core_file(trace_options[i], i); } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; int cpu; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + trace_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); - entry = debugfs_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_options' entry\n"); + trace_create_file("trace_options", 0644, d_tracer, + NULL, 
&tracing_iter_fops); - create_trace_options_dir(); + trace_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + trace_create_file("trace", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - entry = debugfs_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'available_tracers' entry\n"); + trace_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'current_tracer' entry\n"); + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_thresh' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); + trace_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); - entry = 
debugfs_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("trace_pipe", 0444, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_pipe' entry\n"); - entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'buffer_size_kb' entry\n"); + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); - entry = debugfs_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_marker' entry\n"); + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); #ifdef CONFIG_DYNAMIC_FTRACE - entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_dyn_info_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + create_trace_options_dir(); + for_each_tracing_cpu(cpu) tracing_init_debugfs_percpu(cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 47aa6d0c97a..f76a8f8689d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -470,6 +470,12 @@ void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_event_profile.c 
b/kernel/trace/trace_event_profile.c index 22cba997077..199de9c7422 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id) return event->profile_disable(event); } } - diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556107f..9bece9687b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; - entry = debugfs_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, d_tracer, NULL, &ftrace_formats_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'printk_formats' entry\n"); return 0; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65f966..1796f00524e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); - entry = debugfs_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_trace' entry\n"); + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149..e04b76cc238 100644 
--- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = { void init_tracer_sysprof_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("sysprof_sample_period", 0644, + trace_create_file("sysprof_sample_period", 0644, d_tracer, NULL, &sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Apr 2009 15:24:12 -0400 Subject: [PATCH 087/900] function-graph: use int instead of atomic for ftrace_graph_active Impact: cleanup The variable ftrace_graph_active is only modified under the ftrace_lock mutex, thus an atomic is not necessary for modification. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6ea5a1ae6a9..8e6a0b5c994 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active; static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); /* we currently allow only one tracer registered at a time */ - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { ret = -EBUSY; goto out; } @@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); - atomic_inc(&ftrace_graph_active); + ftrace_graph_active++; ret = 
start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; goto out; } @@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); - if (!unlikely(atomic_read(&ftrace_graph_active))) + if (unlikely(!ftrace_graph_active)) goto out; - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; @@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 31 Mar 2009 15:26:14 +0800 Subject: [PATCH 088/900] ftrace: clean up enable logic for sched_switch Unify sched_switch and sched_wakeup's action to following logic: Do record_cmdline when start_cmdline_record() is called. Start tracing events when the tracer is started. 
Signed-off-by: Zhao Lei LKML-Reference: <49D1C596.5050203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea6f1a..9d8cccdfaa0 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref || sched_stopped) + if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - if (!tracer_enabled) + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) unsigned long flags; int cpu, pc; - if (!likely(tracer_enabled)) + if (unlikely(!sched_ref)) + return; + + tracing_record_cmdline(current); + + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); - tracing_record_cmdline(current); - - if (sched_stopped) - return; - local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; From 169aafbc8d3f05431b5cfeb60294a12b8ef2bcee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 7 Apr 2009 13:37:26 -0700 Subject: [PATCH 089/900] lguest: update lazy mmu changes to match lguest's use of kvm hypercalls Duplicate hcall -> kvm_hypercall0 conversion from "lguest: use KVM hypercalls". Signed-off-by: Jeremy Fitzhardinge Cc: Matias Zabaljauregui Cc: Rusty Russell --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5ab239711cc..cfb2d68dc79 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -168,7 +168,7 @@ static void lazy_hcall3(unsigned long call, * issue the do-nothing hypercall to flush any stored calls.
*/ static void lguest_leave_lazy_mmu_mode(void) { - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); + kvm_hypercall0(LHCALL_FLUSH_ASYNC); paravirt_leave_lazy_mmu(); } From 44bc9dc729e33a4ec6ebed4d0b6c08e8d20b42cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:47:17 +0200 Subject: [PATCH 090/900] mm, x86, ptrace, bts: defer branch trace stopping, cleanup Andrew Morton noticed that mm.h needlessly includes sched.h - remove it. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 64d8ed2538a..776b641f37e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,7 +13,6 @@ #include #include #include -#include struct mempolicy; struct anon_vma; From a34b50ddc265bae058c66661b096ef6384c5a8b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:56:54 +0200 Subject: [PATCH 091/900] mm, x86, ptrace, bts: defer branch trace stopping, remove dead code Remove the unused free_locked_buffer() API. 
Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - mm/mlock.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 776b641f37e..a3963ba23a6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1320,7 +1320,6 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); -extern void free_locked_buffer(void *buffer, size_t size); extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 749383b442c..28be15ead9c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -671,9 +671,3 @@ void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) up_write(&mm->mmap_sem); } - -void free_locked_buffer(void *buffer, size_t size) -{ - refund_locked_buffer_memory(current->mm, size); - kfree(buffer); -} From bab5bc9e857638880facef76e4b4c3fa807f8c73 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 7 Apr 2009 23:23:50 -0700 Subject: [PATCH 092/900] futex: fixup unlocked requeue pi case Thomas's testing caught a problem when the requeue target futex is unowned and multiple tasks are requeued to it. This patch ensures the FUTEX_WAITERS bit gets set if futex_requeue() will requeue one or more tasks in addition to the one acquiring the lock. 
Signed-off-by: Darren Hart Signed-off-by: Thomas Gleixner --- kernel/futex.c | 65 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 185c981d89e..041bf3ac4be 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -565,12 +565,14 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, /** * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex - * @uaddr: the pi futex user address - * @hb: the pi futex hash bucket - * @key: the futex key associated with uaddr and hb - * @ps: the pi_state pointer where we store the result of the lookup - * @task: the task to perform the atomic lock work for. This will be - * "current" except in the case of requeue pi. + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Returns: * 0 - ready to wait @@ -582,7 +584,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task) + struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; u32 uval, newval, curval; @@ -596,6 +598,8 @@ retry: * the locks. It will most likely not succeed. 
*/ newval = task_pid_vnr(task); + if (set_waiters) + newval |= FUTEX_WAITERS; curval = cmpxchg_futex_value_locked(uaddr, 0, newval); @@ -1004,14 +1008,18 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) /** * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. * * Returns: * 0 - failed to acquire the lock atomicly @@ -1022,15 +1030,23 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2, union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps) + struct futex_pi_state **ps, int set_waiters) { - struct futex_q *top_waiter; + struct futex_q *top_waiter = NULL; u32 curval; int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. 
If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ top_waiter = futex_top_waiter(hb1, key1); /* There are no waiters, nothing for us to do. */ @@ -1038,10 +1054,12 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, return 0; /* - * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. - * The pi_state is returned in ps in contended cases. + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. */ - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); if (ret == 1) requeue_pi_wake_futex(top_waiter, key2); @@ -1146,9 +1164,14 @@ retry_private: } if (requeue_pi && (task_count - nr_wake < nr_requeue)) { - /* Attempt to acquire uaddr2 and wake the top_waiter. */ + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state); + &key2, &pi_state, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -1810,7 +1833,7 @@ retry: retry_private: hb = queue_lock(&q); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current); + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { switch (ret) { case 1: From fcb2ac5bdfa3a7a04fb9749b916f64400f4c35a8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:31:58 +0200 Subject: [PATCH 093/900] x86_32: introduce restore_fpu_checking() Impact: cleanup, prepare FPU code unificaton Like on x86_64, return an error from restore_fpu and kill the task if it fails. 
Also rename restore_fpu to restore_fpu_checking which allows ifdefs to be removed in math_state_restore(). Signed-off-by: Jiri Slaby LKML-Reference: <1239190320-23952-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 9 ++++----- arch/x86/kernel/traps.c | 5 +---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 71c9e518398..09a2d6dfd85 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -185,12 +185,10 @@ static inline void tolerant_fwait(void) asm volatile("fnclex ; fwait"); } -static inline void restore_fpu(struct task_struct *tsk) +static inline int restore_fpu_checking(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_XSAVE) { - xrstor_checking(&tsk->thread.xstate->xsave); - return; - } + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); /* * The "nop" is needed to make the instructions the same * length. @@ -200,6 +198,7 @@ static inline void restore_fpu(struct task_struct *tsk) "fxrstor %1", X86_FEATURE_FXSR, "m" (tsk->thread.xstate->fxsave)); + return 0; } /* We need a safe address that is cheap to find and that is already diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..d696145855b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
*/ @@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } From 34ba476a01e128aad51e02f9be854584e9ec73cf Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:31:59 +0200 Subject: [PATCH 094/900] x86: unify restore_fpu_checking Impact: cleanup On x86_32, separate f*rstor to an inline function which makes restore_fpu_checking the same on both platforms -> move it outside the ifdefs. Signed-off-by: Jiri Slaby LKML-Reference: <1239190320-23952-2-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 09a2d6dfd85..7a6f21d95cf 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return err; } -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); - else - return fxrstor_checking(&tsk->thread.xstate->fxsave); -} - /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. The kernel data segment can be sometimes 0 and sometimes @@ -185,10 +177,9 @@ static inline void tolerant_fwait(void) asm volatile("fnclex ; fwait"); } -static inline int restore_fpu_checking(struct task_struct *tsk) +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); /* * The "nop" is needed to make the instructions the same * length. 
@@ -197,7 +188,8 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "nop ; frstor %1", "fxrstor %1", X86_FEATURE_FXSR, - "m" (tsk->thread.xstate->fxsave)); + "m" (*fx)); + return 0; } @@ -261,6 +253,14 @@ end: #endif /* CONFIG_X86_64 */ +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); + else + return fxrstor_checking(&tsk->thread.xstate->fxsave); +} + /* * Signal frame handlers... */ From 4ecf458492c2d97b3f9d850a5f92d79792e0a7e7 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:32:00 +0200 Subject: [PATCH 095/900] x86_64: fix incorrect comments Impact: cleanup The comment which fxrstor_checking and fxsave_user refer to is now in fxsave. Change the comments appropriately. Signed-off-by: Jiri Slaby Cc: Jiri Slaby LKML-Reference: <1239190320-23952-3-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 7a6f21d95cf..63d185087d9 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in __save_init_fpu() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "m" (*fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); @@ -112,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in __fxsave_clear() below. */ +#if 0 /* See comment in fxsave() below.
*/ : [fx] "r" (fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "0" (0)); From a59dacfdc9ba06903652fa4883bf1106278b18ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Oct 2008 14:38:08 +0200 Subject: [PATCH 096/900] x86 early quirks: eliminate unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup this warning: arch/x86/kernel/early-quirks.c:99: warning: ‘ati_ixp4x0_rev’ defined but not used triggers because ati_ixp4x0_rev() is only used in the ACPI && X86_IO_APIC case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953de..ebdb85cf268 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -96,6 +96,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { From cdc1cb0d4445f39561a65204d26f89365f917550 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 3 Apr 2009 17:15:14 -0700 Subject: [PATCH 097/900] x86: make wakeup_secondary_cpu_via_init static Impact: cleanup Signed-off-by: Yinghai Lu LKML-Reference: <49D6A692.6040400@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d..bddf2ccaf32 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -int __devinit +static int __devinit 
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; From 02421f98ec55c3ff118f358740ff640f096c7ad6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 3 Apr 2009 17:15:53 -0700 Subject: [PATCH 098/900] x86: consistent about warm_reset_vector for UV_NON_UNIQUE_APIC Impact: cleanup We didn't set the warm reset vector for UV_NON_UNIQUE_APIC, so don't restore it. Signed-off-by: Yinghai Lu LKML-Reference: <49D6A6B9.6060501@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bddf2ccaf32..bf8ad6344b1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -822,10 +822,12 @@ do_rest: /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } return boot_error; } From ceb5ac3264686e75e6951de6a18d4baa9bdecb92 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:15 -0500 Subject: [PATCH 099/900] swiotlb: comment corrections Impact: cleanup swiotlb_map/unmap_single are now swiotlb_map/unmap_page; trivially change all the comments to reference new names. Also, there were some comments that should have been referring to just plain old map_single, not swiotlb_map_single; fix those as well. Also change a use of the word "pointer", when what is referred to is actually a dma/physical address.
Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-2-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 2b0b5a7d2ce..170cf56af6a 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -60,8 +60,8 @@ enum dma_sync_target { int swiotlb_force; /* - * Used to do a quick range check in swiotlb_unmap_single and - * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * Used to do a quick range check in unmap_single and + * sync_single_*, to see if the memory was in fact allocated by this * API. */ static char *io_tlb_start, *io_tlb_end; @@ -560,7 +560,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, size)) { /* * The allocated memory isn't reachable by the device. - * Fall back on swiotlb_map_single(). */ free_pages((unsigned long) ret, order); ret = NULL; @@ -568,9 +567,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (!ret) { /* * We are either out of memory or the device can't DMA - * to GFP_DMA memory; fall back on - * swiotlb_map_single(), which will grab memory from - * the lowest available address range. + * to GFP_DMA memory; fall back on map_single(), which + * will grab memory from the lowest available address range. */ ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); if (!ret) @@ -634,7 +632,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) * physical address to use is returned. * * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. 
*/ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, @@ -648,7 +646,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, BUG_ON(dir == DMA_NONE); /* - * If the pointer passed in happens to be in the device's DMA window, + * If the address happens to be in the device's DMA window, * we can safely return the device addr and not worry about bounce * buffering it. */ @@ -679,7 +677,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_single call. All + * match what was provided for in a previous swiotlb_map_page call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see @@ -703,7 +701,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. * - * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * If you perform a swiotlb_map_page() but wish to interrogate the buffer * using the cpu, yet do not wish to teardown the dma mapping, you must * call this function before doing so. At the next point you give the dma * address back to the card, you must first perform a @@ -777,7 +775,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); /* * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_single + * This is the scatter-gather version of the above swiotlb_map_page * interface. Here the scatter gather list elements are each tagged with the * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). @@ -788,7 +786,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); * The routine returns the number of addr/length pairs actually * used, at most nents. 
* - * Device ownership issues as mentioned above for swiotlb_map_single are the + * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ int @@ -836,7 +834,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_single() above. + * concerning calls here are the same as for swiotlb_unmap_page() above. */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, From 67131ad0514d7105b55003a0506209cf1bba3f00 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:16 -0500 Subject: [PATCH 100/900] swiotlb: fix compile warning Squash a build warning seen on 32-bit powerpc caused by calling min() with 2 different types. Use min_t() instead. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-3-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 170cf56af6a..4fd6a76e728 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -341,7 +341,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, unsigned long flags; while (size) { - sz = min(PAGE_SIZE - offset, size); + sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn), From dd6b02fe427f30520d0adc94aa52352367227873 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:17 -0500 Subject: [PATCH 101/900] swiotlb: map_page fix for highmem systems The current code calls virt_to_phys() on address that might be in highmem, which is bad. This wasn't needed, anyway, because we already have the physical address we need. Get rid of the now-unused virtual address as well. 
Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-4-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4fd6a76e728..e8a47c8cf77 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -640,7 +640,6 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { phys_addr_t phys = page_to_phys(page) + offset; - void *ptr = page_address(page) + offset; dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); void *map; @@ -651,7 +650,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (!address_needs_mapping(dev, dev_addr, size) && - !range_needs_mapping(virt_to_phys(ptr), size)) + !range_needs_mapping(phys, size)) return dev_addr; /* From ef5722f698bde01cfec2b98fff733a48663ebf55 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:18 -0500 Subject: [PATCH 102/900] swiotlb: allow arch override of address_needs_mapping Some architectures require additional checking to determine if a device can dma to an address and need to provide their own address_needs_mapping.
Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-5-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index e8a47c8cf77..d81afab8516 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -145,6 +145,12 @@ static void *swiotlb_bus_to_virt(dma_addr_t address) return phys_to_virt(swiotlb_bus_to_phys(address)); } +int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, + dma_addr_t addr, size_t size) +{ + return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); +} + int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) { return 0; @@ -309,10 +315,10 @@ cleanup1: return -ENOMEM; } -static int +static inline int address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) { - return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); + return swiotlb_arch_address_needs_mapping(hwdev, addr, size); } static inline int range_needs_mapping(phys_addr_t paddr, size_t size) From 7fcebbd2d984eac3fdd6da2f4453e7c43d32de89 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:19 -0500 Subject: [PATCH 103/900] swiotlb: rename unmap_single to do_unmap_single Previously, swiotlb_unmap_page and swiotlb_unmap_sg were duplicating very similar code. Refactor that code into a new unmap_single, and have unmap_single use do_unmap_single. Note that the swiotlb_unmap_sg code was previously doing a complicated comparison to determine if an address needed to be unmapped where a simple is_swiotlb_buffer() call would have sufficed.
Signed-off-by: Becky Bruce Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-6-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d81afab8516..2bde54a40d8 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -482,7 +482,7 @@ found: * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ static void -unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -591,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dev_addr); /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); return NULL; } *dma_handle = dev_addr; @@ -608,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long) vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -688,17 +688,29 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. 
*/ -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) - unmap_single(hwdev, dma_addr, size, dir); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + + if (is_swiotlb_buffer(dma_addr)) { + do_unmap_single(hwdev, dma_addr, size, dir); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(dma_addr, size); +} + +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unmap_single(hwdev, dev_addr, size, dir); } EXPORT_SYMBOL_GPL(swiotlb_unmap_page); @@ -850,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) - unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), - sg->dma_length, dir); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); - } + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); From 380d687833aee098c4a2c3b35beaefe1c1f48d01 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:20 -0500 Subject: [PATCH 104/900] swiotlb: use swiotlb_sync_single instead of duplicating code Right now both swiotlb_sync_single_range and swiotlb_sync_sg were duplicating the code in swiotlb_sync_single. Just call it instead. Also rearrange the sync_single code for readability. 
Note that the swiotlb_sync_sg code was previously doing a complicated comparison to determine if an address needed to be unmapped where a simple is_swiotlb_buffer() call would have sufficed. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-7-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 2bde54a40d8..d912f068145 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -731,10 +731,16 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) + + if (is_swiotlb_buffer(dma_addr)) { sync_single(hwdev, dma_addr, size, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(dma_addr, size); } void @@ -761,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset; - - BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) - sync_single(hwdev, dma_addr, size, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); } void @@ -890,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) - sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, sg->dma_length, dir, target); - else if (dir ==
DMA_FROM_DEVICE) - dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); - } } void From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:21 -0500 Subject: [PATCH 105/900] swiotlb: change swiotlb_bus_to[phys,virt] prototypes Add a hwdev argument that is needed on some architectures in order to access a per-device offset that is taken into account when producing a physical address (also needed to get from bus address to virtual address because the physical address is an intermediate step). Also make swiotlb_bus_to_virt weak so architectures can override it. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- include/linux/swiotlb.h | 3 ++- lib/swiotlb.c | 10 +++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 34f12e9996e..887388a1c57 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ac9ff54f7cb..cb1a6631b8f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); -extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); +extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, + dma_addr_t address); extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, 
size_t size); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d912f068145..bffe6d7ef9d 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } @@ -140,9 +140,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -static void *swiotlb_bus_to_virt(dma_addr_t address) +void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) { - return phys_to_virt(swiotlb_bus_to_phys(address)); + return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); } int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, @@ -691,7 +691,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); @@ -728,7 +728,7 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); From a4e94ef0dd391eae05bdeacd12b8da3510957a97 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 27 Mar 2009 17:07:05 +0800 Subject: [PATCH 106/900] printk: add support of hh length modifier for printk Impact: new feature, extend vsprintf format strings hh is used as length modifier for signed char or unsigned char. It is supported by glibc, we add kernel support now. 
Signed-off-by: Zhao Lei Acked-by: Lai Jiangshan Acked-by: Frederic Weisbecker Cc: torvalds@linux-foundation.org Cc: Steven Rostedt LKML-Reference: <49CC9739.30107@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7536acea135..b56f6d039d2 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -408,6 +408,8 @@ enum format_type { FORMAT_TYPE_LONG_LONG, FORMAT_TYPE_ULONG, FORMAT_TYPE_LONG, + FORMAT_TYPE_UBYTE, + FORMAT_TYPE_BYTE, FORMAT_TYPE_USHORT, FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, @@ -853,11 +855,15 @@ qualifier: spec->qualifier = -1; if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { - spec->qualifier = *fmt; - ++fmt; - if (spec->qualifier == 'l' && *fmt == 'l') { - spec->qualifier = 'L'; - ++fmt; + spec->qualifier = *fmt++; + if (unlikely(spec->qualifier == *fmt)) { + if (spec->qualifier == 'l') { + spec->qualifier = 'L'; + ++fmt; + } else if (spec->qualifier == 'h') { + spec->qualifier = 'H'; + ++fmt; + } } } @@ -919,6 +925,11 @@ qualifier: spec->type = FORMAT_TYPE_SIZE_T; } else if (spec->qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; + } else if (spec->qualifier == 'H') { + if (spec->flags & SIGN) + spec->type = FORMAT_TYPE_BYTE; + else + spec->type = FORMAT_TYPE_UBYTE; } else if (spec->qualifier == 'h') { if (spec->flags & SIGN) spec->type = FORMAT_TYPE_SHORT; @@ -1087,6 +1098,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + num = (unsigned char) va_arg(args, int); + break; + case FORMAT_TYPE_BYTE: + num = (signed char) va_arg(args, int); + break; case FORMAT_TYPE_USHORT: num = (unsigned short) va_arg(args, int); break; @@ -1363,6 +1380,10 @@ do { \ case FORMAT_TYPE_PTRDIFF: save_arg(ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + case 
FORMAT_TYPE_BYTE: + save_arg(char); + break; case FORMAT_TYPE_USHORT: case FORMAT_TYPE_SHORT: save_arg(short); @@ -1538,6 +1559,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) case FORMAT_TYPE_PTRDIFF: num = get_arg(ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + num = get_arg(unsigned char); + break; + case FORMAT_TYPE_BYTE: + num = get_arg(signed char); + break; case FORMAT_TYPE_USHORT: num = get_arg(unsigned short); break; From ae9e6bc9f74f8247cbca50a6a93c80e0d686fa19 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Apr 2009 13:19:54 +0900 Subject: [PATCH 107/900] percpu: don't put the first chunk in reverse-map rbtree Impact: both first chunks don't use rbtree, no functional change There can be two first chunks - reserved and dynamic with the former one being optional. Dynamic first chunk was linked on reverse-mapping rbtree while the reserved one was mapped manually using the start address and reserved offset limit. This patch makes both first chunks to be looked up manually without using the rbtree. This is to help getting rid of the rbtree. 
Signed-off-by: Tejun Heo Cc: Martin Schwidefsky Cc: rusty@rustcorp.com.au Cc: Paul Mundt Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: ralf@linux-mips.org Cc: davem@davemloft.net Cc: cooloney@kernel.org Cc: kyle@mcmartin.ca Cc: matthew@wil.cx Cc: grundler@parisc-linux.org Cc: takata@linux-m32r.org Cc: benh@kernel.crashing.org Cc: rth@twiddle.net Cc: ink@jurassic.park.msu.ru Cc: heiko.carstens@de.ibm.com Cc: Linus Torvalds Cc: Nick Piggin Cc: Christoph Lameter LKML-Reference: <49D43CEA.3040609@kernel.org> Signed-off-by: Ingo Molnar --- mm/percpu.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca1..bf1bf1f4a72 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -110,9 +110,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* optional reserved chunk, only accessible for reserved allocations */ +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +static struct pcpu_chunk *pcpu_first_chunk; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. The amount of + * reserved offset is in pcpu_reserved_chunk_limit. When reserved + * area doesn't exist, the following variables contain NULL and 0 + * respectively. + */ static struct pcpu_chunk *pcpu_reserved_chunk; -/* offset limit of the reserved chunk */ static int pcpu_reserved_chunk_limit; /* @@ -297,15 +309,16 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { + void *first_start = pcpu_first_chunk->vm->addr; struct rb_node *n, *parent; struct pcpu_chunk *chunk; - /* is it in the reserved chunk? 
*/ - if (pcpu_reserved_chunk) { - void *start = pcpu_reserved_chunk->vm->addr; - - if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + /* is it in the first chunk? */ + if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + /* is it in the reserved area? */ + if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; + return pcpu_first_chunk; } /* nah... search the regular ones */ @@ -1147,7 +1160,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (reserved_size) { schunk->free_size = reserved_size; - pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk_limit = static_size + reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ @@ -1158,8 +1172,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; - pcpu_reserved_chunk_limit = static_size + schunk->free_size; - /* init dynamic chunk if necessary */ if (dyn_size) { dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); @@ -1226,13 +1238,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - if (!dchunk) { - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); - } else { - pcpu_chunk_relocate(dchunk, -1); - pcpu_chunk_addr_insert(dchunk); - } + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); From e1b9aa3f47242e757c776a3771bb6613e675bf9c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 2 Apr 2009 13:21:44 +0900 Subject: [PATCH 108/900] percpu: remove rbtree and use page->index instead Impact: use page->index for addr to chunk mapping instead of dedicated rbtree The rbtree is used to determine the chunk from the virtual address. 
However, we can already determine the page struct from a virtual address and there are several unused fields in page struct used by vmalloc. Use the index field to store a pointer to the chunk. Then there is no need anymore for an rbtree. tj: * s/(set|get)_chunk/pcpu_\1_page_chunk/ * Drop inline from the above two functions and moved them upwards so that they are with other simple helpers. * Initial pages might not (actually most of the time don't) live in the vmalloc area. With the previous patch to manually reverse-map both first chunks, this is no longer an issue. Removed pcpu_set_chunk() call on initial pages. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo Cc: Martin Schwidefsky Cc: rusty@rustcorp.com.au Cc: Paul Mundt Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: ralf@linux-mips.org Cc: davem@davemloft.net Cc: cooloney@kernel.org Cc: kyle@mcmartin.ca Cc: matthew@wil.cx Cc: grundler@parisc-linux.org Cc: takata@linux-m32r.org Cc: benh@kernel.crashing.org Cc: rth@twiddle.net Cc: ink@jurassic.park.msu.ru Cc: heiko.carstens@de.ibm.com Cc: Linus Torvalds Cc: Nick Piggin LKML-Reference: <49D43D58.4050102@kernel.org> Signed-off-by: Ingo Molnar --- mm/percpu.c | 100 +++++++++++----------------------------------------- 1 file changed, 20 insertions(+), 80 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index bf1bf1f4a72..c0b2c1a76e8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -23,7 +23,7 @@ * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. + * percpu base registers pcpu_unit_size apart. * * There are usually many small percpu allocations many of them as * small as 4 bytes. The allocator organizes chunks into lists @@ -38,8 +38,8 @@ * region and negative allocated. 
Allocation inside a chunk is done * by scanning this map sequentially and serving the first matching * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. + * Chunks can be determined from the address using the index field + * in the page struct. The index field contains a pointer to the chunk. * * To use this allocator, arch code should do the followings. * @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -88,7 +87,6 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ struct vm_struct *vm; /* mapped vmalloc region */ @@ -133,7 +131,7 @@ static int pcpu_reserved_chunk_limit; * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former * protects allocation/reclaim paths, chunks and chunk->page arrays. * The latter is a spinlock and protects the index data structures - - * chunk slots, rbtree, chunks and area maps in chunks. + * chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. 
All actual memory @@ -152,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); @@ -203,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; } +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -269,40 +278,9 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } } -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - /** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * CONTEXT: - * pcpu_lock. 
+ * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. * * RETURNS: * The address of the found chunk. @@ -310,8 +288,6 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { void *first_start = pcpu_first_chunk->vm->addr; - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; /* is it in the first chunk? */ if (addr >= first_start && addr < first_start + pcpu_chunk_size) { @@ -321,42 +297,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } - /* nah... search the regular ones */ - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - * - * CONTEXT: - * pcpu_lock. 
- */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); + return pcpu_get_page_chunk(vmalloc_to_page(addr)); } /** @@ -768,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) alloc_mask, 0); if (!*pagep) goto err; + pcpu_set_page_chunk(*pagep, chunk); } } @@ -892,7 +834,6 @@ restart: spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); goto restart; area_found: @@ -981,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work) if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; - rb_erase(&chunk->rb_node, &pcpu_addr_root); list_move(&chunk->list, &todo); } From e85abf8f432bb2a13733ab7609fbb8e1500af51d Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Wed, 8 Apr 2009 14:07:25 -0700 Subject: [PATCH 109/900] x86: consolidate SMP code in io_apic.c Impact: Cleanup Reorganizes the code in arch/x86/kernel/io_apic.c by combining two '#ifdef CONFIG_SMP' regions. In addition to making the code easier to understand, the first '#ifdef CONFIG_SMP' region is moved to a location later in the file which will reduce the need for function forward declarations when the code is subsequently revised. The only changes other than relocating code to a different position in the file were the removal of the assign_irq_vector() forward declaration which was no longer needed and some line length reduction formatting changes.
Signed-off-by: Gary Hade Cc: lcm@us.ibm.com LKML-Reference: <20090408210725.GC11159@us.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 223 ++++++++++++++++----------------- 1 file changed, 109 insertions(+), 114 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e46d6..7c9d045ac83 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -518,120 +518,6 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. 
- */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. 
We are super @@ -2360,6 +2246,115 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void +__target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. 
+ */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + /* check that before desc->addinity get updated */ + set_extra_move_desc(desc, mask); + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP From d2de688891909b148efe83a6fc9520a9cd6015f0 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 9 Apr 2009 15:17:22 +1000 Subject: [PATCH 110/900] sparc64: extend TI_RESTART_BLOCK space by 8 bytes Impact: build fix Today's linux-next build (sparc64 defconfig) failed like this: arch/sparc/kernel/built-in.o: In function `trap_init': (.init.text+0x4): undefined reference to `thread_info_offsets_are_bolixed_dave' Caused by commit 52400ba946759af28442dee6265c5c0180ac7122 ("futex: add requeue_pi functionality") (from the tip-core tree) which changed the size of struct restart_block. Shift TI_KUNA_REGS and TI_KUNA_INSN up by 8 bytes to make space for the larger restart block. Signed-off-by: Stephen Rothwell Acked-by: "David S. 
Miller" Cc: Darren Hart LKML-Reference: <20090409151722.c8eabb56.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/thread_info_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 639ac805448..65865726b28 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -102,8 +102,8 @@ struct thread_info { #define TI_KERN_CNTD1 0x00000488 #define TI_PCR 0x00000490 #define TI_RESTART_BLOCK 0x00000498 -#define TI_KUNA_REGS 0x000004c0 -#define TI_KUNA_INSN 0x000004c8 +#define TI_KUNA_REGS 0x000004c8 +#define TI_KUNA_INSN 0x000004d0 #define TI_FPREGS 0x00000500 /* We embed this in the uppermost byte of thread_info->flags */ From 002f128b473fb82f454654be5081b0919ee01ab2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 8 Apr 2009 15:29:43 -0700 Subject: [PATCH 111/900] sched: remove redundant hierarchy walk in check_preempt_wakeup Impact: micro-optimization Under group scheduling we traverse up until we are at common siblings to make the wakeup comparison on. At this point however, they should have the same parent so continuing to check up the tree is redundant. 
Signed-off-by: Paul Turner Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f11..5f9650e8fe7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) From e7c064889606aab3569669078c69b87b2c527e72 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 7 Mar 2009 23:48:41 -0800 Subject: [PATCH 112/900] xen: add FIX_TEXT_POKE to fixmap FIX_TEXT_POKE[01] are used to map kernel addresses, so they're mapping pfns, not mfns. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 77b242c9a11..a96f5b9393e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1812,6 +1812,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_LOCAL_APIC case FIX_APIC_BASE: /* maps dummy local APIC */ #endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: + /* All local page mappings */ pte = pfn_pte(phys, prot); break; From 7a734e7dd93b9aea08ed51036a9a0e2c9dfd8dac Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:08:28 -0700 Subject: [PATCH 113/900] x86, setup: "glove box" BIOS calls -- infrastructure Impact: new interfaces (not yet used) For all the platforms out there, there is an infinite number of buggy BIOSes. 
This adds infrastructure to treat BIOS interrupts more like toxic waste and "glove box" them -- we switch out the register set, perform the BIOS interrupt, and then restore the previous state. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Pavel Machek Cc: Rafael J. Wysocki --- arch/x86/boot/Makefile | 5 +- arch/x86/boot/bioscall.S | 82 ++++++++++++++++++++++++ arch/x86/boot/boot.h | 48 ++++++++++++++ arch/x86/boot/header.S | 2 +- arch/x86/boot/regs.c | 29 +++++++++ arch/x86/boot/setup.ld | 6 ++ arch/x86/kernel/acpi/realmode/Makefile | 2 +- arch/x86/kernel/acpi/realmode/bioscall.S | 1 + arch/x86/kernel/acpi/realmode/regs.c | 1 + 9 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 arch/x86/boot/bioscall.S create mode 100644 arch/x86/boot/regs.c create mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S create mode 100644 arch/x86/kernel/acpi/realmode/regs.c diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6633b6e7505..658bc525cac 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o string.o tty.o video.o video-mode.o version.o +setup-y += printf.o regs.o string.o tty.o video.o video-mode.o +setup-y += version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S new file mode 100644 index 00000000000..22b4b3efb9f --- /dev/null +++ b/arch/x86/boot/bioscall.S @@ -0,0 +1,82 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. 
Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes + * touching memory they shouldn't be. + */ + + .code16 + .text + .globl intcall + .type intcall, @function +intcall: + /* Self-modify the INT instruction. Ugly, but works. */ + cmpb %al, 3f + je 1f + movb %al, 3f + jmp 1f /* Synchronize pipeline */ +1: + /* Save state */ + pushfl + pushw %fs + pushw %gs + pushal + + /* Copy input state to stack frame */ + subw $44, %sp + movw %dx, %si + movw %sp, %di + movw $11, %cx + rep; movsd + + /* Pop full state from the stack */ + popal + popw %gs + popw %fs + popw %es + popw %ds + popfl + + /* Actual INT */ + .byte 0xcd /* INT opcode */ +3: .byte 0 + + /* Push full state to the stack */ + pushfl + pushw %ds + pushw %es + pushw %fs + pushw %gs + pushal + + /* Re-establish C environment invariants */ + cld + movzwl %sp, %esp + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + + /* Copy output state from stack frame */ + movw 68(%esp), %di /* Original %cx == 3rd argument */ + andw %di, %di + jz 4f + movw %sp, %si + movw $11, %cx + rep; movsd +4: addw $44, %sp + + /* Restore state and return */ + popal + popw %gs + popw %fs + popfl + retl + .size intcall, .-intcall diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 7b2692e897e..98239d2658f 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. 
@@ -26,6 +27,7 @@ #include #include "bitops.h" #include +#include /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -241,6 +243,49 @@ int enable_a20(void); /* apm.c */ int query_apm_bios(void); +/* bioscall.c */ +struct biosregs { + union { + struct { + u32 edi; + u32 esi; + u32 ebp; + u32 _esp; + u32 ebx; + u32 edx; + u32 ecx; + u32 eax; + u32 _fsgs; + u32 _dses; + u32 eflags; + }; + struct { + u16 di, hdi; + u16 si, hsi; + u16 bp, hbp; + u16 _sp, _hsp; + u16 bx, hbx; + u16 dx, hdx; + u16 cx, hcx; + u16 ax, hax; + u16 gs, fs; + u16 es, ds; + u16 flags, hflags; + }; + struct { + u8 dil, dih, edi2, edi3; + u8 sil, sih, esi2, esi3; + u8 bpl, bph, ebp2, ebp3; + u8 _spl, _sph, _esp2, _esp3; + u8 bl, bh, ebx2, ebx3; + u8 dl, dh, edx2, edx3; + u8 cl, ch, ecx2, ecx3; + u8 al, ah, eax2, eax3; + }; + }; +}; +void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); + /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); @@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...); int vsprintf(char *buf, const char *fmt, va_list args); int printf(const char *fmt, ...); +/* regs.c */ +void initregs(struct biosregs *regs); + /* string.c */ int strcmp(const char *str1, const char *str2); size_t strnlen(const char *s, size_t maxlen); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 5d84d1c74e4..486d97fa7f4 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -221,7 +221,7 @@ setup_data: .quad 0 # 64-bit physical pointer to # End of setup header ##################################################### - .section ".inittext", "ax" + .section ".entrytext", "ax" start_of_setup: #ifdef SAFE_RESET_DISK_CONTROLLER # Reset the disk controller. 
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c new file mode 100644 index 00000000000..958019b1cfa --- /dev/null +++ b/arch/x86/boot/regs.c @@ -0,0 +1,29 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Simple helper function for initializing a register set. + * + * Note that this sets EFLAGS_CF in the input register set; this + * makes it easier to catch functions which do nothing but don't + * explicitly set CF. + */ + +#include "boot.h" + +void initregs(struct biosregs *reg) +{ + memset(reg, 0, sizeof *reg); + reg->eflags |= X86_EFLAGS_CF; + reg->ds = ds(); + reg->es = ds(); + reg->fs = fs(); + reg->gs = gs(); +} diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index bb8dc2de796..0f6ec455a2b 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -15,8 +15,11 @@ SECTIONS . = 497; .header : { *(.header) } + .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } .initdata : { *(.initdata) } + __end_init = .; + .text : { *(.text) } .text32 : { *(.text32) } @@ -52,4 +55,7 @@ SECTIONS . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); + /* Necessary for the very-old-loader check to work... */ + . 
= ASSERT(__end_init <= 5*512, "init sections too big!"); + } diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9de..167bc16ce0e 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 00000000000..f51eb0bb56c --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 00000000000..6206033ba20 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" From df7699c56421c0476704f24a43409ac8c505f3d2 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:13:46 -0700 Subject: [PATCH 114/900] x86, setup: "glove box" BIOS interrupts in the core boot code Impact: BIOS proofing "Glove box" off BIOS interrupts in the core boot code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin --- arch/x86/boot/a20.c | 9 +++-- arch/x86/boot/main.c | 39 ++++++++++---------- arch/x86/boot/memory.c | 81 +++++++++++++++++++++--------------------- arch/x86/boot/tty.c | 52 ++++++++++++++------------- 4 files changed, 95 insertions(+), 86 deletions(-) diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 7c19ce8c244..64a31a6d751 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -2,7 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. 
- All Rights Reserved - * Copyright 2009 Intel Corporation + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -90,8 +90,11 @@ static int a20_test_long(void) static void enable_a20_bios(void) { - asm volatile("pushfl; int $0x15; popfl" - : : "a" ((u16)0x2401)); + struct biosregs ireg; + + initregs(&ireg); + ireg.ax = 0x2401; + intcall(0x15, &ireg, NULL); } static void enable_a20_kbc(void) diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 58f0415d3ae..140172b895b 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -61,11 +62,10 @@ static void copy_boot_params(void) */ static void keyboard_set_repeat(void) { - u16 ax = 0x0305; - u16 bx = 0; - asm volatile("int $0x16" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + struct biosregs ireg; + initregs(&ireg); + ireg.ax = 0x0305; + intcall(0x16, &ireg, NULL); } /* @@ -73,18 +73,22 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { + struct biosregs ireg, oreg; + /* Some older BIOSes apparently crash on this call, so filter it from machines too old to have SpeedStep at all. 
*/ if (cpu.level < 6) return; - asm("int $0x15" - : "=a" (boot_params.ist_info.signature), - "=b" (boot_params.ist_info.command), - "=c" (boot_params.ist_info.event), - "=d" (boot_params.ist_info.perf_level) - : "a" (0x0000e980), /* IST Support */ - "d" (0x47534943)); /* Request value */ + initregs(&ireg); + ireg.ax = 0xe980; /* IST Support */ + ireg.edx = 0x47534943; /* Request value */ + intcall(0x15, &ireg, &oreg); + + boot_params.ist_info.signature = oreg.eax; + boot_params.ist_info.command = oreg.ebx; + boot_params.ist_info.event = oreg.ecx; + boot_params.ist_info.perf_level = oreg.edx; } /* @@ -93,13 +97,12 @@ static void query_ist(void) static void set_bios_mode(void) { #ifdef CONFIG_X86_64 - u32 eax, ebx; + struct biosregs ireg; - eax = 0xec00; - ebx = 2; - asm volatile("int $0x15" - : "+a" (eax), "+b" (ebx) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ax = 0xec00; + ireg.bx = 2; + intcall(0x15, &ireg, NULL); #endif } diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 5054c2ddd1a..d989de810ca 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -25,12 +25,16 @@ struct e820_ext_entry { static int detect_memory_e820(void) { int count = 0; - u32 next = 0; - u32 size, id, edi; - u8 err; + struct biosregs ireg, oreg; struct e820entry *desc = boot_params.e820_map; static struct e820_ext_entry buf; /* static so it is zeroed */ + initregs(&ireg); + ireg.ax = 0xe820; + ireg.cx = sizeof buf; + ireg.edx = SMAP; + ireg.di = (size_t)&buf; + /* * Set this here so that if the BIOS doesn't change this field * but still doesn't change %ecx, we're still okay... @@ -38,22 +42,13 @@ static int detect_memory_e820(void) buf.ext_flags = 1; do { - size = sizeof buf; - - /* Important: %edx and %esi are clobbered by some BIOSes, - so they must be either used for the error output - or explicitly marked clobbered. Given that, assume there - is something out there clobbering %ebp and %edi, too. 
*/ asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" - : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "+m" (buf) - : "D" (&buf), "d" (SMAP), "a" (0xe820) - : "esi"); + intcall(0x15, &ireg, &oreg); + ireg.ebx = oreg.ebx; /* for next iteration... */ /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on the final, failing, probe. */ - if (err) + if (oreg.eflags & X86_EFLAGS_CF) break; /* Some BIOSes stop returning SMAP in the middle of @@ -61,7 +56,7 @@ static int detect_memory_e820(void) screwed up the map at that point, we might have a partial map, the full map, or complete garbage, so just return failure. */ - if (id != SMAP) { + if (oreg.eax != SMAP) { count = 0; break; } @@ -69,58 +64,62 @@ static int detect_memory_e820(void) /* ACPI 3.0 added the extended flags support. If bit 0 in the extended flags is zero, we're supposed to simply ignore the entry -- a backwards incompatible change! */ - if (size > 20 && !(buf.ext_flags & 1)) + if (oreg.cx > 20 && !(buf.ext_flags & 1)) continue; *desc++ = buf.std; count++; - } while (next && count < ARRAY_SIZE(boot_params.e820_map)); + } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; } static int detect_memory_e801(void) { - u16 ax, bx, cx, dx; - u8 err; + struct biosregs ireg, oreg; - bx = cx = dx = 0; - ax = 0xe801; - asm("stc; int $0x15; setc %0" - : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx)); + initregs(&ireg); + ireg.ax = 0xe801; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* Do we really need to do this? */ - if (cx || dx) { - ax = cx; - bx = dx; + if (oreg.cx || oreg.dx) { + oreg.ax = oreg.cx; + oreg.bx = oreg.dx; } - if (ax > 15*1024) + if (oreg.ax > 15*1024) { return -1; /* Bogus! */ - - /* This ignores memory above 16MB if we have a memory hole - there. 
If someone actually finds a machine with a memory - hole at 16MB and no support for 0E820h they should probably - generate a fake e820 map. */ - boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; + } else if (oreg.ax == 15*1024) { + boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax; + } else { + /* + * This ignores memory above 16MB if we have a memory + * hole there. If someone actually finds a machine + * with a memory hole at 16MB and no support for + * 0E820h they should probably generate a fake e820 + * map. + */ + boot_params.alt_mem_k = oreg.ax; + } return 0; } static int detect_memory_88(void) { - u16 ax; - u8 err; + struct biosregs ireg, oreg; - ax = 0x8800; - asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); + initregs(&ireg); + ireg.ah = 0x88; + intcall(0x15, &ireg, &oreg); - boot_params.screen_info.ext_mem_k = ax; + boot_params.screen_info.ext_mem_k = oreg.ax; - return -err; + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } int detect_memory(void) diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 7e8e8b25f5f..01ec69c901c 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,24 +23,23 @@ void __attribute__((section(".inittext"))) putchar(int ch) { - unsigned char c = ch; + struct biosregs ireg; - if (c == '\n') + if (ch == '\n') putchar('\r'); /* \n -> \r\n */ - /* int $0x10 is known to have bugs involving touching registers - it shouldn't. Be extra conservative... 
*/ - asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" - : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); + initregs(&ireg); + ireg.bx = 0x0007; + ireg.cx = 0x0001; + ireg.ah = 0x0e; + ireg.al = ch; + intcall(0x10, &ireg, NULL); } void __attribute__((section(".inittext"))) puts(const char *str) { - int n = 0; - while (*str) { + while (*str) putchar(*str++); - n++; - } } /* @@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str) static u8 gettime(void) { - u16 ax = 0x0200; - u16 cx, dx; + struct biosregs ireg, oreg; - asm volatile("int $0x1a" - : "+a" (ax), "=c" (cx), "=d" (dx) - : : "ebx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; + intcall(0x1a, &ireg, &oreg); - return dx >> 8; + return oreg.dh; } /* @@ -64,19 +63,24 @@ static u8 gettime(void) */ int getchar(void) { - u16 ax = 0; - asm volatile("int $0x16" : "+a" (ax)); + struct biosregs ireg, oreg; - return ax & 0xff; + initregs(&ireg); + /* ireg.ah = 0x00; */ + intcall(0x16, &ireg, &oreg); + + return oreg.al; } static int kbd_pending(void) { - u8 pending; - asm volatile("int $0x16; setnz %0" - : "=qm" (pending) - : "a" (0x0100)); - return pending; + struct biosregs ireg, oreg; + + initregs(&ireg); + ireg.ah = 0x01; + intcall(0x16, &ireg, &oreg); + + return !(oreg.eflags & X86_EFLAGS_ZF); } void kbd_flush(void) From d54ea252e4c92357226992cf65d94616a96e6fce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:14:26 -0700 Subject: [PATCH 115/900] x86, setup: "glove box" BIOS interrupts in the APM code Impact: BIOS proofing "Glove box" off BIOS interrupts in the APM code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. 
Peter Anvin Cc: Stephen Rothwell --- arch/x86/boot/apm.c | 72 +++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index 7aa6033001f..ee274834ea8 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * Original APM BIOS checking by Stephen Rothwell, May 1994 * (sfr@canb.auug.org.au) @@ -19,75 +20,56 @@ int query_apm_bios(void) { - u16 ax, bx, cx, dx, di; - u32 ebx, esi; - u8 err; + struct biosregs ireg, oreg; /* APM BIOS installation check */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x53; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.flags & X86_EFLAGS_CF) return -1; /* No APM BIOS */ - if (bx != 0x504d) /* "PM" signature */ + if (oreg.bx != 0x504d) /* "PM" signature */ return -1; - if (!(cx & 0x02)) /* 32 bits supported? */ + if (!(oreg.cx & 0x02)) /* 32 bits supported? 
*/ return -1; /* Disconnect first, just in case */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - /* Paranoia */ - ebx = esi = 0; - cx = dx = di = 0; + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); /* 32-bit connect */ - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" - : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), - "+S" (esi), "+D" (di), "=m" (err) - : "a" (0x5303)); + ireg.al = 0x03; + intcall(0x15, &ireg, &oreg); - boot_params.apm_bios_info.cseg = ax; - boot_params.apm_bios_info.offset = ebx; - boot_params.apm_bios_info.cseg_16 = cx; - boot_params.apm_bios_info.dseg = dx; - boot_params.apm_bios_info.cseg_len = (u16)esi; - boot_params.apm_bios_info.cseg_16_len = esi >> 16; - boot_params.apm_bios_info.dseg_len = di; + boot_params.apm_bios_info.cseg = oreg.ax; + boot_params.apm_bios_info.offset = oreg.ebx; + boot_params.apm_bios_info.cseg_16 = oreg.cx; + boot_params.apm_bios_info.dseg = oreg.dx; + boot_params.apm_bios_info.cseg_len = oreg.si; + boot_params.apm_bios_info.cseg_16_len = oreg.hsi; + boot_params.apm_bios_info.dseg_len = oreg.di; - if (err) + if (oreg.flags & X86_EFLAGS_CF) return -1; /* Redo the installation check as the 32-bit connect; some BIOSes return different flags this way... 
*/ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + ireg.al = 0x00; + intcall(0x15, &ireg, &oreg); - if (err || bx != 0x504d) { + if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) { /* Failure with 32-bit connect, try to disconect and ignore */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); return -1; } - boot_params.apm_bios_info.version = ax; - boot_params.apm_bios_info.flags = cx; + boot_params.apm_bios_info.version = oreg.ax; + boot_params.apm_bios_info.flags = oreg.cx; return 0; } From 3435d3476c5ed955d56a6216ed2d156847b3a575 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:17:17 -0700 Subject: [PATCH 116/900] x86, setup: "glove box" BIOS interrupts in the EDD code Impact: BIOS proofing "Glove box" off BIOS interrupts in the EDD code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 69 ++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 39 deletions(-) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 1aae8f3e5ca..c501a5b466f 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. 
@@ -22,17 +23,17 @@ */ static int read_mbr(u8 devno, void *buf) { - u16 ax, bx, cx, dx; + struct biosregs ireg, oreg; - ax = 0x0201; /* Legacy Read, one sector */ - cx = 0x0001; /* Sector 0-0-1 */ - dx = devno; - bx = (size_t)buf; - asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) - : : "esi", "edi", "memory"); + initregs(&ireg); + ireg.ax = 0x0201; /* Legacy Read, one sector */ + ireg.cx = 0x0001; /* Sector 0-0-1 */ + ireg.dl = devno; + ireg.bx = (size_t)buf; - return -(u8)ax; /* 0 or -1 */ + intcall(0x13, &ireg, &oreg); + + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) @@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) static int get_edd_info(u8 devno, struct edd_info *ei) { - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; memset(ei, 0, sizeof *ei); /* Check Extensions Present */ - ax = 0x4100; - bx = EDDMAGIC1; - dx = devno; - asm("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x41; + ireg.bx = EDDMAGIC1; + ireg.dl = devno; + intcall(0x13, &ireg, &oreg); - if ((u8)ax) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No extended information */ - if (bx != EDDMAGIC2) + if (oreg.bx != EDDMAGIC2) return -1; ei->device = devno; - ei->version = ax >> 8; /* EDD version number */ - ei->interface_support = cx; /* EDD functionality subsets */ + ei->version = oreg.ah; /* EDD version number */ + ei->interface_support = oreg.cx; /* EDD functionality subsets */ /* Extended Get Device Parameters */ ei->params.length = sizeof(ei->params); - ax = 0x4800; - dx = devno; - asm("pushfl; int $0x13; popfl" - : "+a" (ax), "+d" (dx), "=m" (ei->params) - : "S" (&ei->params) - : "ebx", "ecx", "edi"); + ireg.ah = 0x48; + ireg.si = (size_t)&ei->params; + intcall(0x13, &ireg, &oreg); /* Get legacy CHS parameters */ /* Ralf 
Brown recommends setting ES:DI to 0:0 */ - ax = 0x0800; - dx = devno; - di = 0; - asm("pushw %%es; " - "movw %%di,%%es; " - "pushfl; stc; int $0x13; setc %%al; popfl; " - "popw %%es" - : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) - : : "esi"); + ireg.ah = 0x08; + ireg.es = 0; + intcall(0x13, &ireg, &oreg); - if ((u8)ax == 0) { - ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2); - ei->legacy_max_head = dx >> 8; - ei->legacy_sectors_per_track = cx & 0x3f; + if (!(oreg.eflags & X86_EFLAGS_CF)) { + ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2); + ei->legacy_max_head = oreg.dh; + ei->legacy_sectors_per_track = oreg.cl & 0x3f; } return 0; From 0a706db320768f8f6e43bbf73b58d2aabdc93354 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:19:00 -0700 Subject: [PATCH 117/900] x86, setup: "glove box" BIOS interrupts in the MCA code Impact: BIOS proofing "Glove box" off BIOS interrupts in the MCA code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: James Bottomley --- arch/x86/boot/mca.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c index 911eaae5d69..a95a531148e 100644 --- a/arch/x86/boot/mca.c +++ b/arch/x86/boot/mca.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. 
@@ -16,26 +17,22 @@ int query_mca(void) { - u8 err; - u16 es, bx, len; + struct biosregs ireg, oreg; + u16 len; - asm("pushw %%es ; " - "int $0x15 ; " - "setc %0 ; " - "movw %%es, %1 ; " - "popw %%es" - : "=acd" (err), "=acdSD" (es), "=b" (bx) - : "a" (0xc000)); + initregs(&ireg); + ireg.ah = 0xc0; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No MCA present */ - set_fs(es); - len = rdfs16(bx); + set_fs(oreg.es); + len = rdfs16(oreg.bx); if (len > sizeof(boot_params.sys_desc_table)) len = sizeof(boot_params.sys_desc_table); - copy_from_fs(&boot_params.sys_desc_table, bx, len); + copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len); return 0; } From cf06de7b9cdd3efee7a59dced1977b3c21d43732 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:20:11 -0700 Subject: [PATCH 118/900] x86, setup: "glove box" BIOS interrupts in the video code Impact: BIOS proofing "Glove box" off BIOS interrupts in the video code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Pavel Machek Cc: Rafael J. Wysocki --- arch/x86/boot/video-bios.c | 27 ++++---- arch/x86/boot/video-vesa.c | 137 ++++++++++++++++--------------------- arch/x86/boot/video-vga.c | 95 +++++++++++++++---------- arch/x86/boot/video.c | 42 +++++------- arch/x86/boot/video.h | 14 ---- 5 files changed, 151 insertions(+), 164 deletions(-) diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 3fa979c9c36..d660be49236 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. 
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi) static int set_bios_mode(u8 mode) { - u16 ax; + struct biosregs ireg, oreg; u8 new_mode; - ax = mode; /* AH=0x00 Set Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.al = mode; /* AH=0x00 Set Video Mode */ + intcall(0x10, &ireg, NULL); - ax = 0x0f00; /* Get Current Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + + ireg.ah = 0x0f; /* Get Current Video Mode */ + intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ - new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ + + /* Not all BIOSes are clean with the top bit */ + new_mode = ireg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ @@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode) /* Mode setting failed, but we didn't end up where we started. That's bad. Try to revert to the original video mode. */ - ax = boot_params.screen_info.orig_video_mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = boot_params.screen_info.orig_video_mode; + intcall(0x10, &ireg, NULL); } #endif return -1; diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 4a58c8ce3f6..c700147d6ff 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. 
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) - u16 ax, cx, di; + struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; struct mode_info *mi; @@ -39,13 +40,12 @@ static int vesa_probe(void) video_vesa.modes = GET_HEAP(struct mode_info, 0); - ax = 0x4f00; - di = (size_t)&vginfo; - asm(INT10 - : "+a" (ax), "+D" (di), "=m" (vginfo) - : : "ebx", "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f00; + ireg.di = (size_t)&vginfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f || + if (ireg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -65,14 +65,12 @@ static int vesa_probe(void) memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + ireg.ax = 0x4f01; + ireg.cx = mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (ireg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -111,20 +109,19 @@ static int vesa_probe(void) static int vesa_set_mode(struct mode_info *mode) { - u16 ax, bx, cx, di; + struct biosregs ireg, oreg; int is_graphic; u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; memset(&vminfo, 0, sizeof vminfo); /* Just in case... 
*/ - ax = 0x4f01; - cx = vesa_mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f01; + ireg.cx = vesa_mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode) } - ax = 0x4f02; - bx = vesa_mode; - di = 0; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f02; + ireg.bx = vesa_mode; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; graphic_mode = is_graphic; @@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode) /* Switch DAC to 8-bit mode */ static void vesa_dac_set_8bits(void) { + struct biosregs ireg, oreg; u8 dac_size = 6; /* If possible, switch the DAC to 8-bit mode */ if (vginfo.capabilities & 1) { - u16 ax, bx; - - ax = 0x4f08; - bx = 0x0800; - asm volatile(INT10 - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - if (ax == 0x004f) - dac_size = bx >> 8; + initregs(&ireg); + ireg.ax = 0x4f08; + ireg.bh = 0x08; + intcall(0x10, &ireg, &oreg); + if (oreg.ax == 0x004f) + dac_size = oreg.bh; } /* Set the color sizes to the DAC size, and offsets to 0 */ - boot_params.screen_info.red_size = dac_size; + boot_params.screen_info.red_size = dac_size; boot_params.screen_info.green_size = dac_size; - boot_params.screen_info.blue_size = dac_size; - boot_params.screen_info.rsvd_size = dac_size; + boot_params.screen_info.blue_size = dac_size; + boot_params.screen_info.rsvd_size = dac_size; - boot_params.screen_info.red_pos = 0; - boot_params.screen_info.green_pos = 0; - boot_params.screen_info.blue_pos = 0; - boot_params.screen_info.rsvd_pos = 0; + boot_params.screen_info.red_pos = 0; + boot_params.screen_info.green_pos = 0; + boot_params.screen_info.blue_pos = 0; + 
boot_params.screen_info.rsvd_pos = 0; } /* Save the VESA protected mode info */ static void vesa_store_pm_info(void) { - u16 ax, bx, di, es; + struct biosregs ireg, oreg; - ax = 0x4f0a; - bx = di = 0; - asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" - : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f0a; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; - boot_params.screen_info.vesapm_seg = es; - boot_params.screen_info.vesapm_off = di; + boot_params.screen_info.vesapm_seg = oreg.es; + boot_params.screen_info.vesapm_off = oreg.di; } /* @@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void) void vesa_store_edid(void) { #ifdef CONFIG_FIRMWARE_EDID - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; /* Apparently used as a nonsense token... */ memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); @@ -260,33 +250,26 @@ void vesa_store_edid(void) if (vginfo.version < 0x0200) return; /* EDID requires VBE 2.0+ */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0000; /* Report DDC capabilities */ - cx = 0; /* Controller 0 */ - di = 0; /* ES:DI must be 0 by spec */ + initregs(&ireg); + ireg.ax = 0x4f15; /* VBE DDC */ + /* ireg.bx = 0x0000; */ /* Report DDC capabilities */ + /* ireg.cx = 0; */ /* Controller 0 */ + ireg.es = 0; /* ES:DI must be 0 by spec */ + intcall(0x10, &ireg, &oreg); - /* Note: The VBE DDC spec is different from the main VESA spec; - we genuinely have to assume all registers are destroyed here. 
*/ - - asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es" - : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di) - : : "esi", "edx"); - - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; /* No EDID */ /* BH = time in seconds to transfer EDD information */ /* BL = DDC level supported */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0001; /* Read EDID */ - cx = 0; /* Controller 0 */ - dx = 0; /* EDID block number */ - di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ - asm(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), - "+c" (cx), "+D" (di) - : : "esi"); + ireg.ax = 0x4f15; /* VBE DDC */ + ireg.bx = 0x0001; /* Read EDID */ + /* ireg.cx = 0; */ /* Controller 0 */ + /* ireg.dx = 0; */ /* EDID block number */ + ireg.es = ds(); + ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */ + intcall(0x10, &ireg, &oreg); #endif /* CONFIG_FIRMWARE_EDID */ } diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 9e0587a3776..8f8d827e254 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. 
@@ -39,30 +40,30 @@ static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) { + struct biosregs ireg, oreg; u16 ax; u8 rows; u8 mode; + initregs(&ireg); + #ifdef CONFIG_VIDEO_400_HACK if (adapter >= ADAPTER_VGA) { - asm volatile(INT10 - : : "a" (0x1202), "b" (0x0030) - : "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1202; + ireg.bx = 0x0030; + intcall(0x10, &ireg, NULL); } #endif ax = 0x0f00; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); - - mode = (u8)ax; + intcall(0x10, &ireg, &oreg); + mode = oreg.al; set_fs(0); rows = rdfs8(0x484); /* rows minus one */ #ifndef CONFIG_VIDEO_400_HACK - if ((ax == 0x5003 || ax == 0x5007) && + if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; #endif @@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void) mode = 3; /* Set the mode */ - ax = mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = mode; /* AH=0: set mode */ + intcall(0x10, &ireg, NULL); do_restore = 1; return mode; } @@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void) static void vga_set_8font(void) { /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 8x8 font */ - asm volatile(INT10 : : "a" (0x1112), "b" (0)); + ireg.ax = 0x1112; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Use alternate print screen */ - asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); + ireg.ax = 0x1200; + ireg.bl = 0x20; + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 6-7 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); + ireg.ax = 0x0100; + ireg.cx = 0x0607; + intcall(0x10, &ireg, NULL); } static void vga_set_14font(void) { /* Set 9x14 font - 80x28 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 9x14 font */ - asm 
volatile(INT10 : : "a" (0x1111), "b" (0)); + ireg.ax = 0x1111; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 11-12 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); + ireg.ax = 0x0100; + ireg.cx = 0x0b0c; + intcall(0x10, &ireg, NULL); } static void vga_set_80x43(void) { /* Set 80x43 mode on VGA (not EGA) */ + struct biosregs ireg; + + initregs(&ireg); /* Set 350 scans */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); + ireg.ax = 0x1201; + ireg.bl = 0x30; + intcall(0x10, &ireg, NULL); /* Reset video mode */ - asm volatile(INT10 : : "a" (0x0003)); + ireg.ax = 0x0003; + intcall(0x10, &ireg, NULL); vga_set_8font(); } @@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode) */ static int vga_probe(void) { - u16 ega_bx; - static const char *card_name[] = { "CGA/MDA/HGC", "EGA", "VGA" }; @@ -240,26 +263,26 @@ static int vga_probe(void) sizeof(ega_modes)/sizeof(struct mode_info), sizeof(vga_modes)/sizeof(struct mode_info), }; - u8 vga_flag; - asm(INT10 - : "=b" (ega_bx) - : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ - : "ecx", "edx", "esi", "edi"); + struct biosregs ireg, oreg; + + initregs(&ireg); + + ireg.ax = 0x1200; + ireg.bl = 0x10; /* Check EGA/VGA */ + intcall(0x10, &ireg, &oreg); #ifndef _WAKEUP - boot_params.screen_info.orig_video_ega_bx = ega_bx; + boot_params.screen_info.orig_video_ega_bx = oreg.bx; #endif /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ - if ((u8)ega_bx != 0x10) { + if (oreg.bl != 0x10) { /* EGA/VGA */ - asm(INT10 - : "=a" (vga_flag) - : "a" (0x1a00) - : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1a00; + intcall(0x10, &ireg, &oreg); - if (vga_flag == 0x1a) { + if (oreg.al == 0x1a) { adapter = ADAPTER_VGA; #ifndef _WAKEUP boot_params.screen_info.orig_video_isVGA = 1; diff --git a/arch/x86/boot/video.c 
b/arch/x86/boot/video.c index 3bef2c1febe..bad728b76fc 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -18,33 +19,29 @@ static void store_cursor_position(void) { - u16 curpos; - u16 ax, bx; + struct biosregs ireg, oreg; - ax = 0x0300; - bx = 0; - asm(INT10 - : "=d" (curpos), "+a" (ax), "+b" (bx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x03; + intcall(0x10, &ireg, &oreg); - boot_params.screen_info.orig_x = curpos; - boot_params.screen_info.orig_y = curpos >> 8; + boot_params.screen_info.orig_x = oreg.dl; + boot_params.screen_info.orig_y = oreg.dh; } static void store_video_mode(void) { - u16 ax, page; + struct biosregs ireg, oreg; /* N.B.: the saving of the video page here is a bit silly, since we pretty much assume page 0 everywhere. 
*/ - ax = 0x0f00; - asm(INT10 - : "+a" (ax), "=b" (page) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x0f; + intcall(0x10, &ireg, &oreg); /* Not all BIOSes are clean with respect to the top bit */ - boot_params.screen_info.orig_video_mode = ax & 0x7f; - boot_params.screen_info.orig_video_page = page >> 8; + boot_params.screen_info.orig_video_mode = oreg.al & 0x7f; + boot_params.screen_info.orig_video_page = oreg.bh; } /* @@ -257,7 +254,7 @@ static void restore_screen(void) int y; addr_t dst = 0; u16 *src = saved.data; - u16 ax, bx, dx; + struct biosregs ireg; if (graphic_mode) return; /* Can't restore onto a graphic mode */ @@ -296,12 +293,11 @@ static void restore_screen(void) } /* Restore cursor position */ - ax = 0x0200; /* Set cursor position */ - bx = 0; /* Page number (<< 8) */ - dx = (saved.cury << 8)+saved.curx; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; /* Set cursor position */ + ireg.dh = saved.cury; + ireg.dl = saved.curx; + intcall(0x10, &ireg, NULL); } #else #define save_screen() ((void)0) diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index ee63f5d1446..5bb174a997f 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ extern int do_restore; /* Restore screen contents */ extern int graphic_mode; /* Graphics mode with linear frame buffer */ -/* - * int $0x10 is notorious for touching registers it shouldn't. - * gcc doesn't like %ebp being clobbered, so define it as a push/pop - * sequence here. - * - * A number of systems, including the original PC can clobber %bp in - * certain circumstances, like when scrolling. 
There exists at least - * one Trident video card which could clobber DS under a set of - * circumstances that we are unlikely to encounter (scrolling when - * using an extended graphics mode of more than 800x600 pixels), but - * it's cheap insurance to deal with that here. - */ -#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" - /* Accessing VGA indexed registers */ static inline u8 in_idx(u16 port, u8 index) { From 2062501ae6505dbc5bff3a792246c2661d114050 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Apr 2009 01:49:33 +0200 Subject: [PATCH 119/900] tracing/lockdep: report the time waited for a lock While trying to optimize the new lock on reiserfs to replace the bkl, I find the lock tracing very useful though it lacks something important for performance (and latency) instrumentation: the time a task waits for a lock. That's what this patch implements: bash-4816 [000] 202.652815: lock_contended: lock_contended: &sb->s_type->i_mutex_key bash-4816 [000] 202.652819: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652825: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652829: lock_acquired: &rq->lock (0.000 us) bash-4816 [000] 202.652833: lock_acquired: &sb->s_type->i_mutex_key (16.005 us) As shown above, the "lock acquired" field is followed by the time it has been waiting for the lock. Usually, a lock contended entry is followed by a near lock_acquired entry with a non-zero time waited. 
Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1238975373-15739-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/trace/lockdep_event_types.h | 23 ++++++++++++++++++----- kernel/lockdep.c | 8 ++++---- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index adccfcd2ec8..863f1e4583a 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -32,11 +32,24 @@ TRACE_FORMAT(lock_contended, TP_FMT("%s", lock->name) ); -TRACE_FORMAT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b0f01186696..c4582a6ea95 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3061,6 +3061,8 @@ found_it: put_lock_stats(stats); } +DEFINE_TRACE(lock_acquired); + static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3099,6 +3101,8 @@ found_it: hlock->holdtime_stamp = now; } + trace_lock_acquired(lock, ip, waittime); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -3137,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); - void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned 
long flags; - trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat)) return; From e71e99c294058a61b7a8b9bb6da2f745ac51aa4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 14:30:04 -0400 Subject: [PATCH 120/900] x86, function-graph: only save return values on x86_64 Impact: speed up The return to handler portion of the function graph tracer should only need to save the return values. The caller already saved off the registers that the callee can modify. The returning function already saved the registers it modified. When we call our own trace function it too will save the registers that the callee must restore. There's no reason to save off anything more than the registers used to return the values. Note, I did a complete kernel build with this modification and the function graph tracer running on x86_64. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a331ec38af9..1ac99865591 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -147,27 +147,14 @@ END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq From 5cb3d1d9d34ac04bcaa2034139345b2a5fea54c1 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Thu, 9 Apr 2009 14:08:18 +0800 Subject: [PATCH 121/900] tracing, net, skb tracepoint: make skb tracepoint use the
TRACE_EVENT() macro TRACE_EVENT is a more generic way to define a tracepoint. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Signed-off-by: Zhao Lei Acked-by: Neil Horman Cc: "David S. Miller" Cc: Arnaldo Carvalho de Melo Cc: "Steven Rostedt ;" Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DD90D2.5020604@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/skb.h | 4 +--- include/trace/skb_event_types.h | 38 +++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 include/trace/skb_event_types.h diff --git a/include/trace/skb.h b/include/trace/skb.h index b66206d9be7..d2de7174a6e 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -4,8 +4,6 @@ #include #include -DECLARE_TRACE(kfree_skb, - TP_PROTO(struct sk_buff *skb, void *location), - TP_ARGS(skb, location)); +#include #endif diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h new file mode 100644 index 00000000000..4a1c504c0e1 --- /dev/null +++ b/include/trace/skb_event_types.h @@ -0,0 +1,38 @@ + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. 
+#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index df56f5694be..33b6bfcba93 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -3,3 +3,4 @@ #include #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index fd13750ca4b..0e2aa80076d 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -3,3 +3,4 @@ #include #include #include +#include From bda869c614c937c318547c3ee1d65a316b693c21 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:05:10 +0200 Subject: [PATCH 122/900] x86: cacheinfo: use L3 cache index disable feature only for CPUs that support it AMD family 0x11 CPU doesn't support the feature. Some AMD family 0x10 CPUs do not support it or have an erratum, see erratum #382 in "Revision Guide for AMD Family 10h Processors, 41322 Rev. 3.40 February 2009". 
Signed-off-by: Andreas Herrmann CC: Mark Langsdorf Cc: Andrew Morton LKML-Reference: <20090409130510.GG31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e10..72401264912 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -291,6 +291,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } From 845d8c761ec763871936c62b837c4a9ea6d0fbdb Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:07:29 +0200 Subject: [PATCH 123/900] x86: cacheinfo: correct return value when cache_disable feature is not active Impact: bug fix If user writes to "cache_disable" attribute on a CPU that does not support this feature, the process hangs due to an invalid return value in store_cache_disable(). 
Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409130729.GH31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 72401264912..1ab46e05adf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -771,7 +771,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, unsigned int ret, index, val; if (!this_leaf->can_disable) - return 0; + return -EINVAL; if (strlen(buf) > 15) return -EINVAL; From afd9fceec55225d33be878927056a548c2eef26c Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:16:17 +0200 Subject: [PATCH 124/900] x86: cacheinfo: use cached K8 NB_MISC devices instead of scanning for it Impact: avoid code duplication Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409131617.GI31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/k8.h | 8 ++++++ arch/x86/kernel/cpu/intel_cacheinfo.c | 37 +++------------------------ 2 files changed, 11 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index 54c8cc53b24..c23b3d171be 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -12,4 +12,12 @@ extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_scan_nodes(unsigned long start, unsigned long end); +#ifdef CONFIG_K8_NB +#define node_to_k8_nb_misc(node) \ + (node < num_k8_northbridges) ? 
k8_northbridges[node] : NULL +#else +#define node_to_k8_nb_misc(node) NULL +#endif + + #endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1ab46e05adf..0cde0715369 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -704,30 +697,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); @@ -739,7 +708,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) if (!this_leaf->can_disable) return sprintf(buf, "Feature not enabled\n"); - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; @@ -783,7 +752,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, 
const char *buf, return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; From f8b201fc7110c3673437254e8ba02451461ece0b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:18:49 +0200 Subject: [PATCH 125/900] x86: cacheinfo: replace sysfs interface for cache_disable feature Impact: replace sysfs attribute Current interface violates against "one-value-per-sysfs-attribute rule". This patch replaces current attribute with two attributes -- one for each L3 Cache Index Disable register. Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409131849.GJ31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 94 +++++++++++++-------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0cde0715369..fc28291e40b 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -697,74 +697,70 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - 
- dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, &reg); - - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; + pci_read_config_dword(dev, 0x1BC + index * 4, &reg); + return sprintf(buf, "%x\n", reg); } -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; if (!this_leaf->can_disable) return -EINVAL; - if (strlen(buf) > 15) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) return -EINVAL; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) - return -EINVAL; - if (index > 1) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; val |= 0xc0000000; - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); - return -EINVAL; - } - 
pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); - - return 1; + return count; } +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + struct _cache_attr { struct attribute attr; ssize_t (*show)(struct _cpuid4_info *, char *); @@ -785,7 +781,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -797,7 +796,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; From ba518bea2db21c72d44a6cbfd825b026ef9cdcb6 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:24:06 +0200 Subject: [PATCH 126/900] x86: cacheinfo: disable L3 ECC scrubbing when L3 cache index is disabled (Use correct mask to zero out bits 24-28 by Andreas) Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409132406.GK31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc28291e40b..d46a849f44a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ 
b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -731,6 +731,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) return -EINVAL; @@ -745,6 +746,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; val |= 0xc0000000; + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); From 2fad2d9bb8310889f3261035b594b4e068b6eb8b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:31:53 +0200 Subject: [PATCH 127/900] x86/docs: add description for cache_disable sysfs interface Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409133153.GL31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- .../ABI/testing/sysfs-devices-cache_disable | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-cache_disable diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable new file mode 100644 index 00000000000..175bb4f7051 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-cache_disable @@ -0,0 +1,18 @@ +What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X +Date: August 2008 +KernelVersion: 2.6.27 +Contact: mark.langsdorf@amd.com +Description: These files exist in every cpu's cache index directories. + There are currently 2 cache_disable_# files in each + directory. Reading from these files on a supported + processor will return that cache disable index value + for that processor and node. Writing to one of these + files will cause the specified cache index to be disabled. 
+ + Currently, only AMD Family 10h Processors support cache index + disable, and only for their L3 caches. See the BIOS and + Kernel Developer's Guide at + http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf + for formatting information and other details on the + cache index disable. +Users: joachim.deguara@amd.com From f465145235313c451164bdfa9037ac254bf00c9a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:18 +0300 Subject: [PATCH 128/900] x86: move x86_quirk_pre_intr_init() to irqinit_32.c Impact: cleanup In preparation for unifying irqinit_{32,64}.c, make x86_quirk_pre_intr_init() local to irqinit_32.c. Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i8259.h | 4 ---- arch/x86/include/asm/setup.h | 1 - arch/x86/kernel/irqinit_32.c | 20 +++++++++++++++++++- arch/x86/kernel/setup.c | 18 ------------------ 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 1a99e6c092a..58d7091eeb1 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip; extern void mask_8259A(void); extern void unmask_8259A(void); -#ifdef CONFIG_X86_32 -extern void init_ISA_irqs(void); -#endif - #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bdc2ada05ae..4093d1ed6db 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -33,7 +33,6 @@ struct x86_quirks { int (*setup_ioapic_ids)(void); }; -extern void x86_quirk_pre_intr_init(void); extern void x86_quirk_intr_init(void); extern void x86_quirk_trap_init(void); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8836f..0c0dedccd03 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -53,7 +53,7 @@ static struct irqaction 
fpu_irq = { .name = "fpu", }; -void __init init_ISA_irqs(void) +static void __init init_ISA_irqs(void) { int i; @@ -121,6 +121,24 @@ int vector_used_by_percpu_irq(unsigned int vector) /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} + void __init native_init_IRQ(void) { int i; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..523bb697120 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -996,24 +996,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. 
- **/ -void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - /** * x86_quirk_intr_init - post gate setup interrupt initialisation * From 7371d9fcb88dc9185be9719f64744a339c537a92 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:19 +0300 Subject: [PATCH 129/900] x86: move init_ISA_irqs() in irqinit_32.c to match ordering in irqinit_64.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 0c0dedccd03..c5cb769db7b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -53,30 +53,6 @@ static struct irqaction fpu_irq = { .name = "fpu", }; -static void __init init_ISA_irqs(void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -118,6 +94,30 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } +static void __init init_ISA_irqs(void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + /* Overridden in paravirt.c */ void init_IRQ(void) 
__attribute__((weak, alias("native_init_IRQ"))); From 36290d87f5abf260a543e5b711be4ceed03e6b1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:20 +0300 Subject: [PATCH 130/900] x86: introduce smp_intr_init() in irqinit_32.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 61 +++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c5cb769db7b..df0aad5a062 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -121,6 +121,38 @@ static void __init init_ISA_irqs(void) /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +static void __init smp_intr_init(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. 
+ */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); +#endif +} + /** * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors * @@ -158,34 +190,7 @@ void __init native_init_IRQ(void) } -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. 
- */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif + smp_intr_init(); #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ From 22813c45228160b07244a7c4ed7580388ac0f33d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:21 +0300 Subject: [PATCH 131/900] x86: introduce apic_intr_init() in irqinit_32.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 40 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index df0aad5a062..9ba68c4557b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -171,25 +171,8 @@ static void __init x86_quirk_pre_intr_init(void) init_ISA_irqs(); } -void __init native_init_IRQ(void) +static void __init apic_intr_init(void) { - int i; - - /* Execute any quirks 
before the call gates are initialised: */ - x86_quirk_pre_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - smp_intr_init(); #ifdef CONFIG_X86_LOCAL_APIC @@ -208,6 +191,27 @@ void __init native_init_IRQ(void) /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); From d3496c85cae22fb7713af6ed542a6aeae8ee4210 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:22 +0300 Subject: [PATCH 132/900] x86: use identical loop constructs in 32-bit and 64-bit native_init_IRQ() Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9ba68c4557b..1029a1855f9 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -205,7 +205,7 @@ void __init native_init_IRQ(void) * us. 
(some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8cd10537fd4..1c8858bb27f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -159,15 +159,16 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* IA32_SYSCALL_VECTOR was reserved in trap_init. 
*/ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } apic_intr_init(); From b0096bb0b640d0a7713618b3472fd0f4adf30a96 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:23 +0300 Subject: [PATCH 133/900] x86: unify smp_intr_init() in irqinit_{32,64}.h Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 8 +++++--- arch/x86/kernel/irqinit_64.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 1029a1855f9..ef2528d298b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -123,7 +123,8 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) { -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. 
@@ -143,14 +144,15 @@ static void __init smp_intr_init(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for single call function */ + /* IPI for generic single function call */ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ } /** diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1c8858bb27f..9e7c57dc79e 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -107,6 +107,7 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) { #ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. 
@@ -134,6 +135,7 @@ static void __init smp_intr_init(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ } static void __init apic_intr_init(void) From 598c73d250ffb112715aa48fb325d79e255be23b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:24 +0300 Subject: [PATCH 134/900] x86: unify init_ISA_irqs() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ef2528d298b..4488b713396 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -98,7 +98,7 @@ static void __init init_ISA_irqs(void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif init_8259A(0); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 9e7c57dc79e..61c9a922e80 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -84,9 +84,14 @@ static void __init init_ISA_irqs(void) { int i; +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); +#endif init_8259A(0); + /* + * 16 old-style INTA-cycle interrupts: + */ for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); @@ -94,11 +99,8 @@ static void __init init_ISA_irqs(void) desc->action = NULL; desc->depth = 1; - /* - * 16 old-style INTA-cycle interrupts: - */ set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); + handle_level_irq, "XT"); } } From 320fd99672a44ece6d1cd0d838ba31c8ebbf5979 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:25 +0300 Subject: [PATCH 135/900] x86: unify native_init_IRQ() in irqinit_{32,64}.c Impact: 
cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 53 ++++++++++++++--------- arch/x86/kernel/irqinit_64.c | 82 +++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 4488b713396..a780de3ad5d 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -22,7 +22,7 @@ #include #include - +#ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 * as the irq is unreliable, and exception 16 works correctly @@ -52,6 +52,7 @@ static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", }; +#endif /* * IRQ2 is cascade interrupt to second interrupt controller @@ -155,24 +156,6 @@ static void __init smp_intr_init(void) #endif /* CONFIG_SMP */ } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - static void __init apic_intr_init(void) { smp_intr_init(); @@ -195,12 +178,36 @@ static void __init apic_intr_init(void) #endif } +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. 
+ **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + void __init native_init_IRQ(void) { int i; +#ifdef CONFIG_X86_32 /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); +#else + init_ISA_irqs(); +#endif /* * Cover the whole vector space, no vector can escape @@ -208,9 +215,15 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif } apic_intr_init(); @@ -218,6 +231,7 @@ void __init native_init_IRQ(void) if (!acpi_ioapic) setup_irq(2, &irq2); +#ifdef CONFIG_X86_32 /* * Call quirks after call gates are initialised (usually add in * the architecture specific gates): @@ -232,4 +246,5 @@ void __init native_init_IRQ(void) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); +#endif } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 61c9a922e80..ed50e35ce97 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -39,14 +39,46 @@ * (these are usually mapped into the 0x30-0xff vector range) */ +#ifdef CONFIG_X86_32 +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. 
+ */ + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + outb(0, 0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .name = "fpu", +}; +#endif + /* * IRQ2 is cascade interrupt to second interrupt controller */ - static struct irqaction irq2 = { .handler = no_action, .name = "cascade", }; + DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... IRQ0_VECTOR - 1] = -1, [IRQ0_VECTOR] = 0, @@ -158,11 +190,36 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); } +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + void __init native_init_IRQ(void) { int i; +#ifdef CONFIG_X86_32 + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); +#else init_ISA_irqs(); +#endif /* * Cover the whole vector space, no vector can escape @@ -170,13 +227,36 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else /* IA32_SYSCALL_VECTOR was reserved in trap_init. 
*/ if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif } apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); + +#ifdef CONFIG_X86_32 + /* + * Call quirks after call gates are initialised (usually add in + * the architecture specific gates): + */ + x86_quirk_intr_init(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +#endif } From 778838600eb6973bdb6fd11e7f91b43cea4d6f45 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:26 +0300 Subject: [PATCH 136/900] x86: unify trivial differences in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 20 ++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index a780de3ad5d..72ce94268d3 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -1,20 +1,24 @@ +#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -22,6 +26,22 @@ #include #include +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. 
+ * + * (these are usually mapped into the 0x30-0xff vector range) + */ + #ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ed50e35ce97..687b6c33cd7 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -17,11 +17,14 @@ #include #include +#include #include #include #include #include +#include #include +#include /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -136,6 +139,7 @@ static void __init init_ISA_irqs(void) } } +/* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) From ab19c25abd14db28d7454f00805ea59f22ed6057 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:27 +0300 Subject: [PATCH 137/900] x86: unify apic_intr_init() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 9 ++++++++- arch/x86/kernel/irqinit_64.c | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 72ce94268d3..f3be5e97427 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -180,7 +180,12 @@ static void __init apic_intr_init(void) { smp_intr_init(); -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_64 + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -192,10 +197,12 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif +#ifdef CONFIG_X86_32 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 
/* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 687b6c33cd7..f3be5e97427 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -180,9 +180,12 @@ static void __init apic_intr_init(void) { smp_intr_init(); +#ifdef CONFIG_X86_64 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -192,6 +195,14 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#endif } #ifdef CONFIG_X86_32 From 31cb45ef2600d47191d51253ec94b5e3f689260d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:28 +0300 Subject: [PATCH 138/900] x86: unify irqinit_{32,64}.c into irqinit.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/{irqinit_32.c => irqinit.c} | 0 arch/x86/kernel/irqinit_64.c | 277 -------------------- 3 files changed, 1 insertion(+), 278 deletions(-) rename arch/x86/kernel/{irqinit_32.c => irqinit.c} (100%) delete mode 100644 arch/x86/kernel/irqinit_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 145cce75cda..16e3acfe19e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp) obj-y := 
process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit_$(BITS).o +obj-y += setup.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c similarity index 100% rename from arch/x86/kernel/irqinit_32.c rename to arch/x86/kernel/irqinit.c diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index f3be5e97427..00000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. 
IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", -}; -#endif - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; - -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... 
NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, 
irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_64 - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif - -#ifdef CONFIG_X86_32 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#endif -} - -#ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} -#endif - -void __init native_init_IRQ(void) -{ - int i; - -#ifdef CONFIG_X86_32 - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif - - /* - * Cover the whole vector space, no vector can escape - * us. 
(some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); - -#ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -#endif -} From ac3048dfd4740becf8d768844cf47ebee363c9f8 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:29 +0300 Subject: [PATCH 139/900] x86: define IA32_SYSCALL_VECTOR on 32-bit to reduce ifdefs Impact: cleanup We can remove some #ifdefs if we define IA32_SYSCALL_VECTOR on 32-bit. 
Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 1 + arch/x86/kernel/irqinit.c | 6 ------ arch/x86/kernel/traps.c | 5 +---- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47..910b5a3d675 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -34,6 +34,7 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 #else # define IA32_SYSCALL_VECTOR 0x80 #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f3be5e97427..f2c60a59f47 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -242,15 +242,9 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else /* IA32_SYSCALL_VECTOR was reserved in trap_init. 
*/ if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif } apic_intr_init(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..2310700faca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -969,11 +969,8 @@ void __init trap_init(void) for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); -#ifdef CONFIG_X86_64 set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else - set_bit(SYSCALL_VECTOR, used_vectors); -#endif + /* * Should be a barrier for any external CPU state: */ From abdb5a5713330e17dfe91ab0d3e29c4744d95162 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:30 +0300 Subject: [PATCH 140/900] x86: remove some ifdefs from native_init_IRQ() Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f2c60a59f47..626977200a5 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -205,7 +205,6 @@ static void __init apic_intr_init(void) #endif } -#ifdef CONFIG_X86_32 /** * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors * @@ -217,24 +216,21 @@ static void __init apic_intr_init(void) **/ static void __init x86_quirk_pre_intr_init(void) { +#ifdef CONFIG_X86_32 if (x86_quirks->arch_pre_intr_init) { if (x86_quirks->arch_pre_intr_init()) return; } +#endif init_ISA_irqs(); } -#endif void __init native_init_IRQ(void) { int i; -#ifdef CONFIG_X86_32 /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif /* * Cover the whole vector space, no vector can escape From 6265ff19ca08df0d96c859ae5e4dc2d9ad07070e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:47:10 +0200 Subject: [PATCH 141/900] x86: cacheinfo: complete 
L2/L3 Cache and TLB associativity field definitions See "CPUID Specification" (AMD Publication #: 25481, Rev. 2.28, April 2008) Signed-off-by: Andreas Herrmann Cc: Mark Langsdorf LKML-Reference: <20090409134710.GA8026@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index d46a849f44a..789efe217e1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -200,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -264,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; From 47f16ca7631f9c6bad8e6d968cfb1433029b09ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Apr 2009 14:58:05 +0200 Subject: [PATCH 142/900] x86, irqinit: preempt merge conflicts To make the topic merge life easier for tip:perfcounters/core, include two (inactive in this topic) IRQ vector initializations here. Also fix build bug - missing kprobes.h inclusion. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 626977200a5..b424c32c4a0 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -195,6 +196,13 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif + #endif #ifdef CONFIG_X86_32 From a5a2a0c7fa039c59619bc908b3b1ed24734d442a Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 10 Apr 2009 09:50:05 -0700 Subject: [PATCH 143/900] futex: fix futex_wait_setup key handling If the get_futex_key() call were to fail, the existing code would try and put_futex_key() prior to returning. This patch makes sure we only put_futex_key() if get_futex_key() succeeded. Reported-by: Clark Williams Signed-off-by: Darren Hart LKML-Reference: <20090410165005.14342.16973.stgit@Aeon> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index 041bf3ac4be..6d2daa46f9f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1668,7 +1668,7 @@ retry: q->key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) - goto out; + return ret; retry_private: *hb = queue_lock(q); From cf9972a921470b0a2da7906104bcd540b20e33bf Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Sat, 11 Apr 2009 22:24:05 -0700 Subject: [PATCH 144/900] x86, setup: fix comment in the "glove box" code Impact: Comment change only The glove box is about avoiding problems with *registers* being touched, not *memory*. Signed-off-by: H. Peter Anvin --- arch/x86/boot/bioscall.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 22b4b3efb9f..507793739ea 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -10,7 +10,7 @@ /* * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes - * touching memory they shouldn't be. + * touching registers they shouldn't be. */ .code16 From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 12:55:26 +0530 Subject: [PATCH 145/900] x86: apic/x2apic_cluster.c x86_cpu_to_logical_apicid should be static Impact: reduce kernel size a bit, address sparse warning Addresses the problem pointed out by this sparse warning: arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static? 
Signed-off-by: Jaswinder Singh Rajput Cc: Suresh Siddha LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d1..8e4cbb255c3 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include #include -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { From 56c49951747f250d8398582509e02ae5ce1d36d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 11 Apr 2009 15:51:19 -0400 Subject: [PATCH 146/900] tracing: Add documentation for the power tracer Signed-off-by: "Theodore Ts'o" Acked-by: Arjan van de Ven Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1239479479-2603-4-git-send-email-tytso@mit.edu> Signed-off-by: Ingo Molnar --- Documentation/trace/power.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Documentation/trace/power.txt diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt new file mode 100644 index 00000000000..cd805e16dc2 --- /dev/null +++ b/Documentation/trace/power.txt @@ -0,0 +1,17 @@ +The power tracer collects detailed information about C-state and P-state +transitions, instead of just looking at the high-level "average" +information. + +There is a helper script found in scripts/tracing/power.pl in the kernel +sources which can be used to parse this information and create a +Scalable Vector Graphics (SVG) picture from the trace data. 
+ +To use this tracer: + + echo 0 > /sys/kernel/debug/tracing/tracing_enabled + echo power > /sys/kernel/debug/tracing/current_tracer + echo 1 > /sys/kernel/debug/tracing/tracing_enabled + sleep 1 + echo 0 > /sys/kernel/debug/tracing/tracing_enabled + cat /sys/kernel/debug/tracing/trace | \ + perl scripts/tracing/power.pl > out.svg From abd41443ac76d3e9c29a8c1d9e9a3312306cc55e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 11 Apr 2009 15:51:18 -0400 Subject: [PATCH 147/900] tracing: Document the event tracing system Signed-off-by: "Theodore Ts'o" Cc: Theodore Ts'o Cc: Steven Rostedt LKML-Reference: <1239479479-2603-3-git-send-email-tytso@mit.edu> Signed-off-by: Ingo Molnar --- Documentation/trace/events.txt | 135 +++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 Documentation/trace/events.txt diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt new file mode 100644 index 00000000000..abdee664c0f --- /dev/null +++ b/Documentation/trace/events.txt @@ -0,0 +1,135 @@ + Event Tracing + + Documentation written by Theodore Ts'o + +Introduction +============ + +Tracepoints (see Documentation/trace/tracepoints.txt) can be used +without creating custom kernel modules to register probe functions +using the event tracing infrastructure. + +Not all tracepoints can be traced using the event tracing system; +the kernel developer must provide code snippets which define how the +tracing information is saved into the tracing buffer, and how the +tracing information should be printed. + +Using Event Tracing +=================== + +The events which are available for tracing can be found in the file +/sys/kernel/debug/tracing/available_events. + +To enable a particular event, such as 'sched_wakeup', simply echo it +to /sys/kernel/debug/tracing/set_event. 
For example: + + # echo sched_wakeup > /sys/kernel/debug/tracing/set_event + +[ Note: events can also be enabled/disabled via the 'enabled' toggle + found in the /sys/kernel/debug/tracing/events/ hierarchy of directories. ] + +To disable an event, echo the event name to the set_event file prefixed +with an exclamation point: + + # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event + +To disable all events, echo an empty line to the set_event file: + + # echo > /sys/kernel/debug/tracing/set_event + +The events are organized into subsystems, such as ext4, irq, sched, +etc., and a full event name looks like this: <subsystem>:<event>. The +subsystem name is optional, but it is displayed in the available_events +file. All of the events in a subsystem can be specified via the syntax +"<subsystem>:*"; for example, to enable all irq events, you can use the +command: + + # echo 'irq:*' > /sys/kernel/debug/tracing/set_event + +Defining an event-enabled tracepoint +------------------------------------ + +A kernel developer who wishes to define an event-enabled tracepoint +must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE. +This is done via two header files in include/trace. For example, to +event-enable the jbd2 subsystem, we must create two files, +include/trace/jbd2.h and include/trace/jbd2_event_types.h. The +include/trace/jbd2.h file should be included by kernel source files that +will have a tracepoint inserted, and might look like this: + +#ifndef _TRACE_JBD2_H +#define _TRACE_JBD2_H + +#include +#include + +#include + +#endif + +In a file that utilizes a jbd2 tracepoint, this header file would be +included. Note that you still have to use DEFINE_TRACE(). 
So for +example, if fs/jbd2/commit.c planned to use the jbd2_start_commit +tracepoint, it would have the following near the beginning of the file: + +#include <trace/jbd2.h> + +DEFINE_TRACE(jbd2_start_commit); + +Then in the function that would call the tracepoint, it would call the +tracepoint function. 
(For more information, please see the tracepoint +documentation in Documentation/trace/tracepoints.txt): + + trace_jbd2_start_commit(journal, commit_transaction); + +The code snippets which allow jbd2_start_commit to be an event-enabled +tracepoint are placed in the file include/trace/jbd2_event_types.h: + +/* use <trace/jbd2.h> instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + +#include + +TRACE_EVENT(jbd2_start_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + TP_ARGS(journal, commit_transaction), + TP_STRUCT__entry( + __array( char, devname, BDEVNAME_SIZE+24 ) + __field( int, transaction ) + ), + TP_fast_assign( + memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24); + __entry->transaction = commit_transaction->t_tid; + ), + TP_printk("dev %s transaction %d", + __entry->devname, __entry->transaction) +); + +The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE. The new +arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and +TP_printk. + +TP_STRUCT__entry defines the data structure which will be stored in the +trace buffer. Normally, fields in __entry will be arrays or simple +types. It is possible to place data structures in __entry --- however, +pointers in the data structure can not be trusted, since they will be +accessed sometime later by TP_printk, and if the data structure contains +fields that will not or cannot be used by TP_printk, this will waste +space in the trace buffer. In general, data structures should be +avoided, unless they contain only non-pointer types and all of the +fields will be used by TP_printk. + +TP_fast_assign defines the code snippet which saves information into the +__entry data structure, using the passed-in arguments defined in +TP_PROTO and TP_ARGS. + +Finally, TP_printk will print the __entry data structure. 
At the time +when the code snippet defined by TP_printk is executed, it will not have +access to the TP_ARGS arguments; it can only use the information saved +in the __entry data structure. From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 00:03:10 +0530 Subject: [PATCH 148/900] x86: clean up declarations and variables Impact: cleanup, no code changed - syscalls.h update declarations due to unifications - irq.c declare smp_generic_interrupt() before it gets used - process.c declare sys_fork() and sys_vfork() before they get used - tsc.c rename tsc_khz shadowed variable - apic/probe_32.c declare apic_default before it gets used - apic/nmi.c prev_nmi_count should be unsigned - apic/io_apic.c declare smp_irq_move_cleanup_interrupt() before it gets used - mm/init.c declare direct_gbpages and free_initrd_mem before they get used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 3 +++ arch/x86/include/asm/hw_irq.h | 4 +++ arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/include/asm/pgtable_64.h | 6 ----- arch/x86/include/asm/syscalls.h | 45 ++++++++++++++++--------------- arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/irq.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/tsc.c | 8 +++--- arch/x86/mm/init.c | 1 + 12 files changed, 42 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f837742..5773660c8cd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -478,6 +478,9 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); #ifdef CONFIG_X86_32 + +extern struct apic apic_default; + /* * Set up the logical destination ID. 
* diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd7..be9ae4111c9 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -78,7 +78,11 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); +extern void smp_generic_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_IO_APIC +extern asmlinkage void smp_irq_move_cleanup_interrupt(void); +#endif #ifdef CONFIG_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 29d96d168bc..3f8d09d94eb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -503,6 +503,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ +extern int direct_gbpages; + /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6b87bc6d501..abde308fdb0 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", \ __FILE__, __LINE__, &(e), pte_val(e)) @@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define update_mmu_cache(vma, address, pte) do { } while (0) -extern int direct_gbpages; - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 7043408f690..372b76edd63 100644 --- 
a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -1,7 +1,7 @@ /* * syscalls.h - Linux syscall interfaces (arch-specific) * - * Copyright (c) 2008 Jaswinder Singh + * Copyright (c) 2008 Jaswinder Singh Rajput * * This file is released under the GPLv2. * See the file COPYING for more details. @@ -12,50 +12,55 @@ #include #include -#include #include +#include /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/process.c */ +int sys_fork(struct pt_regs *); +int sys_vfork(struct pt_regs *); + /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/signal.c */ +long sys_rt_sigreturn(struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 +/* kernel/ioport.c */ +long sys_iopl(struct pt_regs *); + /* kernel/process_32.c */ -int sys_fork(struct pt_regs *); int sys_clone(struct pt_regs *); -int sys_vfork(struct pt_regs *); int sys_execve(struct pt_regs *); -/* kernel/signal_32.c */ +/* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); - -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/sys_i386_32.c */ +struct mmap_arg_struct; +struct sel_arg_struct; +struct oldold_utsname; +struct old_utsname; + asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct mmap_arg_struct; asmlinkage int old_mmap(struct mmap_arg_struct __user *); -struct sel_arg_struct; asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, 
void __user *, long); -struct old_utsname; asmlinkage int sys_uname(struct old_utsname __user *); -struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ @@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + /* kernel/process_64.c */ -asmlinkage long sys_fork(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_vfork(struct pt_regs *); asmlinkage long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - -/* kernel/signal_64.c */ +/* kernel/signal.c */ asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); /* kernel/sys_x86_64.c */ +struct new_utsname; + asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct new_utsname; asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e46d6..870c92ddaf9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index d6bd6240715..02056310f2f 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git 
a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e..440a8bccd91 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8..2188267f523 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..3e21e38d7e3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7a567ebe636..a8dc0d00b83 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd3da1dda1c..40924e445f5 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include #include #include From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:26:18 +0800 Subject: [PATCH 149/900] tracing, kmemtrace: 
Separate include/trace/kmemtrace.h to kmemtrace part and tracepoint part Impact: refactor code for future changes Current kmemtrace.h is used both as header file of kmemtrace and kmem's tracepoints definition. Tracepoints' definition file may be used by other code, and should only have definition of tracepoint. We can separate include/trace/kmemtrace.h into 2 files: include/linux/kmemtrace.h: header file for kmemtrace include/trace/kmem.h: definition of kmem tracepoints Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 25 +++++++++++++++++++++++++ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/{kmemtrace.h => kmem.h} | 25 +++---------------------- init/main.c | 2 +- kernel/trace/kmemtrace.c | 2 +- kernel/trace/trace.h | 2 +- mm/slab.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 10 files changed, 36 insertions(+), 30 deletions(-) create mode 100644 include/linux/kmemtrace.h rename include/trace/{kmemtrace.h => kmem.h} (78%) diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 00000000000..15c45a27a92 --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. 
+ */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include + +#ifdef CONFIG_KMEMTRACE +extern void kmemtrace_init(void); +#else +static inline void kmemtrace_init(void) +{ +} +#endif + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5ac9b0bcaf9..713f841ecaa 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. */ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5046f90c117..be5d40c43bd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmem.h similarity index 78% rename from include/trace/kmemtrace.h rename to include/trace/kmem.h index 28ee69f9cd4..24d25192818 100644 --- a/include/trace/kmemtrace.h +++ b/include/trace/kmem.h @@ -1,25 +1,9 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. 
- */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ +#ifndef _TRACE_KMEM_H +#define _TRACE_KMEM_H #include #include -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - DECLARE_TRACE(kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, @@ -57,7 +41,4 @@ DECLARE_TRACE(kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr), TP_ARGS(call_site, ptr)); -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - +#endif /* _TRACE_KMEM_H */ diff --git a/init/main.c b/init/main.c index 3585f073d63..eece40cd8a6 100644 --- a/init/main.c +++ b/init/main.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e3..7a0aa0e260d 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace_output.h" #include "trace.h" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f76a8f8689d..34b94c3f40a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include enum trace_type { diff --git a/mm/slab.c b/mm/slab.c index 9a90b00d2f9..f85831da908 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index a2d4ab32198..494f05f1941 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index 7ab54ecbd3f..ea9e7160e2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include From fc182a4330fc22ea1b68fa3d5064dd85a73a4c4a Mon Sep 17 00:00:00 2001 
From: Zhaolei Date: Fri, 10 Apr 2009 14:27:38 +0800 Subject: [PATCH 150/900] tracing, kmemtrace: Make kmem tracepoints use TRACE_EVENT macro TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE6DA.80600@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/kmem.h | 39 +----- include/trace/kmem_event_types.h | 193 ++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + 4 files changed, 197 insertions(+), 37 deletions(-) create mode 100644 include/trace/kmem_event_types.h diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 24d25192818..46efc2423f0 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,44 +1,9 @@ #ifndef _TRACE_KMEM_H #define _TRACE_KMEM_H -#include #include +#include -DECLARE_TRACE(kmalloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmem_cache_alloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmalloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kmem_cache_alloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, 
- size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kfree, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); -DECLARE_TRACE(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); +#include #endif /* _TRACE_KMEM_H */ diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h new file mode 100644 index 00000000000..4ff420fe467 --- /dev/null +++ b/include/trace/kmem_event_types.h @@ -0,0 +1,193 @@ + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + 
__entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + 
__entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index 33b6bfcba93..552a50e169a 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -4,3 +4,4 @@ #include #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 0e2aa80076d..13d6b85668c 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -4,3 +4,4 @@ #include #include #include +#include From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:18:53 +0800 Subject: [PATCH 151/900] blktrace: fix output of unknown events Not all events are pc (packet command) events. An event is a pc event only if it has BLK_TC_PC bit set. 
Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D3236D.3090705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..e45e1af1356 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); + ret = trace_seq_printf(s, "Unknown action %x\n", what); else { ret = log_action(iter, what2act[what].act[long_act]); if (ret) From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:19:19 +0800 Subject: [PATCH 152/900] blktrace: fix output of BLK_TC_PC events BLK_TC_PC events should be treated differently from BLK_TC_FS events. Before this patch: # echo 1 > /sys/block/sda/sda1/trace/enable # echo pc > /sys/block/sda/sda1/trace/act_mask # echo blk > /debugfs/tracing/current_tracer # (generate some BLK_TC_PC events) # cat trace bash-2184 [000] 1774.275413: 8,7 I N [bash] bash-2184 [000] 1774.275435: 8,7 D N [bash] bash-2184 [000] 1774.275540: 8,7 I R [bash] bash-2184 [000] 1774.275547: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275580: 8,7 C N 0 [0] bash-2184 [000] 1774.275648: 8,7 I R [bash] bash-2184 [000] 1774.275653: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275682: 8,7 C N 0 [0] bash-2184 [000] 1774.275739: 8,7 I R [bash] bash-2184 [000] 1774.275744: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275771: 8,7 C N 0 [0] bash-2184 [000] 1774.275804: 8,7 I R [bash] bash-2184 [000] 1774.275808: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275836: 8,7 C N 0 [0] After this patch: # cat trace bash-2263 [000] 366.782149: 8,7 I N 0 (00 ..) [bash] bash-2263 [000] 366.782323: 8,7 D N 0 (00 ..) 
[bash] bash-2263 [000] 366.782557: 8,7 I R 8 (25 00 ..) [bash] bash-2263 [000] 366.782560: 8,7 D R 8 (25 00 ..) [bash] ksoftirqd/0-4 [000] 366.782582: 8,7 C N (25 00 ..) [0] bash-2263 [000] 366.782648: 8,7 I R 8 (5a 00 3f 00) [bash] bash-2263 [000] 366.782650: 8,7 D R 8 (5a 00 3f 00) [bash] ksoftirqd/0-4 [000] 366.782669: 8,7 C N (5a 00 3f 00) [0] bash-2263 [000] 366.782710: 8,7 I R 8 (5a 00 08 00) [bash] bash-2263 [000] 366.782713: 8,7 D R 8 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.782730: 8,7 C N (5a 00 08 00) [0] bash-2263 [000] 366.783375: 8,7 I R 36 (5a 00 08 00) [bash] bash-2263 [000] 366.783379: 8,7 D R 36 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.783404: 8,7 C N (5a 00 08 00) [0] This is what we do with PC events in user-space blktrace. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D32387.9040106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 88 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e45e1af1356..2b98195b338 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent) return te_blk_io_trace(ent) + 1; } +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; @@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) MAJOR(t->device), MINOR(t->device), act, rwbs); } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len 
= te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) "); + } + + return trace_seq_puts(s, ") "); +} + static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } } static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } } static int blk_log_remap(struct trace_seq *s, const struct trace_entry 
*ent) From 3fa89ca7ba5ba50b3924a11f6604b4bdce5f7842 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 12 Apr 2009 20:37:25 +0530 Subject: [PATCH 153/900] x86: vdso/vma.c declare vdso_enabled and arch_setup_additional_pages before they get used Impact: cleanup, address sparse warnings Addresses the problem pointed out by these sparse warning: arch/x86/vdso/vma.c:19:28: warning: symbol 'vdso_enabled' was not declared. Should it be static? arch/x86/vdso/vma.c:101:5: warning: symbol 'arch_setup_additional_pages' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1239548845.4170.2.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7133cdf9098..cac083386e0 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include From edea7148a87c099e5d5d4838285cc27e459588b7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:39 +0400 Subject: [PATCH 154/900] x86: irq.c - tiny cleanup Impact: cleanup, robustization 1) guard ack_bad_irq with printk_ratelimit since there is no guarantee we will not be flooded one day 2) use pr_emerg() helper Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.277579847@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8..6603492e8b7 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -24,7 +24,8 @@ void (*generic_interrupt_extension)(void) = NULL; */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); #ifdef CONFIG_X86_LOCAL_APIC /* @@ -178,7 +179,7 @@ u64 
arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_thermal_count; # ifdef CONFIG_X86_64 sum += irq_stats(cpu)->irq_threshold_count; -#endif +# endif #endif return sum; } @@ -219,8 +220,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) #endif if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); From c0eaa4536f08b98fbcfa7fce5b7b0de1bebcb0e1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:40 +0400 Subject: [PATCH 155/900] x86: apic - introduce imcr_ helpers Impact: cleanup Separate the port-writing magic into helpers with comments. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.535921550@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 098ec84b8c0..c3be10f5773 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -98,6 +98,29 @@ early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. 
+ */ +static inline imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} #endif #ifdef CONFIG_X86_64 @@ -1727,8 +1750,7 @@ void __init connect_bsp_APIC(void) */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); + imcr_pic_to_apic(); } #endif if (apic->enable_apic_mode) @@ -1756,8 +1778,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + imcr_apic_to_pic(); return; } #endif From 08306ce61d6848e6fbf74fa4cc693c3fb29e943f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:41 +0400 Subject: [PATCH 156/900] x86: apic - introduce dummy apic operations Impact: refactor, speed up and robustize code In case the apic was disabled by kernel option or by hardware limits we can use dummy operations in apic->write to simplify the ack_APIC_irq() code. At the same time the patch fixes the missed EOI in do_IRQ function (which takes place if the kernel is compiled as X86-32 and an interrupt without handler happens where apic was not asked to be disabled via kernel option). Note that native_apic_write_dummy() contains a WARN_ON_ONCE to catch any buggy writes on enabled APICs. Could be removed after some time of testing. 
Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.724788431@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 3 ++- arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++ arch/x86/kernel/irq.c | 10 ++-------- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f837742..2bd5a463fd1 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -212,6 +212,7 @@ static inline void ack_x2APIC_irq(void) } #endif +extern void apic_disable(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC(void); @@ -252,7 +253,7 @@ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } - +static inline void apic_disable(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c3be10f5773..9b849d4957d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -232,6 +232,24 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +void native_apic_write_dummy(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +/* + * right after this call apic->write doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ + apic->write = native_apic_write_dummy; +} + void native_apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -1582,6 +1600,12 @@ void __init init_apic_mappings(void) */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = read_apic_id(); + + /* lets check if we may to NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + 
apic_disable(); + } } /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6603492e8b7..fd57bf35d0f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -27,7 +27,6 @@ void ack_bad_irq(unsigned int irq) if (printk_ratelimit()) pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N @@ -37,9 +36,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -214,10 +211,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", From b9b34f24b23ba9e79e07c0980e7fff16af2a67d1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:42 +0400 Subject: [PATCH 157/900] x86: smp.c - align smp_ops assignments Impact: cleanup It's a bit hard to parse by eyes without them being aligned. 
Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.924175574@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8cca..f6db48c405b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) } struct smp_ops smp_ops = { - .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, - .smp_prepare_cpus = native_smp_prepare_cpus, - .smp_cpus_done = native_smp_cpus_done, + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, - .smp_send_reschedule = native_smp_send_reschedule, + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, - .cpu_disable = native_cpu_disable, - .play_dead = native_play_dead, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, - .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); From 5cda395f4a262788d8ed79ac8a26a2b821e5f751 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 13 Apr 2009 17:39:24 +0200 Subject: [PATCH 158/900] x86: fix function definitions after: x86: apic - introduce imcr_ helpers The patch "introduce imcr_ helpers" introduced good comments, but also a few new compile warnings. This fixes the function definitions to have a 'void' return type. 
Signed-off-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <20090413153924.GA20287@mailshack.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9b849d4957d..4b48ff9163c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -106,7 +106,7 @@ static int enabled_via_apicbase; * the BIOS or the operating system must switch out of * PIC Mode by changing the IMCR. */ -static inline imcr_pic_to_apic(void) +static inline void imcr_pic_to_apic(void) { /* select IMCR register */ outb(0x70, 0x22); @@ -114,7 +114,7 @@ static inline imcr_pic_to_apic(void) outb(0x01, 0x23); } -static inline imcr_apic_to_pic(void) +static inline void imcr_apic_to_pic(void) { /* select IMCR register */ outb(0x70, 0x22); From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:48:49 -0500 Subject: [PATCH 159/900] tracing/filters: add run-time field descriptions to TRACE_EVENT_FORMAT events This patch adds run-time field descriptions to all the event formats exported using TRACE_EVENT_FORMAT. It also hooks up all the tracers that use them (i.e. the tracers in the 'ftrace subsystem') so they can also have their output filtered by the event-filtering mechanism. When I was testing this, there were a couple of things that fooled me into thinking the filters weren't working, when actually they were - I'll mention them here so others don't make the same mistakes (and file bug reports. ;-) One is that some of the tracers trace multiple events e.g. 
the sched_switch tracer uses the context_switch and wakeup events, and if you don't set filters on all of the traced events, the unfiltered output from the events without filters on them can make it look like the filtering as a whole isn't working properly, when actually it is doing what it was asked to do - it just wasn't asked to do the right thing. The other is that for the really high-volume tracers e.g. the function tracer, the volume of filtered events can be so high that it pushes the unfiltered events out of the ring buffer before they can be read so e.g. cat'ing the trace file repeatedly shows either no output, or once in awhile some output but that isn't there the next time you read the trace, which isn't what you normally expect when reading the trace file. If you read from the trace_pipe file though, you can catch them before they disappear. Changes from v1: As suggested by Frederic Weisbecker: - get rid of externs in functions - added unlikely() to filter_check_discard() Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 6 +++ kernel/trace/trace.c | 25 ++++++++++++ kernel/trace/trace.h | 20 ++++++++++ kernel/trace/trace_branch.c | 3 ++ kernel/trace/trace_event_types.h | 6 ++- kernel/trace/trace_events.c | 7 ++++ kernel/trace/trace_events_filter.c | 4 +- kernel/trace/trace_events_stage_2.h | 7 ---- kernel/trace/trace_export.c | 59 +++++++++++++++++++++++++++-- kernel/trace/trace_hw_branches.c | 2 + kernel/trace/trace_power.c | 4 ++ 11 files changed, 128 insertions(+), 15 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 7a0aa0e260d..9419ad10541 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { + struct ftrace_event_call *call = &event_kmem_alloc; struct trace_array *tr = kmemtrace_array; struct kmemtrace_alloc_entry *entry; 
struct ring_buffer_event *event; @@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { + struct ftrace_event_call *call = &event_kmem_free; struct trace_array *tr = kmemtrace_array; struct kmemtrace_free_entry *entry; struct ring_buffer_event *event; @@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4865459f609..962e6179994 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -898,6 +898,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_function; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -912,6 +913,9 @@ trace_function(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; + + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); } @@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; @@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); return 1; @@ -943,6 +949,7 @@ static void 
__trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; @@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, int skip, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } @@ -1080,6 +1094,7 @@ 
tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_context_switch; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); + + filter_check_discard(call, entry, event); + trace_buffer_unlock_commit(tr, event, flags, pc); } @@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: @@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1314,6 
+1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 34b94c3f40a..e7737281953 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +static inline void +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + if (unlikely(call->preds) && !filter_match_preds(call, rec)) + ring_buffer_event_discard(event); +} + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -897,4 +912,9 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + extern struct ftrace_event_call event_##call; +#include "trace_event_types.h" + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e6e32912ffb..c95c25d838e 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { + struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; @@ -73,6 +74,8 @@ probe_likely_condition(struct 
ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); out: diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee71dd..95b147aac22 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, + TRACE_FUNC_SIZE+1, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, + TRACE_FUNC_SIZE+1, file) TRACE_FIELD(char, correct, correct) ), TP_RAW_FMT("%u:%s:%s (%u)") diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64ec4d278ff..be9299a53e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,6 +680,7 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; + entry = debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 026be412f35..470ad9487ec 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct 
event_subsystem *system) } events_for_each(call) { - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (!strcmp(call->system, system->name)) @@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, events_for_each(call) { int err; - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (strcmp(call->system, system->name)) diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 30743f7d411..1c94b87c718 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c33ebf..f4e46616c48 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -30,7 +30,7 @@ #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_ENTRY entry #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int ftrace_define_fields_##call(void); \ +static int ftrace_raw_init_event_##call(void); \ \ -static struct ftrace_event_call __used \ +struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ 
__attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ .show_format = ftrace_format_##call, \ -} + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + return 0; \ +} \ + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_event_call *event_call = &event_##call; \ + struct args field; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + #include "trace_event_types.h" diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..e6b275b22ac 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) void trace_hw_branch(u64 from, u64 to) { + struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; @@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 
to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bae791ebcc5..8ce7d7d62c0 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type, static void probe_power_end(struct power_trace *it) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it) static void probe_power_mark(struct power_trace *it, unsigned int type, unsigned int level) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:49:16 -0500 Subject: [PATCH 160/900] tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER event macro Frederic Weisbecker suggested that the trace_special event shouldn't be filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro that allows an event format to be exported without having a filter attached, and removes filtering from the trace_special event. 
Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 2 ++ kernel/trace/trace_event_types.h | 2 +- kernel/trace/trace_export.c | 33 ++++++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 962e6179994..c209d214169 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { - struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e7737281953..3cf856fa597 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -915,6 +915,8 @@ do { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) #include "trace_event_types.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 95b147aac22..cfcecc4fd86 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, arg1, arg1) TRACE_FIELD(unsigned long, arg2, arg2) diff --git 
a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f4e46616c48..77c494f5e1d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + #include "trace_event_types.h" #undef TRACE_ZERO_CHAR @@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void) \ return 0; \ } \ +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .show_format = ftrace_format_##call, \ +}; + #include "trace_event_types.h" #undef TRACE_FIELD @@ -150,4 +179,8 @@ ftrace_define_fields_##call(void) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) + #include "trace_event_types.h" From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 00:09:41 -0400 Subject: [PATCH 161/900] ring-buffer: add ring_buffer_discard_commit The ring_buffer_discard_commit is similar to ring_buffer_event_discard but it can only be done on an event that has yet to be committed. Unpredictable results can happen otherwise. The main difference between ring_buffer_discard_commit and ring_buffer_event_discard is that ring_buffer_discard_commit will try to free the data in the ring buffer if nothing has added data after the reserved event.
If something did, then it acts almost the same as ring_buffer_event_discard followed by a ring_buffer_unlock_commit. Note, either ring_buffer_discard_commit or ring_buffer_unlock_commit can be called on an event, not both. This commit also exports both discard functions to be usable by GPL modules. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 29 +++++++++ kernel/trace/ring_buffer.c | 125 ++++++++++++++++++++++++++++++------ 2 files changed, 133 insertions(+), 21 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e1b7b217388..f0aa486d131 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -68,8 +68,37 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +/* + * ring_buffer_event_discard can discard any event in the ring buffer. + * it is up to the caller to protect against a reader from + * consuming it or a writer from wrapping and replacing it. + * + * No external protection is needed if this is called before + * the event is commited. But in that case it would be better to + * use ring_buffer_discard_commit. + * + * Note, if an event that has not been committed is discarded + * with ring_buffer_event_discard, it must still be committed. + */ void ring_buffer_event_discard(struct ring_buffer_event *event); +/* + * ring_buffer_discard_commit will remove an event that has not + * ben committed yet. If this is used, then ring_buffer_unlock_commit + * must not be called on the discarded event. This function + * will try to remove the event from the ring buffer completely + * if another event has not been written after it. + * + * Example use: + * + * if (some_condition) + * ring_buffer_discard_commit(buffer, event); + * else + * ring_buffer_unlock_commit(buffer, event); + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer.
*/ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 74a11808c28..f935bd5ec3e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event) event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { @@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. 
+ */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + int cpu; + + /* The event is discarded regardless */ + ring_buffer_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !preempt_count()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. 
+ */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + goto out; + } + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + cpu_buffer->entries++; + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 01:16:59 -0400 Subject: [PATCH 162/900] tracing/filters: use ring_buffer_discard_commit for discarded events The ring_buffer_discard_commit makes better usage of the ring_buffer when an event has been discarded. It tries to remove it completely if possible. This patch converts the trace event filtering to use ring_buffer_discard_commit instead of the ring_buffer_event_discard. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 +++++++-- kernel/trace/trace.h | 1 + kernel/trace/trace_events_stage_3.h | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c209d214169..d880ab2772c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +} + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(global_trace.buffer, event); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3cf856fa597..dfefffd7ae3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9d2fa78cecc..d2f34bf30e5 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ if (call->preds && !filter_match_preds(call, entry)) \ - 
ring_buffer_event_discard(event); \ - \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + trace_current_buffer_discard_commit(event); \ + else \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ } \ \ From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:14:01 -0500 Subject: [PATCH 163/900] tracing/infrastructure: separate event tracer from event support Add a new config option, CONFIG_EVENT_TRACING that gets selected when CONFIG_TRACING is selected and adds everything needed by the stuff in trace_export - basically all the event tracing support needed by e.g. bprint, minus the actual events, which are only included if CONFIG_EVENT_TRACER is selected. So CONFIG_EVENT_TRACER can be used to turn on or off the generated events (what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns on or off the base event tracing support used by both the event tracer and the other things such as bprint that can't be configured out. 
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178441.10295.34.camel@tropicana> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 2 +- kernel/trace/Kconfig | 4 ++++ kernel/trace/Makefile | 6 +++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7fa660fd449..7e9b1e9f711 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -61,7 +61,7 @@ #define BRANCH_PROFILE() #endif -#ifdef CONFIG_EVENT_TRACER +#ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 23b96ebbf89..644606e899f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER depends on HAVE_FTRACE_NMI_ENTER default y +config EVENT_TRACING + bool + config TRACING bool select DEBUG_FS @@ -56,6 +59,7 @@ config TRACING select TRACEPOINTS select NOP_TRACER select BINARY_PRINTF + select EVENT_TRACING # # Minimum requirements an architecture has to meet for us to diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..3ad367e7c97 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += 
trace_events_filter.o libftrace-y := ftrace.o From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:15:54 -0500 Subject: [PATCH 164/900] tracing/filters: use ring_buffer_discard_commit() in filter_check_discard() This patch changes filter_check_discard() to make use of the new ring_buffer_discard_commit() function and modifies the current users to call the old commit function in the non-discard case. It also introduces a version of filter_check_discard() that uses the global trace buffer (filter_current_check_discard()) for those cases. v2 changes: - fix compile error noticed by Ingo Molnar Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178554.10295.36.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 10 +++---- kernel/trace/trace.c | 45 +++++++++++++++-------------- kernel/trace/trace.h | 14 +++++++-- kernel/trace/trace_branch.c | 5 ++-- kernel/trace/trace_events_stage_3.h | 5 +--- kernel/trace/trace_hw_branches.c | 4 +-- kernel/trace/trace_power.c | 8 ++--- 7 files changed, 48 insertions(+), 43 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 9419ad10541..86cdf671d7e 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + 
ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d880ab2772c..c0047fcf707 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,12 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, global_trace.buffer, event); +} + cycle_t ftrace_now(int cpu) { u64 ts; @@ -919,9 +925,8 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); return 1; } @@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); 
#endif } @@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - filter_check_discard(call, entry, event); - - trace_buffer_unlock_commit(tr, event, flags, pc); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); diff --git a/kernel/trace/trace.h 
b/kernel/trace/trace.h index dfefffd7ae3..9729d14767d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); -static inline void +static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) - ring_buffer_event_discard(event); + if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } #define __common_field(type, item) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c95c25d838e..8e64e604f5a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index d2f34bf30e5..b2b298269eb 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - if (call->preds && !filter_match_preds(call, entry)) \ - trace_current_buffer_discard_commit(event); \ - else \ + if (!filter_current_check_discard(call, entry, event)) \ 
trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ } \ \ static int ftrace_raw_reg_event_##call(void) \ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e6b275b22ac..8683d50a753 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 8ce7d7d62c0..810a5b7cf1c 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 13 Apr 2009 03:17:50 -0500 Subject: [PATCH 165/900] tracing/filters: allow on-the-fly filter switching This patch allows event filters to be safely removed or switched on-the-fly while avoiding the use of rcu or the suspension of tracing of previous versions. 
It does it by adding a new filter_pred_none() predicate function which does nothing and by never deallocating either the predicates or any of the filter_pred members used in matching; the predicate lists are allocated and initialized during ftrace_event_calls initialization. Whenever a filter is removed or replaced, the filter_pred_* functions currently in use by the affected ftrace_event_call are immediately switched over to the filter_pred_none() function, while the rest of the filter_pred members are left intact, allowing any currently executing filter_pred_* functions to finish up, using the values they're currently using. In the case of filter replacement, the new predicate values are copied into the old predicates after the above step, and the filter_pred_none() functions are replaced by the filter_pred_* functions for the new filter. In this case, it is possible though very unlikely that a previous filter_pred_* is still running even after the filter_pred_none() switch and the switch to the new filter_pred_*. In that case, however, because nothing has been deallocated in the filter_pred, the worst that can happen is that the old filter_pred_* function sees the new values and as a result produces either a false positive or a false negative, depending on the values it finds. So one downside to this method is that rarely, it can produce a bad match during the filter switch, but it should be possible to live with that, IMHO. The other downside is that at least in this patch the predicate lists are always pre-allocated, taking up memory from the start. They could probably be allocated on first-use, and de-allocated when tracing is completely stopped - if this patch makes sense, I could create another one to do that later on. Oh, and it also places a restriction on the size of __arrays in events, currently set to 128, since they can't be larger than the now embedded str_val arrays in the filter_pred struct. 
Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239610670.6660.49.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 14 +- kernel/trace/trace_events.c | 9 +- kernel/trace/trace_events_filter.c | 254 +++++++++++++++------------- kernel/trace/trace_events_stage_2.h | 1 + kernel/trace/trace_events_stage_3.h | 1 + kernel/trace/trace_export.c | 1 + 6 files changed, 151 insertions(+), 129 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9729d14767d..b05b6ac982a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -813,6 +813,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + int n_preds; struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE @@ -826,6 +827,7 @@ struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; + int n_preds; struct filter_pred **preds; }; @@ -834,7 +836,8 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 struct filter_pred; @@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); struct filter_pred { filter_pred_fn_t fn; u64 val; - char *str_val; + char str_val[MAX_FILTER_STR_VAL]; int str_len; char *field_name; int offset; @@ -855,13 +858,14 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct 
ftrace_event_call *call, struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); +extern void filter_disable_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, @@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 789e14eb09a..ead68ac9919 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, s); + filter_print_preds(call->preds, call->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (pred->clear) { - filter_free_preds(call); + filter_disable_preds(call); filter_free_pred(pred); return cnt; } @@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return err; } + filter_free_pred(pred); + *ppos += cnt; return cnt; @@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, s); + filter_print_preds(system->preds, system->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); system->preds = NULL; + 
system->n_preds = 0; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9f8ecca34a5..de42dad42a8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } +static int filter_pred_none(struct filter_pred *pred, void *event) +{ + return 0; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) { - pred = call->preds[i]; - if (and_failed && !pred->or) - continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; - } else - break; + for (i = 0; i < call->n_preds; i++) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; } if (and_failed) @@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +void filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; int i; - if (!preds) { + if (!n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (preds[i]) { - pred = preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? 
"!= " : "== "); - if (pred->str_val) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } else - break; + for (i = 0; i < n_preds; i++) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); + if (pred->str_len) + trace_seq_printf(s, "%s\n", pred->str_val); + else + trace_seq_printf(s, "%llu\n", pred->val); } } @@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred) return; kfree(pred->field_name); - kfree(pred->str_val); kfree(pred); } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred) +{ + kfree(pred->field_name); + pred->field_name = NULL; + pred->str_len = 0; +} + +static int filter_set_pred(struct filter_pred *dest, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + *dest = *src; + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + dest->fn = fn; + + return 0; +} + +void filter_disable_preds(struct ftrace_event_call *call) { int i; - if (call->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) - filter_free_pred(call->preds[i]); - kfree(call->preds); - call->preds = NULL; + call->n_preds = 0; + + for (i = 0; i < MAX_FILTER_PRED; i++) + call->preds[i]->fn = filter_pred_none; +} + +int init_preds(struct ftrace_event_call *call) +{ + struct filter_pred *pred; + int i; + + call->n_preds = 0; + + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + goto oom; + pred->fn = filter_pred_none; + call->preds[i] = pred; } + + return 0; + +oom: + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) + filter_free_pred(call->preds[i]); + } + kfree(call->preds); + call->preds = NULL; + + return 
-ENOMEM; } void filter_free_subsystem_preds(struct event_subsystem *system) @@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call = __start_ftrace_events; int i; - if (system->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) + if (system->n_preds) { + for (i = 0; i < system->n_preds; i++) filter_free_pred(system->preds[i]); kfree(system->preds); system->preds = NULL; + system->n_preds = 0; } events_for_each(call) { @@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_free_preds(call); + filter_disable_preds(call); } } static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + filter_pred_fn_t fn) { - int i; + int idx, err; - if (call->preds && !pred->compound) - filter_free_preds(call); + if (call->n_preds && !pred->compound) + filter_disable_preds(call); - if (!call->preds) { - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!call->preds) - return -ENOMEM; - } + if (call->n_preds == MAX_FILTER_PRED) + return -ENOSPC; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!call->preds[i]) { - call->preds[i] = pred; - return 0; - } - } + idx = call->n_preds; + filter_clear_pred(call->preds[idx]); + err = filter_set_pred(call->preds[idx], pred, fn); + if (err) + return err; - return -ENOSPC; + call->n_preds++; + + return 0; } static int is_string_field(const char *type) @@ -229,98 +277,66 @@ static int is_string_field(const char *type) int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { struct ftrace_event_field *field; + filter_pred_fn_t fn; field = find_event_field(call, pred->field_name); if (!field) return -EINVAL; + pred->fn = filter_pred_none; pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_val) + if (!pred->str_len) return -EINVAL; - pred->fn = filter_pred_string; 
+ fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred); + return __filter_add_pred(call, pred, fn); } else { - if (pred->str_val) + if (pred->str_len) return -EINVAL; } switch (field->size) { case 8: - pred->fn = filter_pred_64; + fn = filter_pred_64; break; case 4: - pred->fn = filter_pred_32; + fn = filter_pred_32; break; case 2: - pred->fn = filter_pred_16; + fn = filter_pred_16; break; case 1: - pred->fn = filter_pred_8; + fn = filter_pred_8; break; default: return -EINVAL; } - return __filter_add_pred(call, pred); -} - -static struct filter_pred *copy_pred(struct filter_pred *pred) -{ - struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); - if (!new_pred) - return NULL; - - memcpy(new_pred, pred, sizeof(*pred)); - - if (pred->field_name) { - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!new_pred->field_name) { - kfree(new_pred); - return NULL; - } - } - - if (pred->str_val) { - new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - if (!new_pred->str_val) { - filter_free_pred(new_pred); - return NULL; - } - } - - return new_pred; + return __filter_add_pred(call, pred, fn); } int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { struct ftrace_event_call *call = __start_ftrace_events; - struct filter_pred *event_pred; - int i; - if (system->preds && !pred->compound) + if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); - if (!system->preds) { + if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!system->preds) return -ENOMEM; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!system->preds[i]) { - system->preds[i] = pred; - break; - } - } - - if (i == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) return -ENOSPC; + system->preds[system->n_preds] = pred; + events_for_each(call) { int err; @@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem 
*system, if (!find_event_field(call, pred->field_name)) continue; - event_pred = copy_pred(pred); - if (!event_pred) - goto oom; - - err = filter_add_pred(call, event_pred); - if (err) - filter_free_pred(event_pred); - if (err == -ENOMEM) - goto oom; + err = filter_add_pred(call, pred); + if (err == -ENOMEM) { + system->preds[system->n_preds] = NULL; + return err; + } } - return 0; + system->n_preds++; -oom: - system->preds[i] = NULL; - return -ENOMEM; + return 0; } int filter_parse(char **pbuf, struct filter_pred *pred) @@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred) } } - if (!val_str) { + if (!val_str || !strlen(val_str) + || strlen(val_str) >= MAX_FILTER_STR_VAL) { pred->field_name = NULL; return -EINVAL; } @@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred) if (!pred->field_name) return -ENOMEM; + pred->str_len = 0; pred->val = simple_strtoull(val_str, &tmp, 0); if (tmp == val_str) { - pred->str_val = kstrdup(val_str, GFP_KERNEL); - if (!pred->str_val) - return -ENOMEM; + strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); + pred->str_len = strlen(val_str); + pred->str_val[pred->str_len] = '\0'; } else if (*tmp != '\0') return -EINVAL; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 02fb710193e..59cfd7dfe68 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -140,6 +140,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __array #define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item)); \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index b2b298269eb..5bb1b7ffbdb 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void) \ return -ENODEV; \ 
event_##call.id = id; \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 77c494f5e1d..48fc02fe73a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ From 7ba5c840e64d4a967379f1ae3eca73278180b11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:17 -0700 Subject: [PATCH 166/900] rcu: Add __rcu_pending tracing to hierarchical RCU Add tracing to __rcu_pending() to provide information on why RCU processing was kicked off. This is helpful for debugging hierarchical RCU, and might also be helpful in learning how hierarchical RCU operates. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <1239683479943-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutree.h | 9 +++++- kernel/rcutree.c | 25 ++++++++++++---- kernel/rcutree_trace.c | 64 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 58b2aa5312b..5a5153806c4 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -161,8 +161,15 @@ struct rcu_data { unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. 
*/ - /* 5) For future __rcu_pending statistics. */ + /* 5) __rcu_pending() statistics. */ long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rp_qs_pending; + long n_rp_cb_ready; + long n_rp_cpu_needs_gp; + long n_rp_gp_completed; + long n_rp_gp_started; + long n_rp_need_fqs; + long n_rp_need_nothing; int cpu; }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372fb0b9..0dccfbba6d2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) + if (rdp->qs_pending) { + rdp->n_rp_qs_pending++; return 1; + } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; return 1; + } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; return 1; + } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; return 1; + } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; return 1; + } /* Has an RCU GP gone long enough to send resched IPIs &c? 
*/ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + rdp->n_rp_need_fqs++; return 1; + } /* nothing to do */ + rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875ba940..fe1dcdbf1ca 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = { .release = single_release, }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = rsp->rda[cpu]; + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; + 
static int __init rcuclassic_trace_init(void) { rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void) NULL, &rcuhier_fops); if (!hierdir) goto free_out; + + rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!rcu_pendingdir) + goto free_out; return 0; free_out: if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void) debugfs_remove(datadir_csv); debugfs_remove(gpdir); debugfs_remove(hierdir); + debugfs_remove(rcu_pendingdir); debugfs_remove(rcudir); } From 6fd9b3a40b82081d9e6490b0d7cd656e9a78a134 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:18 -0700 Subject: [PATCH 167/900] rcu: Update RCU tracing documentation for __rcu_pending This patch updates the RCU documentation to reflect the changes in tracing made in the previous patch in the set. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <12396834792865-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/trace.txt | 102 ++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 22 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 068848240a8..02cced183b2 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy). 
The output of "cat rcu/rcudata" looks as follows: rcu: - 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 - 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 - 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 - 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 - 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 - 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 - 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 - 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 +rcu: + 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 + 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 + 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 + 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10 + 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10 + 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10 + 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10 + 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10 rcu_bh: - 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 - 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 - 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 - 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 - 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 
ql=0 b=10 - 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 - 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 - 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 + 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10 + 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10 + 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10 + 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 The first section lists the rcu_data structures for rcu, the second for rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. @@ -253,12 +254,6 @@ o "pqc" indicates which grace period the last-observed quiescent o "qp" indicates that RCU still expects a quiescent state from this CPU. -o "rpfq" is the number of rcu_pending() calls on this CPU required - to induce this CPU to invoke force_quiescent_state(). - -o "rp" is low-order four hex digits of the count of how many times - rcu_pending() has been invoked on this CPU. - o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the scheduler or by irq. The number after the "/" is the interrupt @@ -305,6 +300,9 @@ o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. +There is also an rcu/rcudata.csv file with the same information in +comma-separated-variable spreadsheet format. 
+ The output of "cat rcu/rcugp" looks as follows: @@ -411,3 +409,63 @@ o Each element of the form "1/1 0:127 ^0" represents one struct For example, the first entry at the lowest level shows "^0", indicating that it corresponds to bit zero in the first entry at the middle level. + + +The output of "cat rcu/rcu_pending" looks as follows: + +rcu: + 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 + 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 + 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 + 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 + 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 + 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 + 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 + 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 +rcu_bh: + 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 + 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 + 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 + 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 + 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 + 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 + 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 + 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 + +As always, this is once again split into "rcu" and "rcu_bh" portions. +The fields are as follows: + +o "np" is the number of times that __rcu_pending() has been invoked + for the corresponding flavor of RCU. + +o "qsp" is the number of times that the RCU was waiting for a + quiescent state from this CPU. + +o "cbr" is the number of times that this CPU had RCU callbacks + that had passed through a grace period, and were thus ready + to be invoked. 
+ +o "cng" is the number of times that this CPU needed another + grace period while RCU was idle. + +o "gpc" is the number of times that an old grace period had + completed, but this CPU was not yet aware of it. + +o "gps" is the number of times that a new grace period had started, + but this CPU was not yet aware of it. + +o "nf" is the number of times that this CPU suspected that the + current grace period had run for too long, and thus needed to + be forced. + + Please note that "forcing" consists of sending resched IPIs + to holdout CPUs. If that CPU really still is in an old RCU + read-side critical section, then we really do have to wait for it. + The assumption behing "forcing" is that the CPU is not still in + an old RCU read-side critical section, but has not yet responded + for some other reason. + +o "nn" is the number of times that this CPU needed nothing. Alert + readers will note that the rcu "nn" number for a given CPU very + closely matches the rcu_bh "np" number for that same CPU. This + is due to short-circuit evaluation in rcu_pending(). 
From 66aa230e437d89ca56224135f617e2d8e391a3ef Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 12:54:29 +0530 Subject: [PATCH 168/900] x86: page_types.h unification of declarations Impact: unification of declarations, cleanup Unification of declarations: moved init_memory_mapping, initmem_init and free_initmem from page_XX_types.h to page_types.h Signed-off-by: Jaswinder Singh Rajput Acked-by: Pekka Enberg Cc: Andrew Morton LKML-Reference: <1239693869.3033.31.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_32_types.h | 4 ---- arch/x86/include/asm/page_64_types.h | 6 ------ arch/x86/include/asm/page_types.h | 6 ++++++ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0f915ae649a..6f1b7331313 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); -extern void initmem_init(unsigned long, unsigned long); -extern void free_initmem(void); extern void setup_bootmem_allocator(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d38c91b7024..3f587188ae6 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -71,12 +71,6 @@ extern unsigned long __phys_addr(unsigned long); #define vmemmap ((struct page *)VMEMMAP_START) -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); - -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); -extern void free_initmem(void); - extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); diff 
--git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 826ad37006a..6473f5ccff8 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); + +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void free_initmem(void); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PAGE_DEFS_H */ From e7d43a74cb07cbc4b8e9b5e4a914816b33fb0719 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 13:18:28 +0530 Subject: [PATCH 169/900] x86: avoid multiple declaration of kstack_depth_to_print Impact: cleanup asm/stacktrace.h is more appropriate so removing other 2 declarations. Signed-off-by: Jaswinder Singh Rajput Cc: Neil Horman LKML-Reference: <1239695308.3033.34.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/dumpstack.h | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..9aa3ab26205 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -74,7 +74,6 @@ static inline int get_si_code(unsigned long condition) } extern int panic_on_unrecovered_nmi; -extern int kstack_depth_to_print; void math_error(void __user *); void math_emulate(struct math_emu_info *); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b869..81086c227ab 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct 
stack_frame { From f711f6090a81cbd396b63de90f415d33f563af9b Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:30 +0530 Subject: [PATCH 170/900] sched: Nominate idle load balancer from a semi-idle package. Currently the nomination of idle-load balancer is done by choosing the first idle cpu in the nohz.cpu_mask. This may not be power-efficient, since such an idle cpu could come from a completely idle core/package thereby preventing the whole core/package from being in a low-power state. For example, consider a quad-core dual package system. The cpu numbering need not be sequential and can be something like [0, 2, 4, 6] and [1, 3, 5, 7]. With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep as few packages/cores active as possible. But the current idle load balancer logic goes against this by choosing the first_cpu in the nohz.cpu_mask and not taking the system topology into consideration. Improve the algorithm to nominate the idle load balancer from semi-idle cores/packages thereby increasing the probability of the cores/packages being in deeper sleep states for longer duration. The algorithm is activated only when sched_mc/smt_power_savings != 0. 
Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 127 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b6..b0fefa300b4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. 
+ * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. 
Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return first_cpu(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -9051,6 +9159,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ From e790fb0ba64bfec158e1219d899cb588275d12ab Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:35 +0530 Subject: [PATCH 171/900] sched: Nominate a power-efficient ilb in select_nohz_balancer() The CPU that first goes idle becomes the idle-load-balancer and remains that until either it picks up a task or till all the CPUs of the system goes idle. Optimize this further to allow it to relinquish it's post once all it's siblings in the power-aware sched_domain go idle, thereby allowing the whole package-core to go idle. 
While relinquishing the post, nominate another idle-load balancer from a semi-idle core/package. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045535.7645.31641.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index b0fefa300b4..36d213bca47 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4414,8 +4414,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; From 6424fb38667fffbbb1b90be0ffd9a0c540db6a4b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 13 Apr 2009 23:51:46 -0700 Subject: [PATCH 172/900] x86: remove (null) in /sys kernel_page_tables Impact: cleanup %p prints out 0x000000000000000 as (null) so use %lx instead. 
Signed-off-by: Yinghai Lu LKML-Reference: <49E43282.1090607@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e7277cbcfb4..a725b7f760a 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; + int width = sizeof(unsigned long) * 2; /* * Now print the actual finished series */ - seq_printf(m, "0x%p-0x%p ", - (void *)st->start_address, - (void *)st->current_address); + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); delta = (st->current_address - st->start_address) >> 10; while (!(delta & 1023) && unit[1]) { From 7e05575c422d45f393c2d9b5900e97a30bf69bea Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 12:12:29 +0900 Subject: [PATCH 173/900] x86: calgary: remove IOMMU_DEBUG CONFIG_IOMMU_DEBUG depends on CONFIG_GART_IOMMU: config IOMMU_DEBUG bool "Enable IOMMU debugging" depends on GART_IOMMU && DEBUG_KERNEL depends on X86_64 So it's not useful to have CONFIG_IOMMU_DEBUG in Calgary IOMMU code, which does the extra checking of the bitmap space management. And Calgary uses the iommu helper for the bitmap space management now so it would be better to have the extra checking feature in the iommu helper rather than Calgary code (if necessary). 
Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: Joerg Roedel Cc: alexisb@us.ibm.com LKML-Reference: <20090414120827G.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 54 ++------------------------------ 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f..971a3bec47a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", 
- badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 08:54:16 -0400 Subject: [PATCH 174/900] tracing: consolidate trace and trace_event headers Impact: clean up Neil Horman (et. al.) criticized the way the trace events were broken up into two files. The reason for that was that ftrace needed to separate out the declarations from where the #include was used. It then dawned on me that the tracepoint.h header only needs to define the TRACE_EVENT macro if it is not already defined. The solution is simply to test if TRACE_EVENT is defined, and if it is not then the linux/tracepoint.h header can define it. 
This change consolidates all the .h and _event_types.h into the .h file. Reported-by: Neil Horman Reported-by: Theodore Tso Reported-by: Jiaying Zhang Cc: Zhaolei Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Jason Baron Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 9 +- include/trace/irq.h | 51 ++++- include/trace/irq_event_types.h | 55 ----- include/trace/kmem.h | 189 +++++++++++++++- include/trace/lockdep.h | 52 ++++- include/trace/lockdep_event_types.h | 57 ----- include/trace/sched.h | 333 ++++++++++++++++++++++++++- include/trace/sched_event_types.h | 337 ---------------------------- include/trace/skb.h | 36 ++- include/trace/skb_event_types.h | 38 ---- include/trace/trace_event_types.h | 7 - kernel/trace/events.c | 1 + kernel/trace/trace_events_stage_1.h | 4 +- kernel/trace/trace_events_stage_2.h | 8 +- kernel/trace/trace_events_stage_3.h | 4 +- 15 files changed, 663 insertions(+), 518 deletions(-) delete mode 100644 include/trace/irq_event_types.h delete mode 100644 include/trace/lockdep_event_types.h delete mode 100644 include/trace/sched_event_types.h delete mode 100644 include/trace/skb_event_types.h delete mode 100644 include/trace/trace_event_types.h diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d35a7ee7611..4353f3f7e62 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,6 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ +#ifndef DECLARE_TRACE + #define TP_PROTO(args...) args #define TP_ARGS(args...) args @@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { } #endif /* CONFIG_TRACEPOINTS */ +#endif /* DECLARE_TRACE */ /* * Connect a probe to a tracepoint. @@ -154,10 +157,13 @@ static inline void tracepoint_synchronize_unregister(void) } #define PARAMS(args...) 
args + +#ifndef TRACE_FORMAT #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif - +#ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: * @@ -262,5 +268,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif #endif diff --git a/include/trace/irq.h b/include/trace/irq.h index ff5d4495dc3..04ab4c65222 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -1,9 +1,54 @@ -#ifndef _TRACE_IRQ_H +#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H -#include #include +#include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name) + ); + +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + + TP_PROTO(int irq, struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? 
"handled" : "unhandled") +); + +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h deleted file mode 100644 index 85964ebd47e..00000000000 --- a/include/trace/irq_event_types.h +++ /dev/null @@ -1,55 +0,0 @@ - -/* use instead */ -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - -/* - * Tracepoint for entry of interrupt handler: - */ -TRACE_FORMAT(irq_handler_entry, - TP_PROTO(int irq, struct irqaction *action), - TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); - -/* - * Tracepoint for return of an interrupt handler: - */ -TRACE_EVENT(irq_handler_exit, - - TP_PROTO(int irq, struct irqaction *action, int ret), - - TP_ARGS(irq, action, ret), - - TP_STRUCT__entry( - __field( int, irq ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->irq = irq; - __entry->ret = ret; - ), - - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? 
"handled" : "unhandled") -); - -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -TRACE_FORMAT(softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -#undef TRACE_SYSTEM diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 46efc2423f0..d7d12189e5c 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,9 +1,192 @@ -#ifndef _TRACE_KMEM_H +#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem -#endif /* _TRACE_KMEM_H */ +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + 
), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x 
node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#endif diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 5ca67df87f2..8ee7900b38c 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -1,9 +1,57 @@ -#ifndef _TRACE_LOCKDEP_H +#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LOCKDEP_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", + read ? 
"read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) + ); + +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif +#endif + +#endif /* _TRACE_LOCKDEP_H */ diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h deleted file mode 100644 index 863f1e4583a..00000000000 --- a/include/trace/lockdep_event_types.h +++ /dev/null @@ -1,57 +0,0 @@ - -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lock - -#ifdef CONFIG_LOCKDEP - -TRACE_FORMAT(lock_acquire, - TP_PROTO(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? 
"read " : "", lock->name) - ); - -TRACE_FORMAT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); - -#ifdef CONFIG_LOCK_STAT - -TRACE_FORMAT(lock_contended, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); - -TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), - - TP_ARGS(lock, ip, waittime), - - TP_STRUCT__entry( - __field(const char *, name) - __field(unsigned long, wait_usec) - __field(unsigned long, wait_nsec_rem) - ), - TP_fast_assign( - __entry->name = lock->name; - __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); - __entry->wait_usec = (unsigned long) waittime; - ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, - __entry->wait_nsec_rem) -); - -#endif -#endif - -#undef TRACE_SYSTEM diff --git a/include/trace/sched.h b/include/trace/sched.h index 4e372a1a29b..5b1cf4a2846 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -1,9 +1,336 @@ -#ifndef _TRACE_SCHED_H +#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched -#endif +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret %d", __entry->ret) +); + 
+/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. 
) + */ +TRACE_EVENT(sched_wakeup_new, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + + TP_PROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + + TP_ARGS(rq, prev, next), + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) + ), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) +); + +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + + TP_ARGS(p, orig_cpu, dest_cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + 
__entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + + TP_PROTO(struct pid *pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + + TP_PROTO(struct task_struct *parent, struct task_struct *child), + + TP_ARGS(parent, child), + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) + ), + + TP_fast_assign( + 
memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + + TP_PROTO(int sig, struct task_struct *p), + + TP_ARGS(sig, p), + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SCHED_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h deleted file mode 100644 index 63547dc1125..00000000000 --- a/include/trace/sched_event_types.h +++ /dev/null @@ -1,337 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. 
-#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - -/* - * Tracepoint for calling kthread_stop, performed to end a kthread: - */ -TRACE_EVENT(sched_kthread_stop, - - TP_PROTO(struct task_struct *t), - - TP_ARGS(t), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); - __entry->pid = t->pid; - ), - - TP_printk("task %s:%d", __entry->comm, __entry->pid) -); - -/* - * Tracepoint for the return value of the kthread stopping: - */ -TRACE_EVENT(sched_kthread_stop_ret, - - TP_PROTO(int ret), - - TP_ARGS(ret), - - TP_STRUCT__entry( - __field( int, ret ) - ), - - TP_fast_assign( - __entry->ret = ret; - ), - - TP_printk("ret %d", __entry->ret) -); - -/* - * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wait_task, - - TP_PROTO(struct rq *rq, struct task_struct *p), - - TP_ARGS(rq, p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_wakeup, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup_new, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_switch, - - TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - - TP_ARGS(rq, prev, next), - - TP_STRUCT__entry( - __array( char, prev_comm, TASK_COMM_LEN ) - __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __array( char, next_comm, TASK_COMM_LEN ) - __field( pid_t, next_pid ) - __field( int, next_prio ) - ), - - TP_fast_assign( - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; - ), - - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio) -); - -/* - * Tracepoint for a task being migrated: - */ -TRACE_EVENT(sched_migrate_task, - - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - - TP_ARGS(p, orig_cpu, dest_cpu), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; - __entry->dest_cpu = dest_cpu; - ), - - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu) -); - -/* - * Tracepoint for freeing a task: - */ -TRACE_EVENT(sched_process_free, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a task 
exiting: - */ -TRACE_EVENT(sched_process_exit, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a waiting task: - */ -TRACE_EVENT(sched_process_wait, - - TP_PROTO(struct pid *pid), - - TP_ARGS(pid), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - __entry->pid = pid_nr(pid); - __entry->prio = current->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for do_fork: - */ -TRACE_EVENT(sched_process_fork, - - TP_PROTO(struct task_struct *parent, struct task_struct *child), - - TP_ARGS(parent, child), - - TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) - ), - - TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); - __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); - __entry->child_pid = child->pid; - ), - - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid) -); - -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig: %d task 
%s:%d", - __entry->sig, __entry->comm, __entry->pid) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/skb.h b/include/trace/skb.h index d2de7174a6e..e6fd281f7f8 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -1,9 +1,37 @@ -#ifndef _TRACE_SKB_H_ -#define _TRACE_SKB_H_ +#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SKB_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb -#endif +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#endif /* _TRACE_SKB_H */ diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h deleted file mode 100644 index 4a1c504c0e1..00000000000 --- a/include/trace/skb_event_types.h +++ /dev/null @@ -1,38 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. 
-#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - -/* - * Tracepoint for free an sk_buff: - */ -TRACE_EVENT(kfree_skb, - - TP_PROTO(struct sk_buff *skb, void *location), - - TP_ARGS(skb, location), - - TP_STRUCT__entry( - __field( void *, skbaddr ) - __field( unsigned short, protocol ) - __field( void *, location ) - ), - - TP_fast_assign( - __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } - __entry->location = location; - ), - - TP_printk("skbaddr=%p protocol=%u location=%p", - __entry->skbaddr, __entry->protocol, __entry->location) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h deleted file mode 100644 index 552a50e169a..00000000000 --- a/include/trace/trace_event_types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/_event_types.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 246f2aa6dc4..5a35a914f0e 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,7 @@ #include "trace_output.h" +#define TRACE_HEADER_MULTI_READ #include "trace_events_stage_1.h" #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 38985f9b379..475f46a047a 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -1,7 +1,7 @@ /* * Stage 1 of the trace events. 
* - * Override the macros in to include the following: + * Override the macros in to include the following: * * struct ftrace_raw_ { * struct trace_entry ent; @@ -36,4 +36,4 @@ }; \ static struct ftrace_event_call event_##name -#include +#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 59cfd7dfe68..aa4a67a0656 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -1,7 +1,7 @@ /* * Stage 2 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * enum print_line_t * ftrace_raw_output_(struct trace_iterator *iter, int flags) @@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return TRACE_TYPE_HANDLED; \ } -#include +#include /* * Setup the showing format of trace point. @@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } -#include +#include #undef __field #define __field(type, item) \ @@ -167,4 +167,4 @@ ftrace_define_fields_##call(void) \ return ret; \ } -#include +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 5bb1b7ffbdb..45c04e1f38d 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -1,7 +1,7 @@ /* * Stage 3 of the trace events. 
* - * Override the macros in to include the following: + * Override the macros in to include the following: * * static void ftrace_event_(proto) * { @@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include +#include #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 Apr 2009 16:53:05 +0200 Subject: [PATCH 175/900] wait: don't use __wake_up_common() '777c6c5 wait: prevent exclusive waiter starvation' made __wake_up_common() global to be used from abort_exclusive_wait(). It was needed to do a wake-up with the waitqueue lock held while passing down a key to the wake-up function. Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()' there is an appropriate wrapper for this case: __wake_up_locked_key(). Use it here and make __wake_up_common() private to the scheduler again. Signed-off-by: Johannes Weiner Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 2 -- kernel/sched.c | 2 +- kernel/wait.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d631c17eae..67d4e89b62d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, list_del(&old->task_list); } -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, diff --git a/kernel/sched.c b/kernel/sched.c index 36d213bca47..92b4b56ad09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ 
-5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Apr 2009 11:24:36 +0200 Subject: [PATCH 176/900] tracing: make the trace clocks available generally Jeremy Fitzhardinge reported this build failure: LD .tmp_vmlinux1 arch/x86/kernel/built-in.o: In function `ds_take_timestamp': git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y. Expose the trace clock code to CONFIG_X86_DS as well. [ Unfortunately librarizing doesn't work well - ancient architectures with no raw_local_irq_save() primitive break the build.
] Reported-by: Jeremy Fitzhardinge LKML-Reference: <49E4413F.7070700@goop.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/Makefile | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index bab1dffe37e..c8e1be5f0b0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..ecc671e9f14 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o -obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o From 72c6a9870f901045f2464c3dc6ee8914bfdc07aa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 17:33:57 +0200 Subject: [PATCH 177/900] rculist.h: introduce list_entry_rcu() and list_first_entry_rcu() I've run into the situation where I need to use list_first_entry with rcu-guarded list. This patch introduces this. Also simplify list_for_each_entry_rcu() to use new list_entry_rcu() instead of list_entry(). Signed-off-by: Jiri Pirko Reviewed-by: Paul E. 
McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414153356.GC3999@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e649bd3f2c9..5710f43bbc9 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list, at->prev = last; } +/** + * list_entry_rcu - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_entry_rcu(ptr, type, member) \ + container_of(rcu_dereference(ptr), type, member) + +/** + * list_first_entry_rcu - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_first_entry_rcu(ptr, type, member) \ + list_entry_rcu((ptr)->next, type, member) + #define __list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ pos != (head); \ @@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). 
*/ #define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \ + for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member)) + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 09:36:00 -0400 Subject: [PATCH 178/900] tracing: create automated trace defines This patch lowers the number of places a developer must modify to add new tracepoints. The current method to add a new tracepoint into an existing system is to write the trace point macro in the trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or DECLARE_TRACE, then they must add the same named item into the C file with the macro DEFINE_TRACE(name) and then add the trace point. This change cuts out the need to add the DEFINE_TRACE(name). Every file that uses the tracepoint must still include the trace/<name>.h file, but the one C file must also add a define before the including of that file. #define CREATE_TRACE_POINTS #include <trace/mytrace.h> This will cause the trace/mytrace.h file to also produce the C code necessary to implement the trace point. Note, if more than one trace/<name>.h is used to create the C code it is best to list them all together. #define CREATE_TRACE_POINTS #include <trace/foo.h> #include <trace/bar.h> #include <trace/fido.h> Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with the cleaner solution of the define above the includes over my first design to have the C code include a "special" header. This patch converts sched, irq and lockdep and skb to use this new method.
Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 75 ++++++++++++++++++++++++++++++++++++ include/trace/irq.h | 5 ++- include/trace/kmem.h | 4 +- include/trace/lockdep.h | 3 ++ include/trace/sched.h | 3 ++ include/trace/skb.h | 3 ++ kernel/exit.c | 4 -- kernel/fork.c | 2 - kernel/irq/handle.c | 7 ++-- kernel/kthread.c | 3 -- kernel/lockdep.c | 12 ++---- kernel/sched.c | 10 ++--- kernel/signal.c | 2 - kernel/softirq.c | 3 -- mm/util.c | 11 ++---- net/core/net-traces.c | 4 +- 16 files changed, 105 insertions(+), 46 deletions(-) create mode 100644 include/trace/define_trace.h diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h new file mode 100644 index 00000000000..de9dc7d8508 --- /dev/null +++ b/include/trace/define_trace.h @@ -0,0 +1,75 @@ +/* + * Trace files that want to automate creationg of all tracepoints defined + * in their file should include this file. The following are macros that the + * trace file may define: + * + * TRACE_SYSTEM defines the system the tracepoint is for + * + * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h + * This macro may be defined to tell define_trace.h what file to include. + * Note, leave off the ".h". + * + * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace + * then this macro can define the path to use. Note, the path is relative to + * define_trace.h, not the file including it. Full path names for out of tree + * modules must be used. 
+ */ + +#ifdef CREATE_TRACE_POINTS + +/* Prevent recursion */ +#undef CREATE_TRACE_POINTS + +#include + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ + DEFINE_TRACE(name) + +#undef TRACE_FORMAT +#define TRACE_FORMAT(name, proto, args, print) \ + DEFINE_TRACE(name) + +#undef DECLARE_TRACE +#define DECLARE_TRACE(name, proto, args) \ + DEFINE_TRACE(name) + +#undef TRACE_INCLUDE +#undef __TRACE_INCLUDE + +#ifndef TRACE_INCLUDE_FILE +# define TRACE_INCLUDE_FILE TRACE_SYSTEM +# define UNDEF_TRACE_INCLUDE_FILE +#endif + +#ifndef TRACE_INCLUDE_PATH +# define __TRACE_INCLUDE(system) +# define UNDEF_TRACE_INCLUDE_FILE +#else +# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) +#endif + +# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system) + +/* Let the trace headers be reread */ +#define TRACE_HEADER_MULTI_READ + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TRACE_HEADER_MULTI_READ + +/* Only undef what we defined in this file */ +#ifdef UNDEF_TRACE_INCLUDE_FILE +# undef TRACE_INCLUDE_PATH +# undef UNDEF_TRACE_INCLUDE_FILE +#endif + +#ifdef UNDEF_TRACE_INCLUDE_FILE +# undef TRACE_INCLUDE_PATH +# undef UNDEF_TRACE_INCLUDE_FILE +#endif + +/* We may be processing more files */ +#define CREATE_TRACE_POINTS + +#endif /* CREATE_TRACE_POINTS */ diff --git a/include/trace/irq.h b/include/trace/irq.h index 04ab4c65222..75e3468e449 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -51,4 +51,7 @@ TRACE_FORMAT(softirq_exit, TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) ); -#endif +#endif /* _TRACE_IRQ_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/kmem.h b/include/trace/kmem.h index d7d12189e5c..c22c42f980b 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -188,5 +188,7 @@ TRACE_EVENT(kmem_cache_free, TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) ); +#endif /* _TRACE_KMEM_H */ -#endif +/* This part must 
be outside protection */ +#include diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 8ee7900b38c..4d301e758de 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -55,3 +55,6 @@ TRACE_EVENT(lock_acquired, #endif #endif /* _TRACE_LOCKDEP_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/sched.h b/include/trace/sched.h index 5b1cf4a2846..ffa1cab586b 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -334,3 +334,6 @@ TRACE_EVENT(sched_signal_send, ); #endif /* _TRACE_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/skb.h b/include/trace/skb.h index e6fd281f7f8..1e8fabb57c0 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -35,3 +35,6 @@ TRACE_EVENT(kfree_skb, ); #endif /* _TRACE_SKB_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c..2fe9d2c7eee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,10 +56,6 @@ #include #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..4bebf263923 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -83,8 +83,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -DEFINE_TRACE(sched_process_fork); - int nr_processes(void) { int cpu; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..983d8be8dff 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,9 +17,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + #include "internals.h" /* @@ -348,9 +350,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function 
available.", irq, action->name); } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); - /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf8519ab..e1c76924545 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); - struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c4582a6ea95..257f21a76c5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@ #include #include #include -#include #include #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -2929,8 +2931,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); - /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2957,8 +2957,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); - void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { @@ -3061,8 +3059,6 @@ found_it: put_lock_stats(stats); } -DEFINE_TRACE(lock_acquired); - static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3118,8 +3114,6 @@ found_it: lock->ip = ip; } -DEFINE_TRACE(lock_contended); - void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b6..e6d4518d47e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,13 +72,15 @@ #include #include #include -#include #include #include #include 
"sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..1d5703ff003 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,8 +41,6 @@ static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); - static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2fecefacdc5..a2d9b458ac2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -186,9 +186,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); - asmlinkage void __do_softirq(void) { struct softirq_action *h; diff --git a/mm/util.c b/mm/util.c index 2599e83eea1..0e74a22791c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,9 +4,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -239,13 +241,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, EXPORT_SYMBOL_GPL(get_user_pages_fast); /* Tracepoints definitions. 
*/ -DEFINE_TRACE(kmalloc); -DEFINE_TRACE(kmem_cache_alloc); -DEFINE_TRACE(kmalloc_node); -DEFINE_TRACE(kmem_cache_alloc_node); -DEFINE_TRACE(kfree); -DEFINE_TRACE(kmem_cache_free); - EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb45665e4..80177205947 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -19,11 +19,11 @@ #include #include #include -#include #include #include +#define CREATE_TRACE_POINTS +#include -DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 11 Apr 2009 12:59:57 -0400 Subject: [PATCH 179/900] tracing: make trace_seq operations available for core kernel In the process to make TRACE_EVENT macro work for modules, the trace_seq operations must be available for core kernel code. These operations are quite useful and can be used for other implementations. The main idea is that we create a trace_seq handle that acts very much like the seq_file handle. struct trace_seq *s = kmalloc(sizeof(*s), GFP_KERNEL); trace_seq_init(s); trace_seq_printf(s, "some data %d\n", variable); printk("%s", s->buffer); The main use is to allow a top level function to call several other functions that may store printf like data into the buffer. Then at the end, the top level function can process all the data with any method it would like to.
It could be passed to userspace, output via printk or even use seq_file: trace_seq_to_user(s, ubuf, cnt); seq_puts(m, s->buffer); Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 89 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 15 +------ kernel/trace/trace_output.h | 16 +------ 3 files changed, 92 insertions(+), 28 deletions(-) create mode 100644 include/linux/trace_seq.h diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h new file mode 100644 index 00000000000..28051da876d --- /dev/null +++ b/include/linux/trace_seq.h @@ -0,0 +1,89 @@ +#ifndef _LINUX_TRACE_SEQ_H +#define _LINUX_TRACE_SEQ_H + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE. + */ + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +static inline void +trace_seq_init(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +/* + * Currently only defined when tracing is enabled. + */ +#ifdef CONFIG_TRACING +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); + +#else /* CONFIG_TRACING */ +static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+ __attribute__ ((format (printf, 2, 3))) +{ + return 0; +} +static inline int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + return 0; +} + +static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ +} +static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt) +{ + return 0; +} +static inline int trace_seq_puts(struct trace_seq *s, const char *str) +{ + return 0; +} +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +{ + return 0; +} +static inline int +trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) +{ + return 0; +} +static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len) +{ + return 0; +} +static inline void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + return NULL; +} +static inline int trace_seq_path(struct trace_seq *s, struct path *path) +{ + return 0; +} +#endif /* CONFIG_TRACING */ + +#endif /* _LINUX_TRACE_SEQ_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b05b6ac982a..1882846b738 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -12,6 +12,8 @@ #include #include +#include + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -423,19 +425,6 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 91630217fb4..5c7cbfb65c7 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,6 +1,7 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include #include "trace.h" typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, @@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum 
print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); - -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 11:20:49 -0400 Subject: [PATCH 180/900] tracing/events: move declarations from trace directory to core include In preparation to allowing trace events to happen in modules, we need to move some of the local declarations in the kernel/trace directory into include/linux. This patch simply moves the declarations and performs no context changes. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 146 +++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 120 +--------------------------- kernel/trace/trace_output.h | 14 ---- 3 files changed, 147 insertions(+), 133 deletions(-) create mode 100644 include/linux/ftrace_event.h diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h new file mode 100644 index 00000000000..496b76d9f9d --- /dev/null +++ b/include/linux/ftrace_event.h @@ -0,0 +1,146 @@ +#ifndef _LINUX_FTRACE_EVENT_H +#define _LINUX_FTRACE_EVENT_H + +#include +#include + + +struct trace_array; +struct tracer; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char flags; + unsigned char preempt_count; + int pid; + int tgid; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + int cpu_file; + struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + u64 ts; + + unsigned long iter_flags; + loff_t pos; + long idx; + + cpumask_var_t started; +}; + + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + 
TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ +}; + + +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); + +void tracing_record_cmdline(struct task_struct *tsk); + +struct ftrace_event_call { + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; + int n_preds; + struct filter_pred **preds; + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif +}; + +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 + +extern int init_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); + +extern int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); + + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ +#define event_trace_printk(ip, fmt, args...) 
\ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1882846b738..6bcdf4af9b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -43,20 +44,6 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - /* * Function trace entry - function address and parent function addres: */ @@ -265,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. 
This is a boolean value. * The bit is the bit index that sets its value on the @@ -428,31 +404,6 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); @@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -790,28 +731,6 @@ struct ftrace_event_field { int size; }; -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int 
(*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; - int n_preds; - struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif -}; - struct event_subsystem { struct list_head list; const char *name; @@ -825,9 +744,6 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 -#define MAX_FILTER_STR_VAL 128 - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -845,9 +761,6 @@ struct filter_pred { int clear; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); @@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); -extern int filter_current_check_discard(struct ftrace_event_call *call, - void *rec, - struct ring_buffer_event *event); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; 
- -void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 5c7cbfb65c7..6e220a8e570 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,18 +4,6 @@ #include #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t @@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); 
-extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 12:25:37 -0400 Subject: [PATCH 181/900] tracing/events: move the ftrace event tracing code to core This patch moves the ftrace creation into include/trace/ftrace.h and simplifies the work of developers in adding new tracepoints. Just the act of creating the trace points in include/trace and including define_trace.h will create the events in the debugfs/tracing/events directory. This patch removes the need of include/trace/trace_events.h Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 4 + .../trace/ftrace.h | 215 +++++++++++++++++- include/trace/trace_events.h | 7 - kernel/trace/Makefile | 1 - kernel/trace/events.c | 15 -- kernel/trace/trace_events_stage_1.h | 39 ---- kernel/trace/trace_events_stage_2.h | 170 -------------- 7 files changed, 218 insertions(+), 233 deletions(-) rename kernel/trace/trace_events_stage_3.h => include/trace/ftrace.h (58%) delete mode 100644 include/trace/trace_events.h delete mode 100644 kernel/trace/events.c delete mode 100644 kernel/trace/trace_events_stage_1.h delete mode 100644 kernel/trace/trace_events_stage_2.h diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index de9dc7d8508..980eb66a6e3 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -56,6 +56,10 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_TRACER +#include +#endif + #undef TRACE_HEADER_MULTI_READ /* Only undef what we defined in this file */ diff --git a/kernel/trace/trace_events_stage_3.h b/include/trace/ftrace.h similarity index 58% rename from kernel/trace/trace_events_stage_3.h rename to include/trace/ftrace.h index 45c04e1f38d..955b967acd7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/include/trace/ftrace.h @@ -1,3 
+1,216 @@ +/* + * Stage 1 of the trace events. + * + * Override the macros in to include the following: + * + * struct ftrace_raw_ { + * struct trace_entry ent; + * ; + * []; + * [...] + * }; + * + * The is created by the __field(type, item) macro or + * the __array(type2, item2, len) macro. + * We simply do "type item;", and that will create the fields + * in the structure. + */ + +#include + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef __array +#define __array(type, item, len) type item[len]; + +#undef __field +#define __field(type, item) type item; + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 2 of the trace events. + * + * Override the macros in to include the following: + * + * enum print_line_t + * ftrace_raw_output_(struct trace_iterator *iter, int flags) + * { + * struct trace_seq *s = &iter->seq; + * struct ftrace_raw_ *field; <-- defined in stage 1 + * struct trace_entry *entry; + * int ret; + * + * entry = iter->ent; + * + * if (entry->type != event_.id) { + * WARN_ON_ONCE(1); + * return TRACE_TYPE_UNHANDLED; + * } + * + * field = (typeof(field))entry; + * + * ret = trace_seq_printf(s, "\n"); + * if (!ret) + * return TRACE_TYPE_PARTIAL_LINE; + * + * return TRACE_TYPE_HANDLED; + * } + * + * This is the method used to print the raw event to the trace + * output format. Note, this is not needed if the data is read + * in binary. + */ + +#undef __entry +#define __entry field + +#undef TP_printk +#define TP_printk(fmt, args...) 
fmt "\n", args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +enum print_line_t \ +ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ +{ \ + struct trace_seq *s = &iter->seq; \ + struct ftrace_raw_##call *field; \ + struct trace_entry *entry; \ + int ret; \ + \ + entry = iter->ent; \ + \ + if (entry->type != event_##call.id) { \ + WARN_ON_ONCE(1); \ + return TRACE_TYPE_UNHANDLED; \ + } \ + \ + field = (typeof(field))entry; \ + \ + ret = trace_seq_printf(s, #call ": " print); \ + if (!ret) \ + return TRACE_TYPE_PARTIAL_LINE; \ + \ + return TRACE_TYPE_HANDLED; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry REC + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) + +#undef TP_fast_assign +#define TP_fast_assign(args...) 
args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 3 of the trace events. 
* @@ -272,7 +485,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h deleted file mode 100644 index 13d6b85668c..00000000000 --- a/include/trace/trace_events.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3ad367e7c97..fb9d7f96489 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 5a35a914f0e..00000000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * This is the place to register all trace points as events. - */ - -#include - -#include - -#include "trace_output.h" - -#define TRACE_HEADER_MULTI_READ -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 475f46a047a..00000000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in to include the following: - * - * struct ftrace_raw_ { - * struct trace_entry ent; - * ; - * []; - * [...] - * }; - * - * The is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. 
- * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len) type item[len]; - -#undef __field -#define __field(type, item) type item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name - -#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index aa4a67a0656..00000000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Stage 2 of the trace events. - * - * Override the macros in to include the following: - * - * enum print_line_t - * ftrace_raw_output_(struct trace_iterator *iter, int flags) - * { - * struct trace_seq *s = &iter->seq; - * struct ftrace_raw_ *field; <-- defined in stage 1 - * struct trace_entry *entry; - * int ret; - * - * entry = iter->ent; - * - * if (entry->type != event_.id) { - * WARN_ON_ONCE(1); - * return TRACE_TYPE_UNHANDLED; - * } - * - * field = (typeof(field))entry; - * - * ret = trace_seq_printf(s, "\n"); - * if (!ret) - * return TRACE_TYPE_PARTIAL_LINE; - * - * return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) 
fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ -{ \ - struct trace_seq *s = &iter->seq; \ - struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - int ret; \ - \ - entry = iter->ent; \ - \ - if (entry->type != event_##call.id) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - ret = trace_seq_printf(s, #call ": " print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ -} - -#include - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) 
args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ - int ret; \ - \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ - \ - tstruct; \ - \ - return ret; \ -} - -#include From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 13:52:20 -0400 Subject: [PATCH 182/900] tracing/events: convert event call sites to use a link list Impact: makes it possible to define events in modules The events are created by reading down the section that they are linked in by the macros. But this is not scalable to modules. This patch converts the manipulations to use a global link list, and on boot up it adds the items in the section to the list. This change will allow modules to add their tracing events to the list as well. Note, this change alone does not permit modules to use the TRACE_EVENT macros, but the change is needed for them to eventually do so. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 13 +------- kernel/trace/trace_event_profile.c | 4 +-- kernel/trace/trace_events.c | 51 ++++++++++++++++++------------ kernel/trace/trace_events_filter.c | 8 ++--- 5 files changed, 39 insertions(+), 38 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 496b76d9f9d..17810853b4f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -83,6 +83,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); struct ftrace_event_call { + struct list_head list; char *name; char *system; struct dentry *dir; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bcdf4af9b2..8817c18ef97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -739,11 +739,6 @@ struct event_subsystem { struct filter_pred **preds; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 199de9c7422..7bf2ad65eee 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call 
*event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_enable(event); } @@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_disable(event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ead68ac9919..5c66aaff07c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,8 @@ static DEFINE_MUTEX(event_mutex); +LIST_HEAD(ftrace_events); + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size) { @@ -54,16 +56,14 @@ err: static void ftrace_clear_events(void) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; + struct ftrace_event_call *call; - - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { call->enabled = 0; call->unregfunc(); } - call++; } } @@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; char *event = NULL, *sub = NULL, *match; int ret = -EINVAL; @@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - for_each_event(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; @@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= 
(unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } @@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } @@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - m->private = __start_ftrace_events; + m->private = ftrace_events.next; } return ret; } @@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; } +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -830,6 +840,7 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if 
(!call->name) continue; + list_add(&call->list, &ftrace_events); event_create_dir(call, d_events); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index de42dad42a8..d30b06b02b4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -223,7 +223,7 @@ oom: void filter_free_subsystem_preds(struct event_subsystem *system) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; int i; if (system->n_preds) { @@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) system->n_preds = 0; } - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); @@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[system->n_preds] = pred; - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { int err; if (!call->define_fields) From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 18:12:50 -0400 Subject: [PATCH 183/900] tracing/events: add export symbols for trace events in modules Impact: let modules add trace events The trace event code requires some functions to be exported to allow modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the necessary functions. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 2 ++ kernel/trace/trace_output.c | 3 +++ 4 files changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0047fcf707..2d69b26b3cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec, { return filter_check_discard(call, rec, global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(filter_current_check_discard); cycle_t ftrace_now(int cpu) { @@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } +EXPORT_SYMBOL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) @@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_function(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5c66aaff07c..8b9e621b80b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -53,6 +53,7 @@ err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(trace_define_field); static void ftrace_clear_events(void) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d30b06b02b4..f8e5eab0424 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } +EXPORT_SYMBOL_GPL(filter_match_preds); void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s) @@ -220,6 +221,7 @@ oom: return -ENOMEM; } +EXPORT_SYMBOL_GPL(init_preds); void 
filter_free_subsystem_preds(struct event_subsystem *system) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0e70fb07ca7..83a8abb9640 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +EXPORT_SYMBOL_GPL(trace_seq_printf); int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { @@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event) return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_event); /** * unregister_ftrace_event - remove a no longer used event @@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event) return 0; } +EXPORT_SYMBOL_GPL(unregister_ftrace_event); /* * Standard events From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 14:53:50 -0400 Subject: [PATCH 184/900] tracing/events: add support for modules to TRACE_EVENT Impact: allow modules to add TRACE_EVENTS on load This patch adds the final hooks to allow modules to use the TRACE_EVENT macro. A notifier and a data structure are used to link the TRACE_EVENTs defined in the module to connect them with the ftrace event tracing system. It also adds the necessary automated clean ups to the trace events when a module is removed. Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 + include/linux/module.h | 4 ++ include/linux/trace_seq.h | 2 + include/trace/ftrace.h | 1 + kernel/module.c | 7 ++ kernel/trace/trace_events.c | 128 ++++++++++++++++++++++++++--------- 6 files changed, 113 insertions(+), 32 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 17810853b4f..75f3ac01a87 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -7,6 +7,7 @@ struct trace_array; struct tracer; +struct dentry; /* * The trace entry - the most basic unit of tracing. 
This is what @@ -87,6 +88,7 @@ struct ftrace_event_call { char *name; char *system; struct dentry *dir; + struct trace_event *event; int enabled; int (*regfunc)(void); void (*unregfunc)(void); @@ -97,6 +99,7 @@ struct ftrace_event_call { struct list_head fields; int n_preds; struct filter_pred **preds; + void *mod; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; diff --git a/include/linux/module.h b/include/linux/module.h index 627ac082e2a..6155fa44168 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -337,6 +337,10 @@ struct module const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif +#ifdef CONFIG_EVENT_TRACING + struct ftrace_event_call *trace_events; + unsigned int num_trace_events; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 28051da876d..15ca2c71af1 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -1,6 +1,8 @@ #ifndef _LINUX_TRACE_SEQ_H #define _LINUX_TRACE_SEQ_H +#include + /* * Trace sequences are used to allow a function to call several other functions * to create a string of data to use (up to a max of PAGE_SIZE. 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 955b967acd7..60c5323bee6 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -477,6 +477,7 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .event = &ftrace_event_type_##call, \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ diff --git a/kernel/module.c b/kernel/module.c index e797812a4d9..a0394706f10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8b9e621b80b..a4b177720a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); system->preds = NULL; @@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". 
*/ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); if (call->define_fields) { ret = call->define_fields(); @@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); return 0; } +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_event_call *call, *start, *end; + struct dentry 
*d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_event_call *call, *p; + + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->event) + unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + } + } +} + +int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); + + return 0; +} + +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static __init int event_trace_init(void) { struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -837,7 +897,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; @@ -845,6 +905,10 @@ 
static __init int event_trace_init(void) event_create_dir(call, d_events); } + ret = register_module_notifier(&trace_module_nb); + if (!ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:22:32 -0400 Subject: [PATCH 185/900] tracing/events: fix compile for modules disabled Impact: compile fix The addition of TRACE_EVENT for modules breaks the build for when modules are disabled. This code fixes that. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a4b177720a6..6591d83e1e7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) (unsigned long)event < (unsigned long)end; \ event++) +#ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct ftrace_event_call *call, *start, *end; @@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod) } } -int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; @@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self, return 0; } +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, From ecda8ae02a08ef065ff387f5cb2a2d4999da2408 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:49:38 -0400 Subject: [PATCH 186/900] tracing/events: fix lockdep system name 
Impact: fix compile error of lockdep event tracer Ingo Molnar pointed out that the system name for the lockdep tracer was "lock" which is used to include the event trace file name. It should be "lockdep" Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/trace/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 4d301e758de..45e326b5c7f 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -5,7 +5,7 @@ #include #undef TRACE_SYSTEM -#define TRACE_SYSTEM lock +#define TRACE_SYSTEM lockdep #ifdef CONFIG_LOCKDEP From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: [PATCH 187/900] tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. 
Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/linux/kmemtrace.h | 2 +- include/trace/define_trace.h | 2 +- include/trace/{ => events}/irq.h | 0 include/trace/{ => events}/kmem.h | 0 include/trace/{ => events}/lockdep.h | 0 include/trace/{ => events}/sched.h | 0 include/trace/{ => events}/skb.h | 0 kernel/exit.c | 2 +- kernel/fork.c | 3 ++- kernel/irq/handle.c | 2 +- kernel/kthread.c | 2 +- kernel/lockdep.c | 2 +- kernel/sched.c | 2 +- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- mm/util.c | 2 +- net/core/drop_monitor.c | 2 +- net/core/net-traces.c | 2 +- net/core/skbuff.c | 2 +- 22 files changed, 18 insertions(+), 17 deletions(-) rename include/trace/{ => events}/irq.h (100%) rename include/trace/{ => events}/kmem.h (100%) rename include/trace/{ => events}/lockdep.h (100%) rename include/trace/{ => events}/sched.h (100%) rename include/trace/{ => events}/skb.h (100%) diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 15c45a27a92..b616d3930c3 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -9,7 +9,7 @@ #ifdef __KERNEL__ -#include +#include #ifdef CONFIG_KMEMTRACE extern void kmemtrace_init(void); diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 980eb66a6e3..18869417109 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -43,7 +43,7 @@ #endif #ifndef TRACE_INCLUDE_PATH -# define __TRACE_INCLUDE(system) +# define __TRACE_INCLUDE(system) # define UNDEF_TRACE_INCLUDE_FILE #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) diff --git a/include/trace/irq.h b/include/trace/events/irq.h similarity index 100% rename from include/trace/irq.h rename to include/trace/events/irq.h diff --git a/include/trace/kmem.h 
b/include/trace/events/kmem.h similarity index 100% rename from include/trace/kmem.h rename to include/trace/events/kmem.h diff --git a/include/trace/lockdep.h b/include/trace/events/lockdep.h similarity index 100% rename from include/trace/lockdep.h rename to include/trace/events/lockdep.h diff --git a/include/trace/sched.h b/include/trace/events/sched.h similarity index 100% rename from include/trace/sched.h rename to include/trace/events/sched.h diff --git a/include/trace/skb.h b/include/trace/events/skb.h similarity index 100% rename from include/trace/skb.h rename to include/trace/events/skb.h diff --git a/kernel/exit.c b/kernel/exit.c index 2fe9d2c7eee..cab535c427b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 4bebf263923..085f73ebcea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include @@ -71,6 +70,8 @@ #include #include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 983d8be8dff..37c63633e78 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,7 +20,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index e1c76924545..41c88fe4050 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #define KTHREAD_NICE_LEVEL (-5) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 257f21a76c5..47b201ecc6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -48,7 +48,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include +#include #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; diff --git a/kernel/sched.c b/kernel/sched.c index e6d4518d47e..9f7ffd00b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ 
-79,7 +79,7 @@ #include "sched_cpupri.h" #define CREATE_TRACE_POINTS -#include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] diff --git a/kernel/signal.c b/kernel/signal.c index 1d5703ff003..94ec0a4dde0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/softirq.c b/kernel/softirq.c index a2d9b458ac2..7ab9dfd8d08 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e6a0b5c994..a2348898858 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9d8cccdfaa0..a98106dd979 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153..b8b13c5540f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/mm/util.c b/mm/util.c index 0e74a22791c..6794a336e9a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,7 +7,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include /** * kstrdup - allocate space for and copy an existing string diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9fd0dc3cca9..b75b6cea49d 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 80177205947..499a67eaf3a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -24,6 +24,6 @@ #include 
#define CREATE_TRACE_POINTS -#include +#include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce6356cd9f7..12806b84445 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,7 +65,7 @@ #include #include -#include +#include #include "kmap_skb.h" From 9cfe06f8cd5c8c3ad6ab323973e87dde670642b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 21:37:03 -0400 Subject: [PATCH 188/900] tracing/events: add trace-events-sample This patch adds a sample to the samples directory on how to create and use TRACE_EVENT trace points. Signed-off-by: Steven Rostedt --- samples/Kconfig | 7 ++ samples/Makefile | 2 +- samples/trace_events/Makefile | 8 ++ samples/trace_events/trace-events-sample.c | 56 ++++++++++ samples/trace_events/trace-events-sample.h | 124 +++++++++++++++++++++ 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 samples/trace_events/Makefile create mode 100644 samples/trace_events/trace-events-sample.c create mode 100644 samples/trace_events/trace-events-sample.h diff --git a/samples/Kconfig b/samples/Kconfig index 4b02f5a0e65..93f41c05109 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -19,6 +19,13 @@ config SAMPLE_TRACEPOINTS help This build tracepoints example modules. +config SAMPLE_TRACE_EVENTS + tristate "Build trace_events examples" + depends on EVENT_TRACING + default m + help + This build trace event example modules. 
+ config SAMPLE_KOBJECT tristate "Build kobject examples" help diff --git a/samples/Makefile b/samples/Makefile index 10eaca89fe1..13e4b470b53 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,3 +1,3 @@ # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ +obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ trace_events/ diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile new file mode 100644 index 00000000000..06c6dea1eb8 --- /dev/null +++ b/samples/trace_events/Makefile @@ -0,0 +1,8 @@ +# builds the trace events example kernel modules; +# then to use one (as root): insmod + +PWD := $(shell pwd) + +CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ + +obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c new file mode 100644 index 00000000000..f33b3ba744a --- /dev/null +++ b/samples/trace_events/trace-events-sample.c @@ -0,0 +1,56 @@ +#include +#include + +/* + * Any file that uses trace points, must include the header. + * But only one file, must include the header by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points. + */ +#define CREATE_TRACE_POINTS +#include "trace-events-sample.h" + + +static void simple_thread_func(int cnt) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + trace_foo_bar("hello", cnt); + + if (!(cnt % 10)) + /* It is really important that I say "hi!" 
*/ + printk(KERN_EMERG "hi!\n"); +} + +static int simple_thread(void *arg) +{ + int cnt = 0; + + while (!kthread_should_stop()) + simple_thread_func(cnt++); + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init trace_event_init(void) +{ + simple_tsk = kthread_run(simple_thread, NULL, "event-sample"); + if (IS_ERR(simple_tsk)) + return -1; + + return 0; +} + +static void __exit trace_event_exit(void) +{ + kthread_stop(simple_tsk); +} + +module_init(trace_event_init); +module_exit(trace_event_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("trace-events-sample"); +MODULE_LICENSE("GPL"); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h new file mode 100644 index 00000000000..eab46443e61 --- /dev/null +++ b/samples/trace_events/trace-events-sample.h @@ -0,0 +1,124 @@ +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENT_SAMPLE_H + +/* + * All trace headers should include tracepoint.h, until we finally + * make it into a standard header. + */ +#include + +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /debugfs/tracing/events/ + * + * The define_trace.h below will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. + * + * If you want a different system than file name, you can override + * the header name by defining TRACE_INCLUDE_FILE + * + * If this file was called, goofy.h, then we would define: + * + * #define TRACE_INCLUDE_FILE goofy + * + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM trace-events-sample + +/* + * The TRACE_EVENT macro is broken up into 5 parts. + * + * name: name of the trace point.
This is also how to enable the tracepoint. + * A function called trace_foo_bar() will be created. + * + * proto: the prototype of the function trace_foo_bar() + * Here it is trace_foo_bar(char *foo, int bar). + * + * args: must match the arguments in the prototype. + * Here it is simply "foo, bar". + * + * struct: This defines the way the data will be stored in the ring buffer. + * There are currently two types of elements. __field and __array. + * a __field is broken up into (type, name). Where type can be any + * type but an array. + * For an array. there are three fields. (type, name, size). The + * type of elements in the array, the name of the field and the size + * of the array. + * + * __array( char, foo, 10) is the same as saying char foo[10]. + * + * fast_assign: This is a C like function that is used to store the items + * into the ring buffer. + * + * printk: This is a way to print out the data in pretty print. This is + * useful if the system crashes and you are logging via a serial line, + * the data can be printed to the console using this "printk" method. + * + * Note, that for both the assign and the printk, __entry is the handler + * to the data structure in the ring buffer, and is defined by the + * TP_STRUCT__entry. + */ +TRACE_EVENT(foo_bar, + + TP_PROTO(char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __array( char, foo, 10 ) + __field( int, bar ) + ), + + TP_fast_assign( + strncpy(__entry->foo, foo, 10); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __entry->foo, __entry->bar) +); +#endif + +/***** NOTICE! The #if protection ends here. *****/ + + +/* + * There are several ways I could have done this. If I left out the + * TRACE_INCLUDE_PATH, then it would default to the kernel source + * include/trace/events directory. + * + * I could specify a path from the define_trace.h file back to this + * file. 
+ * + * #define TRACE_INCLUDE_PATH ../../samples/trace_events + * + * But I chose to simply make it use the current directory and then in + * the Makefile I added: + * + * CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ + * + * This will make sure the current path is part of the include + * structure for our file so that we can find it. + * + * I could have made only the top level directory the include: + * + * CFLAGS_trace-events-sample.o := -I$(PWD) + * + * And then let the path to this directory be the TRACE_INCLUDE_PATH: + * + * #define TRACE_INCLUDE_PATH samples/trace_events + * + * But then if something defines "samples" or "trace_events" then we + * could risk that being converted too, and give us an unexpected + * result. + */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include From 05725f7eb4b8acb147c5fc7b91397b1f6bcab00d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 20:17:16 +0200 Subject: [PATCH 189/900] rculist: use list_entry_rcu in places where it's appropriate Use previously introduced list_entry_rcu instead of an open-coded list_entry + rcu_dereference combination. Signed-off-by: Jiri Pirko Reviewed-by: Paul E. 
McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414181715.GA3634@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++--- ipc/sem.c | 4 ++-- security/integrity/ima/ima_fs.c | 4 ++-- security/smack/smackfs.c | 8 ++++---- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..886df41e745 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -77,6 +77,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -2010,7 +2011,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) @@ -2049,8 +2051,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), - struct task_struct, thread_group); + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) diff --git a/ipc/sem.c b/ipc/sem.c index 16a2189e96f..87c2b641fd7 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk) int i; rcu_read_lock(); - un = list_entry(rcu_dereference(ulp->list_proc.next), - struct sem_undo, list_proc); + un = list_entry_rcu(ulp->list_proc.next, + struct sem_undo, list_proc); if (&un->list_proc == &ulp->list_proc) semid = -1; else diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index ffbe259700b..510186f0b72 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -84,8 +84,8 @@ static void 
*ima_measurements_next(struct seq_file *m, void *v, loff_t *pos) * against concurrent list-extension */ rcu_read_lock(); - qe = list_entry(rcu_dereference(qe->later.next), - struct ima_queue_entry, later); + qe = list_entry_rcu(qe->later.next, + struct ima_queue_entry, later); rcu_read_unlock(); (*pos)++; diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index e03a7e19c73..11d2cb19d7a 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -734,8 +734,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) return; } - m = list_entry(rcu_dereference(smk_netlbladdr_list.next), - struct smk_netlbladdr, list); + m = list_entry_rcu(smk_netlbladdr_list.next, + struct smk_netlbladdr, list); /* the comparison '>' is a bit hacky, but works */ if (new->smk_mask.s_addr > m->smk_mask.s_addr) { @@ -748,8 +748,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) list_add_rcu(&new->list, &m->list); return; } - m_next = list_entry(rcu_dereference(m->list.next), - struct smk_netlbladdr, list); + m_next = list_entry_rcu(m->list.next, + struct smk_netlbladdr, list); if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) { list_add_rcu(&new->list, &m->list); return; From b206525ad1f653b7da35f5827be93770d28eae11 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 23:04:37 +0530 Subject: [PATCH 190/900] x86: k8 convert node_to_k8_nb_misc() from a macro to an inline function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Converting node_to_k8_nb_misc() from a macro to an inline function makes compiler see the 'node' parameter in the !CONFIG_K8_NB too, which eliminates these compiler warnings: arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘show_cache_disable’: arch/x86/kernel/cpu/intel_cacheinfo.c:712: warning: unused variable ‘node’ arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘store_cache_disable’: arch/x86/kernel/cpu/intel_cacheinfo.c:739: warning: unused 
variable ‘node’ Signed-off-by: Jaswinder Singh Rajput Cc: Andreas Herrmann Cc: Mark Langsdorf LKML-Reference: <1239730477.2966.26.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/k8.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index c23b3d171be..c2d1f3b58e5 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -13,10 +13,15 @@ extern void k8_flush_garts(void); extern int k8_scan_nodes(unsigned long start, unsigned long end); #ifdef CONFIG_K8_NB -#define node_to_k8_nb_misc(node) \ - (node < num_k8_northbridges) ? k8_northbridges[node] : NULL +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; +} #else -#define node_to_k8_nb_misc(node) NULL +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return NULL; +} #endif From e6a1a89d572c31b62d6dcf11a371c7323852d9b2 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 15 Apr 2009 18:22:41 +0900 Subject: [PATCH 191/900] dma-debug: add dma_debug_resize_entries() to adjust the number of dma_debug_entries We use a static value for the number of dma_debug_entries. It can be overwritten by a kernel command line option. Some IOMMUs (e.g. GART) can't set an appropriate value by a kernel command line option because they can't know that value until they finish initializing their hardware. This patch adds dma_debug_resize_entries(), which enables IOMMUs to adjust the number of dma_debug_entries at any time.
Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Cc: fujita.tomonori@lab.ntt.co.jp Cc: akpm@linux-foundation.org LKML-Reference: <20090415182234R.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- include/linux/dma-debug.h | 7 ++++ lib/dma-debug.c | 72 +++++++++++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 28d53cb7b5a..171ad8aedc8 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus); extern void dma_debug_init(u32 num_entries); +extern int dma_debug_resize_entries(u32 num_entries); + extern void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, @@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries) { } +static inline int dma_debug_resize_entries(u32 num_entries) +{ + return 0; +} + static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d3da7edc034..5d61019330c 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -85,6 +85,7 @@ static u32 show_num_errors = 1; static u32 num_free_entries; static u32 min_free_entries; +static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 req_entries; @@ -257,6 +258,21 @@ static void add_dma_entry(struct dma_debug_entry *entry) put_hash_bucket(bucket, &flags); } +static struct dma_debug_entry *__dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + + entry = list_entry(free_entries.next, struct dma_debug_entry, list); + list_del(&entry->list); + memset(entry, 0, sizeof(*entry)); + + num_free_entries -= 1; + if (num_free_entries < min_free_entries) + min_free_entries = num_free_entries; + + return entry; +} + /* struct dma_entry allocator * * The 
next two functions implement the allocator for @@ -276,9 +292,7 @@ static struct dma_debug_entry *dma_entry_alloc(void) goto out; } - entry = list_entry(free_entries.next, struct dma_debug_entry, list); - list_del(&entry->list); - memset(entry, 0, sizeof(*entry)); + entry = __dma_entry_alloc(); #ifdef CONFIG_STACKTRACE entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; @@ -286,9 +300,6 @@ static struct dma_debug_entry *dma_entry_alloc(void) entry->stacktrace.skip = 2; save_stack_trace(&entry->stacktrace); #endif - num_free_entries -= 1; - if (num_free_entries < min_free_entries) - min_free_entries = num_free_entries; out: spin_unlock_irqrestore(&free_entries_lock, flags); @@ -310,6 +321,53 @@ static void dma_entry_free(struct dma_debug_entry *entry) spin_unlock_irqrestore(&free_entries_lock, flags); } +int dma_debug_resize_entries(u32 num_entries) +{ + int i, delta, ret = 0; + unsigned long flags; + struct dma_debug_entry *entry; + LIST_HEAD(tmp); + + spin_lock_irqsave(&free_entries_lock, flags); + + if (nr_total_entries < num_entries) { + delta = num_entries - nr_total_entries; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + for (i = 0; i < delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} +EXPORT_SYMBOL(dma_debug_resize_entries); + /* * DMA-API debugging init code * @@ -490,6 +548,8 @@ void dma_debug_init(u32 num_entries) return; } + nr_total_entries = num_free_entries; + printk(KERN_INFO "DMA-API: debugging 
enabled by kernel config\n"); } From 19c1a6f5764d787113fa323ffb18be7991208f82 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 09:43:19 +0900 Subject: [PATCH 192/900] x86 gart: reimplement IOMMU_LEAK feature by using DMA_API_DEBUG IOMMU_LEAK, GART's own feature, dumps the used IOMMU entries when the IOMMU entries are full, which might be useful to find a bad driver that eats IOMMU entries. DMA_API_DEBUG provides a similar feature, debug_dma_dump_mappings, and it's better than GART's IOMMU_LEAK feature. GART's IOMMU_LEAK feature doesn't say who uses IOMMU entries so it's hard to find a bad driver. This patch reimplements GART's IOMMU_LEAK feature by using DMA_API_DEBUG. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Cc: Andrew Morton LKML-Reference: <1239669799-23579-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 3 +-- arch/x86/kernel/pci-gart_64.c | 45 +++++++---------------------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8359e73317..5865712d105 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -161,8 +161,7 @@ config IOMMU_DEBUG config IOMMU_LEAK bool "IOMMU leak tracing" - depends on DEBUG_KERNEL - depends on IOMMU_DEBUG + depends on IOMMU_DEBUG && DMA_API_DEBUG ---help--- Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings.
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035..1e8920d98f7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? 
'\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -801,11 +771,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif From 13318a7186d8e0ae08c996ea4111a945e7789772 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Apr 2009 09:59:10 +0800 Subject: [PATCH 193/900] sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus()) Impact: cleanup This patch changes cpumask_first(sched_group_cpus()) to group_first_cpu() for maintainability. 
Signed-off-by: Miao Xie Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 92b4b56ad09..7601ceebf7c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7995,7 +7995,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -8073,7 +8073,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; From 77857dc07247ed5fa700a197c96ef842d8dbebdf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 15 Apr 2009 11:57:01 -0700 Subject: [PATCH 194/900] x86: use used_vectors in init_IRQ() Impact: fix crash with many devices I found this crash: [ 552.616646] general protection fault: 0403 [#1] SMP [ 552.620013] last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/usb1/1-1/1-1:1.0/host13/target13:0:0/13:0:0:0/block/sr0/size [ 552.620013] CPU 0 [ 552.620013] Modules linked in: [ 552.620013] Pid: 0, comm: swapper Not tainted 2.6.30-rc1-tip-01931-g8fcafd8-dirty #28 Sun Fire X4440 [ 552.620013] RIP: 0010:[] [] default_idle+0x7d/0xda [ 552.620013] RSP: 0018:ffffffff81345e68 EFLAGS: 00010246 [ 552.620013] RAX: 0000000000000000 RBX: ffffffff8133d870 RCX: ffffc20000000000 [ 552.620013] RDX: 00000000001d0620 RSI: ffffffff8023bad8 RDI: ffffffff802a3169 [ 552.620013] RBP: ffffffff81345e98 R08: 0000000000000000 R09: ffffffff812244a0 [ 552.620013] R10: ffffffff81345dc8 R11: 7ebe1b6fa0bcac50 R12: 4ec4ec4ec4ec4ec5 [ 552.620013] R13: ffffffff813a54d0 R14: ffffffff813a7a40 R15: 0000000000000000 [ 552.620013] FS: 00000000006d1880(0000) GS:ffffc20000000000(0000) 
knlGS:0000000000000000 [ 552.620013] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b [ 552.620013] CR2: 00007fec9d936a50 CR3: 000000007d1a9000 CR4: 00000000000006e0 [ 552.620013] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 552.620013] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 552.620013] Process swapper (pid: 0, threadinfo ffffffff81344000,task ffffffff812244a0) [ 552.620013] Stack: [ 552.620013] 0000000000000000 ffffc20000000000 00000000001d0620 7ebe1b6fa0bcac50 [ 552.620013] ffffffff8133d870 4ec4ec4ec4ec4ec5 ffffffff81345ec8 ffffffff8023bd84 [ 552.620013] 4ec4ec4ec4ec4ec5 ffffffff813a54d0 7ebe1b6fa0bcac50 ffffffff8133d870 [ 552.620013] Call Trace: [ 552.620013] [] c1e_idle+0x109/0x124 [ 552.620013] [] cpu_idle+0xb8/0x101 [ 552.620013] [] rest_init+0x7e/0x94 [ 552.620013] [] start_kernel+0x3dc/0x3fd [ 552.620013] [] x86_64_start_reservations+0xb9/0xd4 [ 552.620013] [] x86_64_start_kernel+0xee/0x109 [ 552.620013] Code: 48 8b 04 25 f8 b4 00 00 83 a0 3c e0 ff ff fb 0f ae f0 65 48 8b 04 25 f8 b4 00 00 f6 80 38 e0 ff ff 08 75 09 e8 71 76 06 00 fb f4 06 e8 68 76 06 00 fb 65 48 8b 04 25 f8 b4 00 00 83 88 3c e0 [ 552.620013] RIP [] default_idle+0x7d/0xda [ 552.620013] RSP [ 552.828646] ---[ end trace 4cbfc5c01382af7f ]--- Joerg Roedel said "The 0403 error code means that there was an external interrupt with vector 0x80. Yinghai, my theory is that the kernel on this machine has no 32bit emulation compiled in, right? In this case the selector points to a zero entry which may cause the #gpf right after the hlt. But I have no idea where the external int 0x80 comes from" it turns out that we could use 0x80 for external device on 64-bit when 32-bit emulation is disabled. But we forgot to set the gate for it. try to set gate for it by checking used_vectors. Also move apic_intr_init() early to avoid setting that gate two times. 
Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Joerg Roedel LKML-Reference: <49E62DFD.6010904@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b424c32c4a0..2e08b10ad51 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -240,19 +240,19 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) + /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ + if (!test_bit(i, used_vectors)) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Tue, 14 Apr 2009 13:58:56 +0800 Subject: [PATCH 195/900] blktrace: support per-partition tracing Though one can specify '-d /dev/sda1' when using blktrace, it still traces the whole sda. To support per-partition tracing, when we start tracing, we initialize bt->start_lba and bt->end_lba to the start and end sector of that partition. Note some actions are per device, thus we don't filter 0-sector events. 
The original patch and discussion can be found here: http://marc.info/?l=linux-btrace&m=122949374214540&w=2 Signed-off-by: Shawn Du Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe LKML-Reference: <49E42620.4050701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/compat_ioctl.c | 2 +- drivers/scsi/sg.c | 1 + include/linux/blktrace_api.h | 24 +++++++++++++----------- kernel/trace/blktrace.c | 29 +++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f87615dea46..f8c218cd08e 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) memcpy(&buts.name, &cbuts.name, 32); mutex_lock(&bdev->bd_mutex); - ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts); + ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts); mutex_unlock(&bdev->bd_mutex); if (ret) return ret; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82312df9b0b..49c98730bb8 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return blk_trace_setup(sdp->device->request_queue, sdp->disk->disk_name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), + NULL, (char *)arg); case BLKTRACESTART: return blk_trace_startstop(sdp->device->request_queue, 1); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d960889e92e..267edc4017e 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -165,8 +165,9 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern int do_blk_trace_setup(struct request_queue *q, - char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern int do_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct 
block_device *bdev, + struct blk_user_trace_setup *buts); extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** @@ -193,6 +194,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); extern void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); @@ -200,15 +202,15 @@ extern int blk_trace_remove(struct request_queue *q); extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ -#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) -#define blk_trace_shutdown(q) do { } while (0) -#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) -#define blk_add_driver_data(q, rq, data, len) do {} while (0) -#define blk_trace_setup(q, name, dev, arg) (-ENOTTY) -#define blk_trace_startstop(q, start) (-ENOTTY) -#define blk_trace_remove(q) (-ENOTTY) -#define blk_add_trace_msg(q, fmt, ...) do { } while (0) - +# define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) +# define blk_trace_shutdown(q) do { } while (0) +# define do_blk_trace_setup(q, name, dev, bdev, buts) (-ENOTTY) +# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) +# define blk_trace_startstop(q, start) (-ENOTTY) +# define blk_trace_remove(q) (-ENOTTY) +# define blk_add_trace_msg(q, fmt, ...) 
do { } while (0) #endif /* CONFIG_BLK_DEV_IO_TRACE */ + #endif /* __KERNEL__ */ #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2b98195b338..e932654cf59 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = { * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; + struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else bt->end_lba = -1ULL; + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +517,7 @@ err: } int blk_trace_setup(struct request_queue *q, char 
*name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 13:59:34 +0800 Subject: [PATCH 196/900] blktrace: support per-partition tracing for ftrace plugin The previous patch adds support to trace a single partition for relay+ioctl blktrace, and this patch is for ftrace plugin blktrace: # echo 1 > /sys/block/sda/sda7/enable # cat start_lba 102398373 # cat end_lba 102703545 Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42646.4060608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e932654cf59..d1098988052 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = { .remove_buf_file = blk_remove_buf_file_callback, }; +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + /* * Setup 
everything required to start tracing */ @@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; - struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; - } else - bt->end_lba = -1ULL; + blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) @@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q) /* * Setup everything required to start tracing */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) { struct blk_trace *old_bt, *bt = NULL; int ret = -ENOMEM; @@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (!bt->msg_data) goto free_bt; - bt->dev = dev; + bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; + + blk_trace_setup_lba(bt, bdev); old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { @@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_enable) { if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; @@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = 0; if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 14:00:05 +0800 
Subject: [PATCH 197/900] blktrace: add trace/ to /sys/block/sda Impact: allow ftrace-plugin blktrace to trace device-mapper devices To trace a single partition: # echo 1 > /sys/block/sda/sda1/enable To trace the whole sda instead: # echo 1 > /sys/block/sda/enable Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace can't be used to trace device-mapper devices. Now: # echo 1 > /sys/block/dm-0/trace/enable echo: write error: No such device or address # mount -t ext4 /dev/dm-0 /mnt # echo 1 > /sys/block/dm-0/trace/enable # echo blk > /debug/tracing/current_tracer Reported-by: Theodore Tso Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42665.6020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/blk-sysfs.c | 7 ++++++- include/linux/blktrace_api.h | 6 ++++++ kernel/trace/blktrace.c | 5 +++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 73f36beff5c..8653d710b39 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -387,16 +387,21 @@ struct kobj_type blk_queue_ktype = { int blk_register_queue(struct gendisk *disk) { int ret; + struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; if (WARN_ON(!q)) return -ENXIO; + ret = blk_trace_init_sysfs(dev); + if (ret) + return ret; + if (!q->request_fn) return 0; - ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj), + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) return ret; diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 267edc4017e..62763c95285 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern 
int blk_trace_init_sysfs(struct device *dev); extern struct attribute_group blk_trace_attr_group; @@ -210,6 +211,11 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +static inline int blk_trace_init_sysfs(struct device *dev) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d1098988052..8e7c5da3a3e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1644,3 +1644,8 @@ out: return ret ? ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 15 Apr 2009 11:02:56 +0800 Subject: [PATCH 198/900] blktrace: fix context-info when mixed-using blk tracer and trace events When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is unset, but actually context-info is printed: pdflush-431 [000] 821.181576: 8,0 P N [pdflush] And then if we enable TRACE_ITER_CONTEXT_INFO: # echo context-info > trace_options We'll see context-info printed twice. What's worse, when we use blk tracer and trace events at the same time, we'll see no context-info for trace events at all: jbd2_commit_logging: dev dm-0:8 transaction 333227 jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814 rm-25433 [001] 9578.307485: 8,18 m N cfq25433 slice expired t=0 rm-25433 [001] 9578.307486: 8,18 m N cfq25433 put_queue This patch adds blk_tracer->set_flags(), and context-info flag is unset only when we set the output to classic mode. 
Note after this patch, one should unset context-info explicitly if he wants to get binary output that can be parsed by blkparse: # echo nocontext-info > trace_options # echo bin > trace_options # echo blk > current_tracer # cat trace_pipe | blkparse -i - Reported-by: Theodore Ts'o Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E54E60.50408@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8e7c5da3a3e..c32062bd10b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; - trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) @@ -1289,9 +1287,6 @@ out: static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags) { - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - return print_one_line(iter, false); } @@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, @@ -1335,6 +1342,7 @@ 
static struct tracer blk_tracer __read_mostly = { .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, }; static struct trace_event trace_blk_event = { From d1f0ae5e2e45e74cff4c3bdefb0fc77608cdfeec Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 15 Apr 2009 21:34:55 +0200 Subject: [PATCH 199/900] x86: standardize Kbuild rules Introducing this Kbuild file allows us to run: make arch/x86/ and thus build all the core parts of x86. Signed-off-by: Sam Ravnborg Cc: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/Kbuild | 16 ++++++++++++++++ arch/x86/Makefile | 19 ++----------------- 2 files changed, 18 insertions(+), 17 deletions(-) create mode 100644 arch/x86/Kbuild diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild new file mode 100644 index 00000000000..ad8ec356fb3 --- /dev/null +++ b/arch/x86/Kbuild @@ -0,0 +1,16 @@ + +obj-$(CONFIG_KVM) += kvm/ + +# Xen paravirtualization support +obj-$(CONFIG_XEN) += xen/ + +# lguest paravirtualization support +obj-$(CONFIG_LGUEST_GUEST) += lguest/ + +obj-y += kernel/ +obj-y += mm/ + +obj-y += crypto/ +obj-y += vdso/ +obj-$(CONFIG_IA32_EMULATION) += ia32/ + diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f05d8c91d9e..e81f0b27776 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,8 +7,6 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -core-$(CONFIG_KVM) += arch/x86/kvm/ - # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o @@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ -# Sub architecture files that needs linking first -core-y += $(fcore-y) - -# Xen paravirtualization support -core-$(CONFIG_XEN) += arch/x86/xen/ - -# lguest paravirtualization support -core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ - -core-y += arch/x86/kernel/ -core-y += arch/x86/mm/ - -core-y += arch/x86/crypto/ -core-y += arch/x86/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ +# See arch/x86/Kbuild for content of core part of the kernel +core-y += arch/x86/ # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:24:06 -0400 Subject: [PATCH 200/900] ftrace: use module notifier for function tracer The hooks in the module code for the function tracer must be called before any of that module code runs. The function tracer hooks modify the module (replacing calls to mcount with nops). If the code is executed while the change occurs, then the CPU can take a GPF. To handle the above with a bit of paranoia, I originally implemented the hooks as calls directly from the module code. After examining the notifier calls, it looks as though the start up notify is called before any of the module's code is executed. This makes the use of the notify safe with ftrace. Only the startup notify is required to be "safe". The shutdown simply removes the entries from the ftrace function list, and does not modify any code. This change has another benefit. It removes an issue with a reverse dependency in the mutexes of ftrace_lock and module_mutex.
[ Impact: fix lock dependency bug, cleanup ] Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ---- include/linux/module.h | 4 ++ kernel/module.c | 19 ++++----- kernel/trace/ftrace.c | 90 ++++++++++++++++++++++++++++++------------ 4 files changed, 75 insertions(+), 45 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 53869bef610..97c83e1bc58 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); -extern void ftrace_release(void *start, unsigned long size); - extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); #else @@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } -static inline void -ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) { } #endif /* diff --git a/include/linux/module.h b/include/linux/module.h index 6155fa44168..a8f2c0aa4c3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -341,6 +341,10 @@ struct module struct ftrace_event_call *trace_events; unsigned int num_trace_events; #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + unsigned long *ftrace_callsites; + unsigned int num_ftrace_callsites; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/kernel/module.c b/kernel/module.c index a0394706f10..2383e60fcf3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1490,9 +1490,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters. 
*/ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->trace_events), &mod->num_trace_events); #endif - +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a2348898858..5b606f45b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -916,30 
+916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. 
+ */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2791,6 +2825,10 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (!ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + return; failed: ftrace_disabled = 1; From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:36:40 -0400 Subject: [PATCH 201/900] tracing/events: add startup tests for events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As events start to become popular, and the new way to add tracing infrastructure into ftrace, it is important to catch any problems that might happen with a mistake in the TRACE_EVENT macro. This patch introduces a startup self test on the registered trace events. 
Note, it can only do a generic test; any type of testing that needs more involvement must be implemented by the tracepoint creators. The test goes down one by one enabling a trace point and running some random tasks (random in the sense that I just made them up). Those tasks are creating threads, grabbing mutexes and spinlocks and using workqueues. After testing each event individually, it does the same test after enabling each system of trace points. Like sched, irq, lockdep. Then finally it enables all tracepoints and performs the tasks again. The output to the console on bootup will look like this when everything works: Running tests on trace events: Testing event kfree_skb: OK Testing event kmalloc: OK Testing event kmem_cache_alloc: OK Testing event kmalloc_node: OK Testing event kmem_cache_alloc_node: OK Testing event kfree: OK Testing event kmem_cache_free: OK Testing event irq_handler_exit: OK Testing event irq_handler_entry: OK Testing event softirq_entry: OK Testing event softirq_exit: OK Testing event lock_acquire: OK Testing event lock_release: OK Testing event sched_kthread_stop: OK Testing event sched_kthread_stop_ret: OK Testing event sched_wait_task: OK Testing event sched_wakeup: OK Testing event sched_wakeup_new: OK Testing event sched_switch: OK Testing event sched_migrate_task: OK Testing event sched_process_free: OK Testing event sched_process_exit: OK Testing event sched_process_wait: OK Testing event sched_process_fork: OK Testing event sched_signal_send: OK Running tests on trace event systems: Testing event system skb: OK Testing event system kmem: OK Testing event system irq: OK Testing event system lockdep: OK Testing event system sched: OK Running tests on all trace events: Testing all events: OK [ folded in: tracing: add #include <linux/delay.h> to fix build failure in test_work() This build failure occurred on a few rare configs: kernel/trace/trace_events.c: In function ‘test_work’: kernel/trace/trace_events.c:975: error: implicit declaration of
function ‘udelay’ kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’ delay.h is included in way too many other headers, hiding cases where new usage is added without header inclusion. [ Impact: build fix ] Signed-off-by: Ingo Molnar ] [ Impact: add event tracer self-tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 178 ++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6591d83e1e7..f81d6eec4e4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,10 +8,14 @@ * */ +#include +#include +#include #include #include #include #include +#include #include "trace_output.h" @@ -920,3 +924,177 @@ static __init int event_trace_init(void) return 0; } fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. 
+ */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init int event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + char *sysname; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a regfunc */ + if (!call->regfunc) + continue; + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. + */ + if (call->enabled) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + call->enabled = 1; + call->regfunc(); + + event_test_stuff(); + + call->unregfunc(); + call->enabled = 0; + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + /* ftrace_set_clr_event can modify the name passed in. 
*/ + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 1); + kfree(sysname); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + sysname = kmalloc(4, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 1); + if (WARN_ON_ONCE(ret)) { + kfree(sysname); + pr_warning("error enabling all events\n"); + return 0; + } + + event_test_stuff(); + + /* reset sysname */ + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return 0; + } + + pr_cont("OK\n"); + + return 0; +} + +late_initcall(event_trace_self_tests); + +#endif From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 16:53:47 -0400 Subject: [PATCH 202/900] tracing/events/ring-buffer: expose format of ring buffer headers to users Currently, everything needed to read the binary output from the ring buffers is available, with the exception of the way the ring buffer handles itself internally. 
This patch creates two special files in the debugfs/tracing/events directory: # cat /debug/tracing/events/header_page field: u64 timestamp; offset:0; size:8; field: local_t commit; offset:8; size:8; field: char data; offset:16; size:4080; # cat /debug/tracing/events/header_event type : 2 bits len : 3 bits time_delta : 27 bits array : 32 bits padding : type == 0 time_extend : type == 1 data : type == 3 This is to allow a userspace app to see if the ring buffer format changes or not. [ Impact: allow userspace apps to know of ringbuffer format changes ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 5 +++++ kernel/trace/ring_buffer.c | 44 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f0aa486d131..fac8f1ac6f4 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -166,6 +166,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); +struct trace_seq; + +int ring_buffer_print_entry_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_seq *s); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f935bd5ec3e..84a6055f37c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,28 @@ #include "trace.h" +/* + * The ring buffer header is special. We must manually up keep it. 
+ */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "\ttype : 2 bits\n"); + ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata : type == %d\n", + RINGBUF_TYPE_DATA); + + return ret; +} + /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is @@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. 
*/ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f81d6eec4e4..7163a2bb021 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -909,6 +938,15 @@ static __init int event_trace_init(void) if (!d_events) return 0; + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001 From: Avadh Patel Date: Fri, 10 Apr 2009 16:04:48 -0400 Subject: [PATCH 203/900] tracing: add saved_cmdlines file to show cached task comms Export the cached task comms to userspace. This allows user apps to translate the pids from a trace into their respective task command lines. 
[ Impact: let userspace apps reading binary buffer know comm's of pids ] Signed-off-by: Avadh Patel [ added error checking and use of buf pointer to index file_buf ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d69b26b3cc..031c46f11bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2421,6 +2421,56 @@ static const struct file_operations tracing_readme_fops = { .read = tracing_readme_read, }; +static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_read, +}; + static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, 
&tracing_dyn_info_fops); From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 12:15:44 -0400 Subject: [PATCH 204/900] tracing/events: perform function tracing in event selftests We can find some bugs in the trace events if we stress the writes as well. The function tracer is a good way to stress the events. [ Impact: extend scope of event tracer self-tests ] Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20090416161746.604786131@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 78 ++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7163a2bb021..1137f951be4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void) * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. 
*/ -static __init int event_trace_self_tests(void) +static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; @@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 1); kfree(sysname); @@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 0); kfree(sysname); @@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void) sysname = kmalloc(4, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } memcpy(sysname, "*:*", 4); ret = ftrace_set_clr_event(sysname, 1); if (WARN_ON_ONCE(ret)) { kfree(sysname); pr_warning("error enabling all events\n"); - return 0; + return; } event_test_stuff(); @@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void) if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); - return 0; + return; } pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int resched; + int cpu; + int pc; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip 
= ip; + entry->parent_ip = parent_ip; + + trace_current_buffer_unlock_commit(event, flags, pc); + + out: + atomic_dec(&per_cpu(test_event_disable, cpu)); + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + register_ftrace_function(&trace_ops); + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + + event_trace_self_tests(); + + event_trace_self_test_with_function(); return 0; } From 76aa81118ddfbb3dc31533030cf3ec329dd067a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Apr 2009 23:35:39 -0700 Subject: [PATCH 205/900] tracing: avoid warnings from zero-arg tracepoints Tracepoints with no arguments can issue two warnings: "field" defined but not used "ret" is uninitialized in this function Mark field as being OK to leave unused, and initialize ret. [ Impact: fix false positive compiler warnings. 
] Signed-off-by: Jeremy Fitzhardinge Acked-by: Steven Rostedt Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <1239950139-1119-5-git-send-email-jeremy@goop.org> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 60c5323bee6..39a3351f2e7 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -160,8 +160,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ - struct ftrace_raw_##call field; \ - int ret; \ + struct ftrace_raw_##call field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 17 Apr 2009 10:34:30 +0800 Subject: [PATCH 206/900] tracing: fix file mode of trace and README trace is read-write and README is read-only. [ Impact: fix /debug/tracing/ file permissions. 
] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 031c46f11bb..f681f646aa0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); - trace_create_file("current_tracer", 0444, d_tracer, + trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); trace_create_file("tracing_max_latency", 0644, d_tracer, @@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); - trace_create_file("README", 0644, d_tracer, + trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); trace_create_file("trace_pipe", 0444, d_tracer, From 9b94b3a19b13e094c10f65f24bc358f6ffe4eacd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Apr 2009 12:07:46 +0200 Subject: [PATCH 207/900] x86: fixup numa_node information for AMD CPU northbridge functions Currently the numa_node attribute for these PCI devices is 0 (it corresponds to the numa_node for PCI bus 0). This is not a big issue but incorrect. This inconsistency can be fixed by reading the node number from CPU NB function 0. 
[ Impact: fill in dev->numa_node information, to optimize DMA allocations ] Signed-off-by: Andreas Herrmann Cc: jbarnes@virtuousgeek.org LKML-Reference: <20090417100746.GG16198@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index e95022e4f5d..94ad0c029f0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -493,5 +493,42 @@ void force_hpet_resume(void) break; } } - +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif From 46de405f25f1d9fa73b657ffbb752aa0cc87a91d Mon Sep 17 00:00:00 2001 From: Zhaolei Date: 
Fri, 17 Apr 2009 10:53:43 +0800 Subject: [PATCH 208/900] tracing: Remove include/trace/kmem_event_types.h kmem_event_types.h is no longer necessary since tracepoint definitions are put into include/trace/events/kmem.h [ Impact: remove now-unused file. ] Signed-off-by: Zhao Lei Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49E7EF37.2080205@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/kmem_event_types.h | 193 ------------------------------- 1 file changed, 193 deletions(-) delete mode 100644 include/trace/kmem_event_types.h diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h deleted file mode 100644 index 4ff420fe467..00000000000 --- a/include/trace/kmem_event_types.h +++ /dev/null @@ -1,193 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - -TRACE_EVENT(kmalloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - 
__field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmalloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kmem_cache_alloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - 
__entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kfree, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -TRACE_EVENT(kmem_cache_free, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -#undef TRACE_SYSTEM From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 17 Apr 2009 00:27:08 -0500 Subject: [PATCH 209/900] tracing/filters: add filter_mutex to protect filter predicates This patch adds a filter_mutex to prevent the filter predicates from being accessed concurrently by various external functions. It's based on a previous patch by Li Zefan: "[PATCH 7/7] tracing/filters: make filter preds RCU safe" v2 changes: - fixed wrong value returned in a add_subsystem_pred() failure case noticed by Li Zefan. 
[ Impact: fix trace filter corruption/crashes on parallel access ] Signed-off-by: Tom Zanussi Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239946028.6639.13.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 4 +- kernel/trace/trace_events_filter.c | 90 +++++++++++++++++++++++------- 3 files changed, 75 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8817c18ef97..247948e81b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,13 +757,15 @@ struct filter_pred { }; extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, int n_preds, +extern void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1137f951be4..64f9d6d2735 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, call->n_preds, s); + filter_print_preds(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, system->n_preds, s); + 
filter_print_subsystem_preds(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f8e5eab0424..e0fcfd2a16d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,10 +22,13 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" +static DEFINE_MUTEX(filter_mutex); + static int filter_pred_64(struct filter_pred *pred, void *event) { u64 *addr = (u64 *)(event + pred->offset); @@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -void filter_print_preds(struct filter_pred **preds, int n_preds, - struct trace_seq *s) +static void __filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; @@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds, } } +void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(call->preds, call->n_preds, s); + mutex_unlock(&filter_mutex); +} + +void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(system->preds, system->n_preds, s); + mutex_unlock(&filter_mutex); +} + static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { @@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } -void filter_disable_preds(struct ftrace_event_call *call) +static void __filter_disable_preds(struct ftrace_event_call *call) { int i; @@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call) call->preds[i]->fn = filter_pred_none; } +void filter_disable_preds(struct ftrace_event_call *call) +{ + mutex_lock(&filter_mutex); + 
__filter_disable_preds(call); + mutex_unlock(&filter_mutex); +} + int init_preds(struct ftrace_event_call *call) { struct filter_pred *pred; @@ -223,7 +248,7 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -void filter_free_subsystem_preds(struct event_subsystem *system) +static void __filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; int i; @@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_disable_preds(call); + __filter_disable_preds(call); } } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred, - filter_pred_fn_t fn) +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + mutex_lock(&filter_mutex); + __filter_free_subsystem_preds(system); + mutex_unlock(&filter_mutex); +} + +static int filter_add_pred_fn(struct ftrace_event_call *call, + struct filter_pred *pred, + filter_pred_fn_t fn) { int idx, err; if (call->n_preds && !pred->compound) - filter_disable_preds(call); + __filter_disable_preds(call); if (call->n_preds == MAX_FILTER_PRED) return -ENOSPC; @@ -276,7 +308,8 @@ static int is_string_field(const char *type) return 0; } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); } else { if (pred->str_len) return -EINVAL; @@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; } - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); +} + +int filter_add_pred(struct 
ftrace_event_call *call, struct filter_pred *pred) +{ + int err; + + mutex_lock(&filter_mutex); + err = __filter_add_pred(call, pred); + mutex_unlock(&filter_mutex); + + return err; } int filter_add_subsystem_pred(struct event_subsystem *system, @@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system, { struct ftrace_event_call *call; + mutex_lock(&filter_mutex); + if (system->n_preds && !pred->compound) - filter_free_subsystem_preds(system); + __filter_free_subsystem_preds(system); if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) + if (!system->preds) { + mutex_unlock(&filter_mutex); return -ENOMEM; + } } - if (system->n_preds == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) { + mutex_unlock(&filter_mutex); return -ENOSPC; + } system->preds[system->n_preds] = pred; + system->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - if (!find_event_field(call, pred->field_name)) - continue; - - err = filter_add_pred(call, pred); + err = __filter_add_pred(call, pred); if (err == -ENOMEM) { system->preds[system->n_preds] = NULL; + system->n_preds--; + mutex_unlock(&filter_mutex); return err; } } - system->n_preds++; + mutex_unlock(&filter_mutex); return 0; } From b0afdc126d0515e76890f0a5f26b28501cfa298e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 13:02:22 -0400 Subject: [PATCH 210/900] tracing/events: enable code with EVENT_TRACING not EVENT_TRACER The CONFIG_EVENT_TRACER is the way to turn on event tracing when no other tracing has been configured. All code to get enabled should depend on CONFIG_EVENT_TRACING. That is what is enabled when TRACING (or CONFIG_EVENT_TRACER) is selected. This patch enables the include/trace/ftrace.h file when CONFIG_EVENT_TRACING is enabled. 
[ Impact: fix warning in event tracer selftest ] Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 18869417109..7f1f23d601e 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -56,7 +56,7 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#ifdef CONFIG_EVENT_TRACER +#ifdef CONFIG_EVENT_TRACING #include #endif From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:01:56 -0400 Subject: [PATCH 211/900] tracing: add EXPORT_SYMBOL_GPL for trace commits Not all the necessary symbols were exported to allow for tracing by modules. This patch adds them in. [ Impact: allow modules to commit data to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f681f646aa0..183d788038e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } +EXPORT_SYMBOL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } +EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 21:41:52 -0400 Subject: [PATCH 212/900] tracing: 
add same level recursion detection The tracing infrastructure allows for recursion. That is, an interrupt may interrupt the act of tracing an event, and that interrupt may very well perform its own trace. This is a recursive trace, and is fine to do. The problem arises when there is a bug, and the utility doing the trace calls something that recurses back into the tracer. This recursion is not caused by an external event like an interrupt, but by code that is not expected to recurse. The result could be a lockup. This patch adds a bitmask to the task structure that keeps track of the trace recursion. To find the interrupt depth, the following algorithm is used: level = hardirq_count() + softirq_count() + in_nmi; Here, level will be the depth of interrupts and softirqs, and even handles the nmi. Then the corresponding bit is set in the recursion bitmask. If the bit was already set, we know we had a recursion at the same level and we warn about it and fail the writing to the buffer. After the data has been committed to the buffer, we clear the bit. No atomics are needed. The only races are with interrupts and they reset the bitmask before returning anyway. 
[ Impact: detect same irq level trace recursion ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +++++++ include/linux/init_task.h | 1 + include/linux/sched.h | 4 +++- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 97c83e1bc58..39b95c56587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -488,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) extern int ftrace_dump_on_oops; +#ifdef CONFIG_PREEMPT +#define INIT_TRACE_RECURSION .trace_recursion = 0, +#endif + #endif /* CONFIG_TRACING */ +#ifndef INIT_TRACE_RECURSION +#define INIT_TRACE_RECURSION +#endif #ifdef CONFIG_HW_BRANCH_TRACER diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dcfb93337e9..6fc21852986 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -187,6 +187,7 @@ extern struct cred init_cred; INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..7ede5e49091 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1428,7 +1428,9 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; -#endif + /* bitmask of trace recursion */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 84a6055f37c..b421b0ea911 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static int trace_irq_level(void) +{ + return hardirq_count() + softirq_count() + in_nmi(); +} + +static int trace_recursive_lock(void) +{ + int level; + + level = trace_irq_level(); + + if (unlikely(current->trace_recursion & (1 << level))) { + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + WARN_ON_ONCE(1); + return -1; + } + + current->trace_recursion |= 1 << level; + + return 0; +} + +static void trace_recursive_unlock(void) +{ + int level; + + level = trace_irq_level(); + + WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + + current->trace_recursion &= ~(1 << level); +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:13:55 -0400 Subject: [PATCH 213/900] tracing: protect trace_printk from recursion trace_printk can be called from any context, including NMIs. 
If this happens, then we must test for recursion before grabbing any spinlocks. This patch prevents trace_printk from being called recursively. [ Impact: prevent hard lockup in lockdep event tracer ] Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183d788038e..b9a3adce922 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; + int disable; int resched; int cpu, len = 0, size, pc; @@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; /* Lockdep uses trace_printk for lock tracing */ @@ -1301,6 +1303,7 @@ out_unlock: local_irq_restore(flags); out: + atomic_dec_return(&data->disabled); ftrace_preempt_enable(resched); unpause_graph_tracing(); @@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) int cpu, len = 0, size, pc; struct print_entry *entry; unsigned long irq_flags; + int disable; if (tracing_disabled || tracing_selftest_running) return 0; @@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; pause_graph_tracing(); @@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: + atomic_dec_return(&data->disabled); preempt_enable_notrace(); return len; 
From 8e668b5b3455207e4540fc7ccab9ecf70142f288 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 17:17:55 -0400 Subject: [PATCH 214/900] tracing: remove format attribute of inline function Due to a cut and paste error, I added the gcc attribute for printf format to the static inline stub of trace_seq_printf. This will cause a compile failure. [ Impact: fix compiler error when CONFIG_TRACING is off ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: =?ISO-8859-15?Q?Fr=E9d=E9ric_Weisbecker?= LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/trace_seq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 15ca2c71af1..37db9bdfbc1 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -42,7 +42,6 @@ extern int trace_seq_path(struct trace_seq *s, struct path *path); #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) { return 0; } From 2b2fd87a6ef56ba7647a578e81bb8c8efda166b8 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:12 +0800 Subject: [PATCH 215/900] docs, x86: add nox2apic back to kernel-parameters.txt "nox2apic" was removed from kernel-parameters.txt by mistake, when entries were sorted in alpha order (commit 0cb55ad2). But this early parameter is still there, add it back to kernel-parameters.txt. 
[ Impact: add boot parameter description ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: Randy Dunlap Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-2-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6172e4360f6..33989d284ff 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1588,6 +1588,8 @@ and is between 256 and 4096 characters. It is defined in the file nowb [ARM] + nox2apic [X86-64,APIC] Do not enable x2APIC mode. + nptcg= [IA64] Override max number of concurrent global TLB purges which is reported from either PAL_VM_SUMMARY or SAL PALO. From 5d0ae2db6deac4f15dac4f42f23bc56448fc8d4d Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:13 +0800 Subject: [PATCH 216/900] x86, intr-remap: fix ack for interrupt remapping Shouldn't call ack_apic_edge() in ir_ack_apic_edge(), because ack_apic_edge() does more than just ack: it also does irq migration in the non-interrupt-remapping case. But there is no such need for interrupt-remapping case, as irq migration is done in the process context. Similarly, ir_ack_apic_level() shouldn't call ack_apic_level, and instead should do the local cpu's EOI + directed EOI to the io-apic. ack_x2APIC_irq() is not necessary, because ack_APIC_irq() will use MSR write for x2apic, and uncached write for non-x2apic. 
[ Impact: simplify/standardize intr-remap IRQ acking, fix on !x2apic ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-3-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 -------- arch/x86/kernel/apic/io_apic.c | 32 +++++--------------------------- 2 files changed, 5 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2bd5a463fd1..d4cb7e590c0 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -204,14 +204,6 @@ static inline int x2apic_enabled(void) extern int get_physical_broadcast(void); -#ifdef CONFIG_X86_X2APIC -static inline void ack_x2APIC_irq(void) -{ - /* Docs say use 0 for future compatibility */ - native_apic_msr_write(APIC_EOI, 0); -} -#endif - extern void apic_disable(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 84990002240..ea22a86e3cd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2552,20 +2552,6 @@ eoi_ioapic_irq(struct irq_desc *desc) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2629,9 +2615,6 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. 
@@ -2680,20 +2663,15 @@ static void ack_apic_level(unsigned int irq) #ifdef CONFIG_INTR_REMAP static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ From 937582382c71b75b29fbb92615629494e1a05ac0 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:14 +0800 Subject: [PATCH 217/900] x86, intr-remap: enable interrupt remapping early Currently, when x2apic is not enabled, interrupt remapping will be enabled in init_dmars(), where it is too late to remap ioapic interrupts, that is, ioapic interrupts are really in compatibility mode, not remappable mode. This patch always enables interrupt remapping before ioapic setup, it guarantees all interrupts will be remapped when interrupt remapping is enabled. Thus it doesn't need to set the compatibility interrupt bit. 
[ Impact: refactor intr-remap init sequence, enable fuller remap mode ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-4-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 7 ++-- arch/x86/kernel/apic/apic.c | 80 +++++++++++++++++------------------- drivers/pci/intel-iommu.c | 9 ---- drivers/pci/intr_remapping.c | 28 ++++++------- include/linux/dmar.h | 1 + 5 files changed, 56 insertions(+), 69 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d4cb7e590c0..fbdd65446c7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -169,7 +169,6 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic, x2apic_phys; extern void check_x2apic(void); extern void enable_x2apic(void); -extern void enable_IR_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); static inline int x2apic_enabled(void) { @@ -190,18 +189,18 @@ static inline void check_x2apic(void) static inline void enable_x2apic(void) { } -static inline void enable_IR_x2apic(void) -{ -} static inline int x2apic_enabled(void) { return 0; } #define x2apic 0 +#define x2apic_preenabled 0 #endif +extern void enable_IR_x2apic(void); + extern int get_physical_broadcast(void); extern void apic_disable(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 83e47febcc8..0cf1eea750c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,6 +141,8 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { + if (x2apic_enabled()) + panic("Bios already enabled x2apic, can't enforce nox2apic"); disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; @@ -1345,6 +1347,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | 
X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1353,32 +1356,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; - } - - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; - } - ret = dmar_table_init(); if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; + } - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; + } + + + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1398,20 +1390,25 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. 
But IR enabling failed"); - } +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic) + ret = enable_intr_remapping(EIM_32BIT_APIC_ID); + else +#endif + ret = enable_intr_remapping(EIM_8BIT_APIC_ID); if (ret) goto end_restore; - if (!x2apic) { + pr_info("Enabled Interrupt-remapping\n"); + +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic && !x2apic) { x2apic = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } +#endif end_restore: if (ret) @@ -1426,30 +1423,29 @@ end_restore: local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 001b328adf8..9ce8f0764be 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1968,15 +1968,6 @@ static int __init init_dmars(void) } } -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) { - ret = enable_intr_remapping(0); - if (ret) - printk(KERN_ERR - "IOMMU: enable interrupt remapping failed\n"); - } -#endif - /* * For each rmrr * for each dev attached to rmrr diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f5e0ea724a6..5c2142656e9 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -423,20 +423,6 @@ static 
void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) readl, (sts & DMA_GSTS_IRTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); - if (mode == 0) { - spin_lock_irqsave(&iommu->register_lock, flags); - - /* enable comaptiblity format interrupt pass through */ - cmd = iommu->gcmd | DMA_GCMD_CFI; - iommu->gcmd |= DMA_GCMD_CFI; - writel(cmd, iommu->reg + DMAR_GCMD_REG); - - IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_CFIS), sts); - - spin_unlock_irqrestore(&iommu->register_lock, flags); - } - /* * global invalidation of interrupt entry cache before enabling * interrupt-remapping. @@ -516,6 +502,20 @@ end: spin_unlock_irqrestore(&iommu->register_lock, flags); } +int __init intr_remapping_supported(void) +{ + struct dmar_drhd_unit *drhd; + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (!ecap_ir_support(iommu->ecap)) + return 0; + } + + return 1; +} + int __init enable_intr_remapping(int eim) { struct dmar_drhd_unit *drhd; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e397dc342cd..06f592a7f73 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -108,6 +108,7 @@ struct irte { }; #ifdef CONFIG_INTR_REMAP extern int intr_remapping_enabled; +extern int intr_remapping_supported(void); extern int enable_intr_remapping(int); extern void disable_intr_remapping(void); extern int reenable_intr_remapping(int); From 03ea81550676296d94596e4337c771c6ba29f542 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:15 +0800 Subject: [PATCH 218/900] x86, intr-remap: add option to disable interrupt remapping Add option "nointremap" to disable interrupt remapping. 
[ Impact: add new boot option ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-5-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 +++ drivers/pci/intr_remapping.c | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 33989d284ff..843cb6646d8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1533,6 +1533,9 @@ and is between 256 and 4096 characters. It is defined in the file noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. + nointremap [X86-64, Intel-IOMMU] Do not enable interrupt + remapping. + nointroute [IA-64] nojitter [IA64] Disables jitter checking for ITC timers. diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 5c2142656e9..842039e4955 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static int ir_ioapic_num; int intr_remapping_enabled; +static int disable_intremap; +static __init int setup_nointremap(char *str) +{ + disable_intremap = 1; + return 0; +} +early_param("nointremap", setup_nointremap); + struct irq_2_iommu { struct intel_iommu *iommu; u16 irte_index; @@ -506,6 +514,9 @@ int __init intr_remapping_supported(void) { struct dmar_drhd_unit *drhd; + if (disable_intremap) + return 0; + for_each_drhd_unit(drhd) { struct intel_iommu *iommu = drhd->iommu; From 9a2755c3569e4db92bd9b1daadeddb4045b0cccd Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:16 +0800 Subject: [PATCH 219/900] x86, intr-remap: fix x2apic/intr-remap resume Interrupt remapping was decoupled from x2apic. Shouldn't check x2apic before resume interrupt remapping. 
Otherwise, interrupt remapping won't be resumed when x2apic is not enabled. [ Impact: fix potential intr-remap resume hang on !x2apic ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-6-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0cf1eea750c..7b41a32339e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2032,7 +2032,7 @@ static int lapic_resume(struct sys_device *dev) return 0; local_irq_save(flags); - if (x2apic) { + if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); @@ -2048,8 +2048,10 @@ static int lapic_resume(struct sys_device *dev) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - enable_x2apic(); } + + if (x2apic) + enable_x2apic(); #else if (!apic_pm_state.active) return 0; @@ -2097,10 +2099,12 @@ static int lapic_resume(struct sys_device *dev) apic_read(APIC_ESR); #ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - reenable_intr_remapping(EIM_32BIT_APIC_ID); + if (intr_remapping_enabled) { + if (x2apic) + reenable_intr_remapping(EIM_32BIT_APIC_ID); + else + reenable_intr_remapping(EIM_8BIT_APIC_ID); - if (x2apic) { unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); @@ -2109,7 +2113,6 @@ static int lapic_resume(struct sys_device *dev) local_irq_restore(flags); - return 0; } From cece3155d869a50ba534ed161b5a05e8a29dcad0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 18 Apr 2009 23:45:28 +0400 Subject: [PATCH 220/900] x86: smpboot - wakeup_secondary should be done via __cpuinit section A caller (do_boot_cpu) already has 
__cpuinit attribute. Since HOTPLUG_CPU depends on SMP && HOTPLUG it doesn't lead to panic at moment. [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090418194528.GD25510@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bf8ad6344b1..d2e8de95815 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __devinit +int __cpuinit wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __devinit +static int __cpuinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; From 667c5296cc76fefe0abcb79228952b28d9af45e3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 19 Apr 2009 11:43:11 +0400 Subject: [PATCH 221/900] x86: es7000, uv - use __cpuinit for kicking secondary cpus The caller already has __cpuinit attribute. 
[ Impact: save memory, address section mismatch warning ] Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Pavel Emelyanov LKML-Reference: <20090419074311.GA8670@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 1c11b819f24..8e07c141866 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index de1a50af807..873bf7121e8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -91,7 +91,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:38:12 +0200 Subject: [PATCH 222/900] tracing/core: Add current context on tracing recursion warning In case of tracing recursion detection, we only get the stacktrace. But the current context may be very useful to debug the issue. This patch adds the softirq/hardirq/nmi context with the warning using lockdep context display to have a familiar output. v2: Use printk_once() v3: drop {hardirq,softirq}_context which depend on lockdep, only keep what is part of current->trace_recursion, sufficient to debug the warning source. 
[ Impact: print context necessary to debug recursion ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b421b0ea911..bffde630c4e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void) if (unlikely(current->trace_recursion & (1 << level))) { /* Disable all tracing before we do anything else */ tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: " + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + WARN_ON_ONCE(1); return -1; } From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:39:33 +0200 Subject: [PATCH 223/900] tracing/ring-buffer: Add unlock recursion protection on discard The pair of helpers trace_recursive_lock() and trace_recursive_unlock() have been introduced recently to provide generic tracing recursion protection. They are used in a symmetric way: - trace_recursive_lock() on buffer reserve - trace_recursive_unlock() on buffer commit However sometimes, we don't commit but discard on entry to the buffer, ie: in case of filter checking. Then we must also unlock the recursion protection on discard time, otherwise the tracing gets definitely deactivated and a warning is raised spuriously, such as: [ 111.119821] ------------[ cut here ]------------ [ 111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0() [ 111.119835] Hardware name: AMILO Li 2727 [ 111.119839] Modules linked in: [ 111.119846] Pid: 5731, comm: Xorg Tainted: G W 2.6.30-rc1 #69 [ 111.119851] Call Trace: [ 111.119863] [] warn_slowpath+0xd8/0x130 [ 111.119873] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119882] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119891] [] ? 
native_sched_clock+0x20/0x70 [ 111.119899] [] ? put_lock_stats+0xe/0x30 [ 111.119906] [] ? lock_release_holdtime+0xa8/0x150 [ 111.119913] [] ring_buffer_lock_reserve+0x1b7/0x1d0 [ 111.119921] [] trace_buffer_lock_reserve+0x30/0x70 [ 111.119930] [] trace_current_buffer_lock_reserve+0x20/0x30 [ 111.119939] [] ftrace_raw_event_sched_switch+0x58/0x100 [ 111.119948] [] __schedule+0x3a7/0x4cd [ 111.119957] [] ? ftrace_call+0x5/0x2b [ 111.119964] [] ? ftrace_call+0x5/0x2b [ 111.119971] [] schedule+0x18/0x40 [ 111.119977] [] preempt_schedule+0x39/0x60 [ 111.119985] [] _read_unlock+0x53/0x60 [ 111.119993] [] sock_def_readable+0x72/0x80 [ 111.120002] [] unix_stream_sendmsg+0x24d/0x3d0 [ 111.120011] [] sock_aio_write+0x143/0x160 [ 111.120019] [] ? ftrace_call+0x5/0x2b [ 111.120026] [] ? sock_aio_write+0x0/0x160 [ 111.120033] [] ? sock_aio_write+0x0/0x160 [ 111.120042] [] do_sync_readv_writev+0xf3/0x140 [ 111.120049] [] ? ftrace_call+0x5/0x2b [ 111.120057] [] ? autoremove_wake_function+0x0/0x40 [ 111.120067] [] ? cap_file_permission+0x9/0x10 [ 111.120074] [] ? security_file_permission+0x16/0x20 [ 111.120082] [] do_readv_writev+0xd4/0x1f0 [ 111.120089] [] ? ftrace_call+0x5/0x2b [ 111.120097] [] ? 
ftrace_call+0x5/0x2b [ 111.120105] [] vfs_writev+0x48/0x70 [ 111.120111] [] sys_writev+0x55/0xc0 [ 111.120119] [] system_call_fastpath+0x16/0x1b [ 111.120125] ---[ end trace 15605f4e98d5ccb5 ]--- [ Impact: fix spurious warning triggering tracing shutdown ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bffde630c4e..e145969a8ed 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + /** * ring_buffer_event_discard - discard any event in the ring buffer * @event: the event to discard @@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); */ void ring_buffer_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; + rb_event_discard(event); + trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); @@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, int cpu; /* The event is discarded regardless */ - ring_buffer_event_discard(event); + rb_event_discard(event); /* * This must only be called if the event has not been @@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, if (rb_is_commit(cpu_buffer, event)) rb_set_commit_to_write(cpu_buffer); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. 
*/ From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:27:58 -0400 Subject: [PATCH 224/900] tracing: change branch profiling to a choice selection This patch makes the branch profiling into a choice selection: None - no branch profiling likely/unlikely - only profile likely/unlikely branches all - profile all branches The all profiler will also enable the likely/unlikely branches. This does not change the way the profiler works or the dependencies between the profilers. What this patch does, is keep the branch profiling from being selected by an allyesconfig make. The branch profiler is very intrusive and it is known to break various architecture builds when selected as an allyesconfig. [ Impact: prevent branch profiler from being selected in allyesconfig ] Reported-by: Heiko Carstens Reported-by: Al Viro Reported-by: Stephen Rothwell Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 57981d338d1..3ee28db69be 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -212,8 +212,36 @@ config BOOT_TRACER to enable this on bootup. config TRACE_BRANCH_PROFILING - bool "Trace likely/unlikely profiler" + bool select TRACING + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if statement in the + kernel. This profiler will also enable the likely/unlikely + profiler as well. + + Either of the above profilers add a bit of overhead to the system. + If unsure choose "No branch profiling". 
+ +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING help This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: @@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. - Say N if unsure. - config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" - depends on TRACE_BRANCH_PROFILING + select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. @@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES /debugfs/tracing/profile_branch + This option also enables the likely/unlikely profiler. + This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system is to be analyzed - - Say N if unsure. +endchoice config TRACING_BRANCHES bool From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:47:36 -0400 Subject: [PATCH 225/900] tracing: create menuconfig for tracing infrastructure During testing we often use randconfig to test various kernels. The current configuration set up does not give an easy way to disable all tracing with a single config. The case where randconfig would test all tracing disabled is very unlikely. This patch adds a config option to enable or disable all tracing. It is hooked into the tracing menu just like other submenus are done. 
[ Impact: allow randconfig to easily produce all traces disabled ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee28db69be..3fa36d2bc29 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,12 @@ config TRACING_SUPPORT if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE + bool "Tracers" + help + Enable the kernel tracing infrastructure. + +if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" @@ -462,7 +467,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. -endmenu +endif # FTRACE endif # TRACING_SUPPORT From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:59:34 -0400 Subject: [PATCH 226/900] tracing: rename EVENT_TRACER config to ENABLE_EVENT_TRACING Currently we have two configs: EVENT_TRACING and EVENT_TRACER. All tracers enable EVENT_TRACING. The EVENT_TRACER is only a convenience to enable the EVENT_TRACING when no other tracers are enabled. The names EVENT_TRACER and EVENT_TRACING are too similar and confusing. This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more appropriate to what it actually does, as well as add a comment in the help menu to explain the option's purpose. [ Impact: rename config option to reduce confusion ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3fa36d2bc29..450d3c2cfbd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. 
-config EVENT_TRACER +config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" select TRACING help @@ -191,6 +191,10 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. + Note, all tracers enable event tracing. This option is + only a convenience to enable event tracing when no other + tracers are selected. + config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:12:44 -0400 Subject: [PATCH 227/900] tracing/events: call the correct event trace selftest init function The late_initcall calls a helper function instead of the proper init event selftest function. This update may have been lost due to conflicting merges. [ Impact: fix compiler warning and call extended event trace self tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64f9d6d2735..98daf5dc74a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void) return 0; } -late_initcall(event_trace_self_tests); +late_initcall(event_trace_self_tests_init); #endif From 23de29de2d8b227943be191d59fb6d983996d55e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:59:29 -0400 Subject: [PATCH 228/900] tracing: remove dangling semicolon Due to a cut and paste error, the trace_seq_putc had a semicolon after the prototype but before the stub function when tracing is disabled. 
[Impact: fix compile error ] Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 37db9bdfbc1..ba9627f00d3 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -63,7 +63,7 @@ static inline int trace_seq_puts(struct trace_seq *s, const char *str) { return 0; } -static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c) { return 0; } From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:24:21 -0400 Subject: [PATCH 229/900] tracing: fix recursive test level calculation The recursive tests to detect same level recursion in the ring buffers did not account for the hard/softirq_counts to be shifted. Thus the numbers could be larger than the mask to be tested. This patch includes the shift for the calculation of the irq depth. [ Impact: stop false positives in trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e145969a8ed..aa40ae92233 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, static int trace_irq_level(void) { - return hardirq_count() + softirq_count() + in_nmi(); + return (hardirq_count() >> HARDIRQ_SHIFT) + + (softirq_count() >> + SOFTIRQ_SHIFT) + + !!in_nmi(); } static int trace_recursive_lock(void) From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:32:44 -0400 Subject: [PATCH 230/900] tracing: remove recursive test from ring_buffer_event_discard The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve. 
It can be called inside or outside the reserve/commit. Even if it is called inside the reserve/commit the commit part must also be called. Only ring_buffer_discard_commit can be used as a replacement for ring_buffer_unlock_commit. This patch removes the trace_recursive_unlock from ring_buffer_event_discard since it would be the wrong place to do so. [Impact: prevent breakage in trace recursive testing ] Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index aa40ae92233..a6997670cc4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event) void ring_buffer_event_discard(struct ring_buffer_event *event) { rb_event_discard(event); - trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); From ff743345bf7685a207868048a70e23164c4785e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:26 +0100 Subject: [PATCH 231/900] sched: remove extra call overhead for schedule() Lai Jiangshan's patch reminded me that I promised Nick to remove that extra call overhead in schedule(). 
Signed-off-by: Peter Zijlstra LKML-Reference: <20090313112300.927414207@chello.nl> Signed-off-by: Ingo Molnar --- kernel/mutex.c | 4 +++- kernel/sched.c | 12 ++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a..e1fb7351040 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -248,7 +248,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 7601ceebf7c..797f6fdabad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5131,13 +5131,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5194,15 +5196,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 16:16:11 -0400 Subject: [PATCH 232/900] tracing: use recursive counter over irq level Although using the irq level (hardirq_count, softirq_count and in_nmi) was nice to detect bad recursion right away, since the counters are not atomically updated with respect to the interrupts, the function tracer might trigger the test from an interrupt 
handler before the hardirq_count is updated. This will trigger a false warning. This patch converts the recursive detection to a simple counter. If the depth is greater than 16 then the recursive detection will trigger. 16 is more than enough for any nested interrupts. [ Impact: fix false positive trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 45 ++++++++++++++------------------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6997670cc4..7bcfd3e6053 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static int trace_irq_level(void) -{ - return (hardirq_count() >> HARDIRQ_SHIFT) + - (softirq_count() >> + SOFTIRQ_SHIFT) + - !!in_nmi(); -} +#define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) { - int level; + current->trace_recursion++; - level = trace_irq_level(); + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; - if (unlikely(current->trace_recursion & (1 << level))) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); - printk_once(KERN_WARNING "Tracing recursion: " - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); - WARN_ON_ONCE(1); - return -1; - } - - current->trace_recursion |= 1 << level; - - return 0; + WARN_ON_ONCE(1); + return -1; } static void trace_recursive_unlock(void) { - int level; + WARN_ON_ONCE(!current->trace_recursion); - level = trace_irq_level(); - - 
WARN_ON_ONCE(!current->trace_recursion & (1 << level)); - - current->trace_recursion &= ~(1 << level); + current->trace_recursion--; } static DEFINE_PER_CPU(int, rb_need_resched); From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 18:16:44 -0400 Subject: [PATCH 233/900] tracing: use nowakeup version of commit for function event trace tests The startup tests for the event tracer also runs with the function tracer enabled. The "wakeup" version of the trace commit was used which can grab spinlocks. If a task was preempted by an NMI that called a function being traced, it could deadlock due to the function tracer trying to grab the same lock. Thanks to Frederic Weisbecker for pointing out where the bug was. Reported-by: Ingo Molnar Reported-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 98daf5dc74a..672b195f86c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_current_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(event, flags, pc); out: atomic_dec(&per_cpu(test_event_disable, cpu)); From 6e29ec5701e9d44fa02b96c1c5c45f7516182b65 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 21 Apr 2009 08:40:49 +0530 Subject: [PATCH 234/900] sched: Replace first_cpu() with cpumask_first() in ILB nomination code Stephen Rothwell reported this build warning: > kernel/sched.c: In function 'find_new_ilb': > kernel/sched.c:4355: warning: passing argument 1 of '__first_cpu' from incompatible pointer type > > Possibly caused by commit f711f6090a81cbd396b63de90f415d33f563af9b > ("sched: Nominate idle load balancer from a semi-idle package") from > the sched 
tree. Should this call to first_cpu be cpumask_first? For !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT), find_new_ilb() nominates the Idle load balancer as the first cpu from the nohz.cpu_mask. This code uses the older API first_cpu(). Replace it with cpumask_first(), which is the correct API here. [ Impact: cleanup, address build warning ] Reported-by: Stephen Rothwell Signed-off-by: Gautham R Shenoy Cc: Rusty Russell LKML-Reference: <20090421031049.GA4140@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 797f6fdabad..54d67b94f1a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4356,7 +4356,7 @@ out_done: #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return first_cpu(nohz.cpu_mask); + return cpumask_first(nohz.cpu_mask); } #endif From fc1edaf9e7cc4d4696f83dee495b8f158d01c4eb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:27 -0700 Subject: [PATCH 235/900] x86: x2apic, IR: Clean up X86_X2APIC and INTR_REMAP config checks Add x2apic_supported() to clean up CONFIG_X86_X2APIC checks. Fix CONFIG_INTR_REMAP checks. 
[ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Suresh Siddha Cc: Weidong Han LKML-Reference: <20090420200450.128993000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 10 +++--- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/include/asm/irq_remapping.h | 2 +- arch/x86/kernel/apic/apic.c | 49 +++++++--------------------- arch/x86/kernel/apic/io_apic.c | 2 -- arch/x86/kernel/apic/probe_64.c | 2 +- include/linux/dmar.h | 2 ++ 7 files changed, 21 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index fbdd65446c7..3738438a91f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -#define EIM_8BIT_APIC_ID 0 -#define EIM_32BIT_APIC_ID 1 +extern int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* @@ -166,7 +165,7 @@ static inline u64 native_x2apic_icr_read(void) return val; } -extern int x2apic, x2apic_phys; +extern int x2apic_phys; extern void check_x2apic(void); extern void enable_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); @@ -182,6 +181,8 @@ static inline int x2apic_enabled(void) return 1; return 0; } + +#define x2apic_supported() (cpu_has_x2apic) #else static inline void check_x2apic(void) { @@ -194,9 +195,8 @@ static inline int x2apic_enabled(void) return 0; } -#define x2apic 0 #define x2apic_preenabled 0 - +#define x2apic_supported() 0 #endif extern void enable_IR_x2apic(void); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9d826e43601..34eaa37f7ad 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -161,7 +161,6 @@ extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); -#ifdef CONFIG_X86_64 
extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); @@ -169,7 +168,6 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void reinit_intr_remapped_IO_APIC(int intr_remapping, struct IO_APIC_route_entry **ioapic_entries); -#endif extern void probe_nr_irqs_gsi(void); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0396760fccb..f275e224450 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -1,6 +1,6 @@ #ifndef _ASM_X86_IRQ_REMAPPING_H #define _ASM_X86_IRQ_REMAPPING_H -#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) +#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7b41a32339e..2b30e520dce 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -134,8 +134,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; static int disable_x2apic; @@ -858,7 +858,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1330,7 +1330,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1338,7 +1338,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1390,25 
+1390,17 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic) - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - else -#endif - ret = enable_intr_remapping(EIM_8BIT_APIC_ID); - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; pr_info("Enabled Interrupt-remapping\n"); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic && !x2apic) { - x2apic = 1; + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); pr_info("Enabled x2apic\n"); } -#endif end_restore: if (ret) @@ -1576,7 +1568,7 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } @@ -2010,10 +2002,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -2023,8 +2015,6 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP int ret; struct IO_APIC_route_entry **ioapic_entries = NULL; @@ -2050,17 +2040,8 @@ static int lapic_resume(struct sys_device *dev) mask_8259A(); } - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#else - if (!apic_pm_state.active) - return 0; - - local_irq_save(flags); - if (x2apic) - enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2098,18 +2079,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { - if (x2apic) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - else - reenable_intr_remapping(EIM_8BIT_APIC_ID); - + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); 
free_ioapic_entries(ioapic_entries); } -#endif local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ea22a86e3cd..3a45d2ec974 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -736,7 +736,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -857,7 +856,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e..bc3e880f9b8 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 06f592a7f73..10ff5c49882 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -158,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic) } #define irq_remapped(irq) (0) #define enable_intr_remapping(mode) (-1) +#define disable_intr_remapping() (0) +#define reenable_intr_remapping(mode) (0) #define intr_remapping_enabled (0) #endif From 25629d810a52176758401184d9b437fbb7f79195 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:28 -0700 Subject: [PATCH 236/900] x86: x2apic, IR: Move eoi_ioapic_irq() into a CONFIG_INTR_REMAP section Address the following compiler warning: arch/x86/kernel/apic/io_apic.c:2543: warning: `eoi_ioapic_irq' defined but not used By moving that function (and __eoi_ioapic_irq()) into an existing #ifdef 
CONFIG_INTR_REMAP section of the code. [ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Weidong Han LKML-Reference: <20090420200450.271099000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar Cc: Weidong Han --- arch/x86/kernel/apic/io_apic.c | 66 +++++++++++++++++----------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3a45d2ec974..4baa9cbd630 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2517,39 +2517,6 @@ static void irq_complete_move(struct irq_desc **descp) static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2659,6 +2626,39 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + 
spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); From 39d83a5d684a457046aa2a6dac60f105966e78e9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:29 -0700 Subject: [PATCH 237/900] x86: x2apic, IR: Clean up panic() with nox2apic boot option Instead of calling panic(), ignore the "nox2apic" boot option when BIOS has already enabled x2apic prior to OS handover. [ Impact: printk warning instead of panic() when BIOS has enabled x2apic already ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Weidong Han LKML-Reference: <20090420200450.425091000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2b30e520dce..d32f5589f1d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,8 +141,12 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { - if (x2apic_enabled()) - panic("Bios already enabled x2apic, can't enforce nox2apic"); + if (x2apic_enabled()) { + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; From 9d6c26e73bd248c286bb3597aaf788716e8fcceb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:31 -0700 Subject: [PATCH 238/900] x86: x2apic, IR: Make config X86_UV dependent on X86_X2APIC Instead of selecting X86_X2APIC, make config X86_UV dependent on X86_X2APIC. This will eliminate enabling CONFIG_X86_X2APIC without enabling CONFIG_INTR_REMAP. 
[ Impact: cleanup ] Signed-off-by: Suresh Siddha Acked-by: Jack Steiner Cc: dwmw2@infradead.org Cc: Suresh Siddha Cc: Weidong Han LKML-Reference: <20090420200450.694598000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9086e6307a..58fb7b3bcd1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -354,7 +354,7 @@ config X86_UV depends on X86_64 depends on X86_EXTENDED_PLATFORM depends on NUMA - select X86_X2APIC + depends on X86_X2APIC ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. From 89388913f2c88a2cd15d24abab571b17a2596127 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 21 Apr 2009 11:39:27 +0300 Subject: [PATCH 239/900] x86: unify noexec handling This patch unifies noexec handling on 32-bit and 64-bit. [ Impact: cleanup ] Signed-off-by: Pekka Enberg [ mingo@elte.hu: build fix ] LKML-Reference: <1240303167.771.69.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 1 - arch/x86/mm/init.c | 67 ++++++++++++++++++++++++++-- arch/x86/mm/init_32.c | 52 --------------------- arch/x86/mm/init_64.c | 33 -------------- 4 files changed, 63 insertions(+), 90 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b8238dc8786..4d258ad76a0 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,7 +273,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; -extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd3da1dda1c..fedde5359a0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -22,6 +22,69 @@ int direct_gbpages #endif ; +int 
nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +static inline void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { @@ -158,12 +221,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif /* Enable PSE if available */ if (cpu_has_pse) @@ -174,7 +234,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } -#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 749559ed80f..2b27120665b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -587,61 +587,9 @@ void 
zap_low_mappings(void) flush_tlb_all(); } -int nx_enabled; - pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); -#ifdef CONFIG_X86_PAE - -static int disable_nx __initdata; - -/* - * noexec = on|off - * - * Control non executable mappings. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str || !strcmp(str, "on")) { - if (cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } - } else { - if (!strcmp(str, "off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else { - return -EINVAL; - } - } - - return 0; -} -early_param("noexec", noexec_setup); - -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#endif - /* user-defined highmem size */ static unsigned int highmem_pages = -1; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1753e8020df..a4e7846efb1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -85,39 +85,6 @@ early_param("gbpages", parse_direct_gbpages_on); pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int disable_nx __cpuinitdata; - -/* - * noexec=on|off - * Control non-executable mappings for 64-bit processes. 
- * - * on Enable (default) - * off Disable - */ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - int force_personality32; /* From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:11:46 +0800 Subject: [PATCH 240/900] tracing/filters: don't remove old filters when failed to write subsys->filter If writing subsys->filter returns EINVAL or ENOSPC, the original filters in subsys/ and subsys/events/ will be removed. This is definitely wrong. [ Impact: fix filter setting semantics on error condition ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 672b195f86c..9ea55a7dfde 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = filter_add_subsystem_pred(system, pred); if (err < 0) { - filter_free_subsystem_preds(system); filter_free_pred(pred); return err; } From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:12:11 +0800 Subject: [PATCH 241/900] tracing/filters: allow user-input to be integer-like string Suppose we would like to trace all tasks named '123', but this will fail: # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter bash: echo: 
write error: Invalid argument Don't guess the type of the filter pred in filter_parse(), but instead we check it in __filter_add_pred(). [ Impact: extend allowed filter field string values ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e0fcfd2a16d..65418288f95 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call, { struct ftrace_event_field *field; filter_pred_fn_t fn; + unsigned long long val; field = find_event_field(call, pred->field_name); if (!field) @@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call, pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_len) - return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; return filter_add_pred_fn(call, pred, fn); } else { - if (pred->str_len) + if (strict_strtoull(pred->str_val, 0, &val)) return -EINVAL; + pred->val = val; } switch (field->size) { @@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, return 0; } +/* + * The filter format can be + * - 0, which means remove all filter preds + * - [||/&&] ==/!= + */ int filter_parse(char **pbuf, struct filter_pred *pred) { - char *tmp, *tok, *val_str = NULL; + char *tok, *val_str = NULL; int tok_n = 0; - /* field ==/!= number, or/and field ==/!= number, number */ while ((tok = strsep(pbuf, " \n"))) { if (tok_n == 0) { if (!strcmp(tok, "0")) { @@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred) return -EINVAL; } + strcpy(pred->str_val, val_str); + pred->str_len = strlen(val_str); + pred->field_name = 
kstrdup(pred->field_name, GFP_KERNEL); if (!pred->field_name) return -ENOMEM; - pred->str_len = 0; - pred->val = simple_strtoull(val_str, &tmp, 0); - if (tmp == val_str) { - strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); - pred->str_len = strlen(val_str); - pred->str_val[pred->str_len] = '\0'; - } else if (*tmp != '\0') - return -EINVAL; - return 0; } From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Apr 2009 09:41:26 -0400 Subject: [PATCH 242/900] ring-buffer: only warn on wrap if buffer is bigger than two pages On boot up, to save memory, ftrace allocates the minimum buffer which is two pages. Ftrace also goes through a series of tests (when configured) on boot up. These tests can fill up a page within a single interrupt. The ring buffer also has a WARN_ON when it detects that the buffer was completely filled within a single commit (other commits are allowed to be nested). Combine the small buffer on start up, with the tests that can fill more than a single page within an interrupt, this can trigger the WARN_ON. This patch makes the WARN_ON only happen when the ring buffer consists of more than two pages. [ Impact: prevent false WARN_ON in ftrace startup tests ] Reported-by: Ingo Molnar LKML-Reference: <20090421094616.GA14561@elte.hu> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7bcfd3e6053..61dbdf21cd3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. 
*/ if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); + /* This can easily happen on small ring buffers */ + WARN_ON_ONCE(buffer->pages > 2); goto out_reset; } From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Apr 2009 16:53:34 +0800 Subject: [PATCH 243/900] tracing/events: make struct trace_entry->type to be int type struct trace_entry->type is unsigned char, while trace event's id is int type, thus for an event with id >= 256, its entry->type is cast to (id % 256), and then we can't see the trace output of this event. # insmod trace-events-sample.ko # echo foo_bar > /mnt/tracing/set_event # cat /debug/tracing/events/trace-events-sample/foo_bar/id 256 # cat /mnt/tracing/trace_pipe <...>-3548 [001] 215.091142: Unknown type 0 <...>-3548 [001] 216.089207: Unknown type 0 <...>-3548 [001] 217.087271: Unknown type 0 <...>-3548 [001] 218.085332: Unknown type 0 [ Impact: fix output for trace events with id >= 256 ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 ++-- include/trace/ftrace.h | 2 +- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 75f3ac01a87..2a4a4074991 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,7 +16,7 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - unsigned char type; + int type; unsigned char flags; unsigned char preempt_count; int pid; @@ -73,7 +73,7 @@ enum print_line_t { struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); void
trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 39a3351f2e7..15ef08d9add 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -198,7 +198,7 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(unsigned char, type); \ + __common_field(int, type); \ __common_field(unsigned char, flags); \ __common_field(unsigned char, preempt_count); \ __common_field(int, pid); \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9a3adce922..b6183bc9eca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 247948e81b0..7d55bcf50e4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9ea55a7dfde..5d6e879cf87 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s 
%s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(int, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), From ff166cb57a17124af75714a9c11f448f56f1a4a3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:30 -0700 Subject: [PATCH 244/900] x86: x2apic, IR: remove reinit_intr_remapped_IO_APIC() When interrupt-remapping is enabled, we are relying on setup_IO_APIC_irqs() to configure remapped entries in the IO-APIC, which comes little bit later after enabling interrupt-remapping. Meanwhile, restoration of old io-apic entries after enabling interrupt-remapping will not make the interrupts through io-apic functional anyway. So remove the unnecessary reinit_intr_remapped_IO_APIC() step. The longer story: When interrupt-remapping is enabled, IO-APIC entries need to be setup in the re-mappable format (pointing to interrupt-remapping table entries setup by the OS). This remapping configuration is happening in the same place where we traditionally configure IO-APIC (i.e., in setup_IO_APIC_irqs()). So when we enable interrupt-remapping successfully, there is no need to restore old io-apic RTE entries before we actually do a complete configuration shortly in setup_IO_APIC_irqs(). Old IO-APIC RTE's may be in traditional format (non re-mappable) or in re-mappable format pointing to interrupt-remapping table entries setup by BIOS. Restoring both of these will not make IO-APIC functional. We have to rely on setup_IO_APIC_irqs() for proper configuration by OS. So I am removing this unnecessary and broken step. 
[ Impact: remove unnecessary/broken IO-APIC setup step ] Signed-off-by: Suresh Siddha Acked-by: Weidong Han Cc: dwmw2@infradead.org LKML-Reference: <20090420200450.552359000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/apic/io_apic.c | 14 -------------- 3 files changed, 18 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 34eaa37f7ad..1cf145039ee 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -166,8 +166,6 @@ extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries); extern void probe_nr_irqs_gsi(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d32f5589f1d..1386dbec552 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1412,8 +1412,6 @@ end_restore: * IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4baa9cbd630..8aef5f9d947 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -833,20 +833,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. 
- * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; From 9cbf117662e24c6d33245666804487f92c21b59d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 04:51:29 +0200 Subject: [PATCH 245/900] tracing/events: provide string with undefined size support This patch provides the support for dynamic size strings on event tracing. The key concept is to use a structure with an ending char array field of undefined size and use such ability to allocate the minimal size on the ring buffer to make one or more string entries fit inside, as opposite to a fixed length strings with upper bound. The strings themselves are represented using fields which have an offset value from the beginning of the entry. This patch provides three new macros: __string(item, src) This one declares a string to the structure inside TP_STRUCT__entry. You need to provide the name of the string field and the source that will be copied inside. This will also add the dynamic size of the string needed for the ring buffer entry allocation. A stack allocated structure is used to temporarily store the offset of each strings, avoiding double calls to strlen() on each event insertion. __get_str(field) This one will give you a pointer to the string you have created. This is an abstract helper to resolve the absolute address given the field name which is a relative address from the beginning of the trace_structure. __assign_str(dst, src) Use this macro to automatically perform the string copy from src to dst. src must be a variable to assign and dst is the name of a __string field. 
Example on how to use it: TRACE_EVENT(my_event, TP_PROTO(char *src1, char *src2), TP_ARGS(src1, src2), TP_STRUCT__entry( __string(str1, src1) __string(str2, src2) ), TP_fast_assign( __assign_str(str1, src1); __assign_str(str2, src2); ), TP_printk("%s %s", __get_str(src1), __get_str(src2)) ) Of course you can mix-up any __field or __array inside this TRACE_EVENT. The position of the __string or __assign_str doesn't matter. Changes in v2: Address the suggestion of Steven Rostedt: drop the opening_string() macro and redefine __ending_string() to get the size of the string to be copied instead of overwritting the whole ring buffer allocation. Changes in v3: Address other suggestions of Steven Rostedt and Peter Zijlstra with some changes: drop the __ending_string and the need to have only one string field. Use offsets instead of absolute addresses. [ Impact: allow more compact memory usage for string tracing ] Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan Cc: Peter Zijlstra --- include/trace/ftrace.h | 88 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 15ef08d9add..5a7d18c4363 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -27,6 +27,9 @@ #undef __field #define __field(type, item) type item; +#undef __string +#define __string(item, src) int __str_loc_##item; + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -35,14 +38,53 @@ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ + char __str_data[0]; \ }; \ static struct ftrace_event_call event_##name #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 2 of the trace events. * + * Include the following: + * + * struct ftrace_str_offsets_ { + * int ; + * int ; + * [...] 
+ * }; + * + * The __string() macro will create each int , this is to + * keep the offset of each string from the beggining of the event + * once we perform the strlen() of the src strings. + * + */ + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef __array +#define __array(type, item, len) + +#undef __field +#define __field(type, item); + +#undef __string +#define __string(item, src) int item; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + struct ftrace_str_offsets_##call { \ + tstruct; \ + }; + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 3 of the trace events. + * * Override the macros in to include the following: * * enum print_line_t @@ -80,6 +122,9 @@ #undef TP_printk #define TP_printk(fmt, args...) fmt "\n", args +#undef __get_str +#define __get_str(field) (char *)__entry + __entry->__str_loc_##field + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ @@ -146,6 +191,16 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ if (!ret) \ return 0; +#undef __string +#define __string(item, src) \ + ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t" \ + "offset:%u;tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), \ + __str_loc_##item), \ + (unsigned int)sizeof(field.__str_loc_##item)); \ + if (!ret) \ + return 0; + #undef __entry #define __entry REC @@ -189,6 +244,12 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; +#undef __string +#define __string(item, src) \ + ret = trace_define_field(event_call, "__str_loc", #item, \ + offsetof(typeof(field), __str_loc_##item), \ + sizeof(field.__str_loc_##item)); + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ @@ -212,7 +273,7 @@ ftrace_define_fields_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* - * Stage 3 of the trace events. + * Stage 4 of the trace events. 
* * Override the macros in to include the following: * @@ -409,6 +470,23 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef __entry #define __entry entry +#undef __field +#define __field(type, item) + +#undef __array +#define __array(type, item, len) + +#undef __string +#define __string(item, src) \ + __str_offsets.item = __str_size + \ + offsetof(typeof(*entry), __str_data); \ + __str_size += strlen(src) + 1; + +#undef __assign_str +#define __assign_str(dst, src) \ + __entry->__str_loc_##dst = __str_offsets.dst; \ + strcpy(__get_str(dst), src); + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ _TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ @@ -417,18 +495,22 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ + int __str_size = 0; \ int pc; \ \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ + tstruct; \ + \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ + sizeof(struct ftrace_raw_##call) + __str_size,\ + irq_flags, pc); \ if (!event) \ return; \ entry = ring_buffer_event_data(event); \ From 7e7ca9a22dbbc5c91763cd16923c7509918709b6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 04:54:49 +0200 Subject: [PATCH 246/900] tracing/lock: provide lock_acquired event support for dynamic size string Now that we can support the dynamic sized string, make the lock tracing able to use it, making it safe against modules removal and consuming the right amount of memory needed for each lock name Changes in v2: adapt to the __ending_string() updates and the opening_string() removal. 
[ Impact: protect lock tracer against module removal ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt --- include/trace/events/lockdep.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 45e326b5c7f..3ca315c1429 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -38,16 +38,16 @@ TRACE_EVENT(lock_acquired, TP_ARGS(lock, ip, waittime), TP_STRUCT__entry( - __field(const char *, name) + __string(name, lock->name) __field(unsigned long, wait_usec) __field(unsigned long, wait_nsec_rem) ), TP_fast_assign( - __entry->name = lock->name; + __assign_str(name, lock->name); __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); __entry->wait_usec = (unsigned long) waittime; ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec, __entry->wait_nsec_rem) ); From 6a74aa40907757ec98d8710ff66cd4cfe064e7d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Apr 2009 00:41:09 +0200 Subject: [PATCH 247/900] tracing/events: protect __get_str() The __get_str() macro is used in a code part then its content should be protected with parenthesis. [ Impact: make macro definition more robust ] Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 5a7d18c4363..a77f71a46db 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -123,7 +123,7 @@ #define TP_printk(fmt, args...) 
fmt "\n", args #undef __get_str -#define __get_str(field) (char *)__entry + __entry->__str_loc_##field +#define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 10:25:24 -0400 Subject: [PATCH 248/900] tracing/wakeup: move access to wakeup_cpu into spinlock The code had the following outside the lock: if (next != wakeup_task) return; pc = preempt_count(); /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; On initialization, wakeup_task is NULL and wakeup_cpu -1. This code is not under a lock. If wakeup_task is set on another CPU as that task is waking up, we can see the wakeup_task before wakeup_cpu is set. If we read wakeup_cpu while it is still -1 then we will have a bad data pointer. This patch moves the reading of wakeup_cpu within the protection of the spinlock used to protect the writing of wakeup_cpu and wakeup_task. 
[ Impact: remove possible race causing invalid pointer dereference ] Reported-by: Maneesh Soni Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b8b13c5540f..eacb2722517 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, pc = preempt_count(); - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:03:29 -0400 Subject: [PATCH 249/900] tracing: increase size of number of possible events With the new event tracing registration, we must increase the number of events that can be registered. Currently the type field is only one byte, which leaves us only 256 possible events. Since we do not save the CPU number in the tracer anymore (it is determined by the per cpu ring buffer that is used) we have an extra byte to use. This patch increases the size of type from 1 byte (256 events) to 2 bytes (65,536 events). It also adds a WARN_ON_ONCE if we exceed that limit. 
[ Impact: allow more than 255 events ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 ++++- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_output.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2a4a4074991..07e0a6d64a2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,13 +16,16 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - int type; + unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; int tgid; }; +#define FTRACE_MAX_EVENT \ + ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6e879cf87..9887131afa0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(int, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 83a8abb9640..06997e75114 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); + WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:43:36 -0400 Subject: [PATCH 250/900] tracing: add size checks for exported ftrace internal structures The events exported by TRACE_EVENT are automated and are 
guaranteed to be correct when used. The internal ftrace structures on the other hand are more manually exported. These require the ftrace maintainer to make sure they are up to date. This patch adds a size check to help flag when a type changes in an internal ftrace data structure, and the update needs to be reflected in the export. If an export is incorrect, then the only harm is that the user space tools will not know how to correctly read the internal structures of ftrace. [ Impact: help prevent inconsistent ftrace format print outs ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_export.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9887131afa0..b9208158808 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +extern char *__bad_type_size(void); + #undef FIELD #define FIELD(type, name) \ + sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 48fc02fe73a..0cb1a142c74 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@ #undef TRACE_STRUCT #define TRACE_STRUCT(args...) 
args +extern void __bad_type_size(void); + #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ + if (sizeof(type) != sizeof(field.item)) \ + __bad_type_size(); \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ From d7285c6b5c54397fdf112c2fb98ee43193173aa9 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 23 Apr 2009 10:21:38 -0700 Subject: [PATCH 251/900] x86: use native register access for native tlb flushing currently these are paravirtualized, doesn't appear any callers rely on this (no pv_ops backends are using native_tlb and overriding cr3/4 access). [ Impact: fix lockdep warning with paravirt and function tracer ] Signed-off-by: Chris Wright LKML-Reference: <20090423172138.GR3036@sequoia.sous-sol.org> Signed-off-by: Steven Rostedt --- arch/x86/include/asm/tlbflush.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d3539f998f8..e2927c5f45b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -17,7 +17,7 @@ static inline void __native_flush_tlb(void) { - write_cr3(read_cr3()); + native_write_cr3(native_read_cr3()); } static inline void __native_flush_tlb_global(void) @@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = read_cr4(); + cr4 = native_read_cr4(); /* clear PGE */ - write_cr4(cr4 & ~X86_CR4_PGE); + native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ - write_cr4(cr4); + native_write_cr4(cr4); raw_local_irq_restore(flags); } From c2518c4366f087ebc10b3919cb2461bbe4f42d0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Apr 2009 23:26:18 -0400 Subject: [PATCH 252/900] tracing: fix cut and paste macro error In case a module uses the TRACE_EVENT macro for creating automated events in ftrace, it may choose to use a different file name than 
the defined system name, or choose to use a different path than the default "include/trace/events" include path. If this is done, then before including trace/define_trace.h the header would define either "TRACE_INCLUDE_FILE" for the file name or "TRACE_INCLUDE_PATH" for the include path. If it does not define these, then the define_trace.h defines them instead. If define trace defines them, then define_trace.h should also undefine them before exiting. To do this a macro is used to note this: #ifndef TRACE_INCLUDE_FILE # define TRACE_INCLUDE_FILE TRACE_SYSTEM # define UNDEF_TRACE_INCLUDE_FILE #endif [...] #ifdef UNDEF_TRACE_INCLUDE_FILE # undef TRACE_INCLUDE_FILE # undef UNDEF_TRACE_INCLUDE_FILE #endif The UNDEF_TRACE_INCLUDE_FILE acts as a CPP variable to know to undef the TRACE_INCLUDE_FILE before leaving define_trace.h. Unfortunately, due to cut and paste errors, the macros between FILE and PATH got mixed up. [ Impact: undef TRACE_INCLUDE_FILE and/or TRACE_INCLUDE_PATH when needed ] Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 7f1f23d601e..abc611feeb8 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -44,7 +44,7 @@ #ifndef TRACE_INCLUDE_PATH # define __TRACE_INCLUDE(system) -# define UNDEF_TRACE_INCLUDE_FILE +# define UNDEF_TRACE_INCLUDE_PATH #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) #endif @@ -64,13 +64,13 @@ /* Only undef what we defined in this file */ #ifdef UNDEF_TRACE_INCLUDE_FILE -# undef TRACE_INCLUDE_PATH +# undef TRACE_INCLUDE_FILE # undef UNDEF_TRACE_INCLUDE_FILE #endif -#ifdef UNDEF_TRACE_INCLUDE_FILE +#ifdef UNDEF_TRACE_INCLUDE_PATH # undef TRACE_INCLUDE_PATH -# undef UNDEF_TRACE_INCLUDE_FILE +# undef UNDEF_TRACE_INCLUDE_PATH #endif /* We may be processing more files */ From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 
00:00:00 2001 From: Lai Jiangshan Date: Fri, 24 Apr 2009 11:27:05 +0800 Subject: [PATCH 253/900] ring_buffer: compressed event header RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes an 'u32' to save the actually length for events which data size > 28. This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA. [ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ] Signed-off-by: Lai Jiangshan LKML-Reference: <49F13189.3090000@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 16 +++---- kernel/trace/ring_buffer.c | 83 ++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fac8f1ac6f4..1c2f80911fb 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -11,7 +11,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - u32 type:2, len:3, time_delta:27; + u32 type_len:5, time_delta:27; u32 array[]; }; @@ -24,7 +24,8 @@ struct ring_buffer_event { * size is variable depending on how much * padding is needed * If time_delta is non zero: - * everything else same as RINGBUF_TYPE_DATA + * array[0] holds the actual length + * size = 4 + length (bytes) * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 
59) @@ -35,22 +36,23 @@ struct ring_buffer_event { * array[1..2] = tv_sec * size = 16 bytes * - * @RINGBUF_TYPE_DATA: Data record - * If len is zero: + * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: + * Data record + * If type_len is zero: * array[0] holds the actual length * array[1..(length+3)/4] holds data - * size = 4 + 4 + length (bytes) + * size = 4 + length (bytes) * else - * length = len << 2 + * length = type_len << 2 * array[0..(length+3)/4-1] holds data * size = 4 + length (bytes) */ enum ring_buffer_type { + RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, RINGBUF_TYPE_PADDING, RINGBUF_TYPE_TIME_EXTEND, /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ RINGBUF_TYPE_TIME_STAMP, - RINGBUF_TYPE_DATA, }; unsigned ring_buffer_event_length(struct ring_buffer_event *event); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61dbdf21cd3..9692f100ec1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "\ttype : 2 bits\n"); - ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); ret = trace_seq_printf(s, "\tarray : 32 bits\n"); ret = trace_seq_printf(s, "\n"); @@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata : type == %d\n", - RINGBUF_TYPE_DATA); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return ret; } @@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 
+ +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -213,17 +216,18 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - 
BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; cpu_buffer->entries--; @@ -1133,28 +1137,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = rb_reserve_next_event(cpu_buffer, 0, length); if (!event) goto out; @@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - 
RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; @@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + event = rb_reserve_next_event(cpu_buffer, 0, event_length); if (!event) goto out; @@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2421,7 +2420,7 @@ 
ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->entries--; } From 782cc5ae6331d63b4febaa312c9d14493aafa9b8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:43:09 +0200 Subject: [PATCH 254/900] x86, ds: fix buffer alignment in debug store selftest The debug store selftest code uses a stack-allocated buffer, which is not necessarily correctly aligned. For tests using a buffer to hold a single entry, the buffer that is passed to ds_request must already be suitably aligned. Pass a suitably aligned portion of the bigger buffer. 
[ Impact: fix hw-branch-tracer self-test failure ] Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com LKML-Reference: <20090424094309.A30145@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 5f104a0ace6..6bc7c199ab9 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -323,13 +323,15 @@ static int ds_selftest_bts_bad_request_task(void *buffer) int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE], *small_buffer; unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); conf.error = 0; + small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + get_online_cpus(); for_each_online_cpu(cpu) { conf.suspend = ds_suspend_bts_wrap; @@ -381,7 +383,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, + ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); From 7e0bfad24d85de7cf2202a7b0ce51de11a077b21 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:44:48 +0200 Subject: [PATCH 255/900] x86, bts: reenable ptrace branch trace support The races found by Oleg Nesterov have been fixed. Reenable branch trace support. 
Signed-off-by: Markus Metzger Acked-by: Oleg Nesterov LKML-Reference: <20090424094448.A30216@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8130334329c..924e156a85a 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -506,7 +506,6 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR - depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store. From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:51:43 +0200 Subject: [PATCH 256/900] x86, bts, mm: clean up buffer allocation The current mm interface is asymmetric. One function allocates a locked buffer, another function only refunds the memory. Change this to have two functions for accounting and refunding locked memory, respectively; and do the actual buffer allocation in ptrace. 
[ Impact: refactor BTS buffer allocation code ] Signed-off-by: Markus Metzger Acked-by: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 39 ++++++++++++++++++++++++++------------- include/linux/mm.h | 6 ++++-- mm/mlock.c | 36 +++++++++++++++++------------------- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5252ae6c52..09ecbde91c1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -617,17 +617,28 @@ struct bts_context { struct work_struct work; }; -static inline void alloc_bts_buffer(struct bts_context *context, - unsigned int size) +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) { - void *buffer; + void *buffer = NULL; + int err = -ENOMEM; - buffer = alloc_locked_buffer(size); - if (buffer) { - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - } + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; } static inline void free_bts_buffer(struct bts_context *context) @@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context) kfree(context->buffer); context->buffer = NULL; - refund_locked_buffer_memory(context->mm, context->size); + refund_locked_memory(context->mm, context->size); context->size = 0; mmput(context->mm); @@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child, context->tracer = NULL; if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; + free_bts_buffer(context); if (!cfg.size) return 0; - alloc_bts_buffer(context, 
cfg.size); - if (!context->buffer) - return -ENOMEM; + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) diff --git a/include/linux/mm.h b/include/linux/mm.h index a3963ba23a6..009eabd3c21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,6 +19,7 @@ struct anon_vma; struct file_ra_state; struct user_struct; struct writeback_control; +struct rlimit; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1319,7 +1320,8 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); -extern void *alloc_locked_buffer(size_t size); -extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); +extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size); +extern void refund_locked_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 28be15ead9c..ac130433c7d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -629,38 +629,36 @@ void user_shm_unlock(size_t size, struct user_struct *user) free_uid(user); } -void *alloc_locked_buffer(size_t size) +int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer = NULL; + unsigned long lim, vm, pgsz; + int error = -ENOMEM; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = mm->total_vm + pgsz; + if (lim < vm) goto out; - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> 
PAGE_SHIFT; + vm = mm->locked_vm + pgsz; + if (lim < vm) goto out; - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out; - - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + mm->total_vm += pgsz; + mm->locked_vm += pgsz; + error = 0; out: - up_write(&current->mm->mmap_sem); - return buffer; + up_write(&mm->mmap_sem); + return error; } -void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) +void refund_locked_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; From 39517091f88fae32b52254b561ced78da1eaf0a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:05:52 -0400 Subject: [PATCH 257/900] tracing/lockdep: convert lockdep to use TRACE_EVENT macro The TRACE_FORMAT will soon be deprecated. This patch converts it to the TRACE_EVENT macro. Note, this change should also speed up the tracing. [ Impact: remove a user of deprecated TRACE_FORMAT ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/events/lockdep.h | 58 +++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 3ca315c1429..0e956c9dfd7 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -9,28 +9,64 @@ #ifdef CONFIG_LOCKDEP -TRACE_FORMAT(lock_acquire, +TRACE_EVENT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? "read " : "", lock->name) - ); -TRACE_FORMAT(lock_release, + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __string(name, lock->name) + ), + + TP_fast_assign( + __entry->flags = (trylock ? 1 : 0) | (read ? 
2 : 0); + __assign_str(name, lock->name); + ), + + TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "", + (__entry->flags & 2) ? "read " : "", + __get_str(name)) +); + +TRACE_EVENT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); + + TP_STRUCT__entry( + __string(name, lock->name) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + ), + + TP_printk("%s", __get_str(name)) +); #ifdef CONFIG_LOCK_STAT -TRACE_FORMAT(lock_contended, +TRACE_EVENT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); + + TP_STRUCT__entry( + __string(name, lock->name) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + ), + + TP_printk("%s", __get_str(name)) +); TRACE_EVENT(lock_acquired, TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), From 160031b556e93590fa8635210d73d93c3d3853a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:26:55 -0400 Subject: [PATCH 258/900] tracing/irq: convert irq traces to use TRACE_EVENT macro The TRACE_FORMAT will soon be deprecated. This patch converts it to the TRACE_EVENT macro. Note, this change should also speed up the tracing. 
[ Impact: remove a user of deprecated TRACE_FORMAT ] Cc: Jason Baron Signed-off-by: Steven Rostedt --- include/trace/events/irq.h | 61 +++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 75e3468e449..76868646751 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -10,11 +10,24 @@ /* * Tracepoint for entry of interrupt handler: */ -TRACE_FORMAT(irq_handler_entry, +TRACE_EVENT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); + + TP_STRUCT__entry( + __field( int, irq ) + __string( name, action->name ) + ), + + TP_fast_assign( + __entry->irq = irq; + __assign_str(name, action->name); + ), + + TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name)) +); /* * Tracepoint for return of an interrupt handler: @@ -39,17 +52,43 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? 
"handled" : "unhandled") ); -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); +TRACE_EVENT(softirq_entry, -TRACE_FORMAT(softirq_exit, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); + + TP_STRUCT__entry( + __field( int, vec ) + __string( name, softirq_to_name[h-vec] ) + ), + + TP_fast_assign( + __entry->vec = (int)(h - vec); + __assign_str(name, softirq_to_name[h-vec]); + ), + + TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) +); + +TRACE_EVENT(softirq_exit, + + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + + TP_ARGS(h, vec), + + TP_STRUCT__entry( + __field( int, vec ) + __string( name, softirq_to_name[h-vec] ) + ), + + TP_fast_assign( + __entry->vec = (int)(h - vec); + __assign_str(name, softirq_to_name[h-vec]); + ), + + TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) +); #endif /* _TRACE_IRQ_H */ From b8e65554d80b4c560d201362d0e8fa02109d89fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:50:39 -0400 Subject: [PATCH 259/900] tracing: remove deprecated TRACE_FORMAT The TRACE_FORMAT macro has been deprecated by the TRACE_EVENT macro. There are no more users. All new users must use the TRACE_EVENT macro. [ Impact: remove old functionality ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 5 --- include/trace/define_trace.h | 4 --- include/trace/ftrace.h | 66 ------------------------------------ 3 files changed, 75 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4353f3f7e62..14df7e635d4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -158,11 +158,6 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) 
args -#ifndef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#endif - #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index abc611feeb8..f7a7ae1e8f9 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,10 +26,6 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) -#undef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, print) \ - DEFINE_TRACE(name) - #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a77f71a46db..1e681142f1d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -18,9 +18,6 @@ #include -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) type item[len]; @@ -62,9 +59,6 @@ * */ -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) @@ -298,16 +292,6 @@ ftrace_define_fields_##call(void) \ * unregister_trace_(ftrace_event_); * } * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * } - * * * For those macros defined with TRACE_EVENT: * @@ -417,56 +401,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ #define _TRACE_PROFILE_INIT(call) #endif -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: 
Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - #undef __entry #define __entry entry From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 12:20:52 -0400 Subject: [PATCH 260/900] tracing/events: reuse trace event ids after overflow With modules being able to add trace events, and the max trace event counter is 16 bits (65536) we can overflow the counter easily with a simple while loop adding and removing modules that contain trace events. This patch links together the registered trace events and on overflow searches for available trace event ids. It will still fail if over 65536 events are registered, but considering that a typical kernel only has 22000 functions, 65000 events should be sufficient. 
Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_output.c | 71 ++++++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 07e0a6d64a2..78a9ba24cbf 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -56,6 +56,7 @@ typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags); struct trace_event { struct hlist_node node; + struct list_head list; int type; trace_print_func trace; trace_print_func raw; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 06997e75114..5fc51f0f75f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? 
*/ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); - if (!event) { - ret = next_event_type++; + if (WARN_ON(!event)) goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); - WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); - return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); @@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event) { mutex_lock(&trace_event_mutex); hlist_del(&event->node); + list_del(&event->list); mutex_unlock(&trace_event_mutex); return 0; From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 23:11:22 -0400 Subject: [PATCH 261/900] tracing/events: make modules have their own file_operations structure For proper module reference counting, the file_operations that modules use must have the "owner" field set to the module. 
Unfortunately, the trace events use shared file_operations. The same file_operations are used by both the kernel core and all modules. This patch makes the modules allocate their own file_operations and copies the functions from the core kernel. This allows those file operations to be owned by the module. Care is taken to free this code on module unload. Thanks to Greg KH for reminding me that file_operations must be owned by the module to have reference counting take place. [ Impact: fix modular tracepoints / potential crash ] Signed-off-by: Steven Rostedt Acked-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 95 ++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b9208158808..be4d3a437c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *entry; int ret; @@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) if (call->regfunc) entry = trace_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); + enable); if (call->id) entry = trace_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); + id); if (call->define_fields) { ret = call->define_fields(); @@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return ret; } entry = trace_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); + filter); } /* A trace may not want to export its format */ @@ -822,7 +826,7 @@ event_create_dir(struct
ftrace_event_call *call, struct dentry *d_events) return 0; entry = trace_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); + format); return 0; } @@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) event++) #ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. + */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + static void trace_module_add_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; @@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; + + /* + * This module has events, create file ops for this module + * if not already done. 
+ */ + if (!file_ops) { + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + } call->mod = mod; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); } } static void trace_module_remove_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; list_for_each_entry_safe(call, p, &ftrace_events, list) { @@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); } } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } } static int trace_module_notify(struct notifier_block *self, @@ -954,7 +1033,9 @@ static __init int event_trace_init(void) if (!call->name) continue; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); } ret = register_module_notifier(&trace_module_nb); From 0a3ec21fcd311b26ab0f249d62960e127bc20ca8 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 26 Apr 2009 23:07:42 +0200 Subject: [PATCH 262/900] x86: beautify vmlinux_64.lds.S Beautify vmlinux_64.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments There is no functional changes in this patch The beautification is done to prepare a unification of the _32 and the _64 variants of the linker scripts. 
[ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090426210742.GA3464@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 430 +++++++++++++++++-------------- 1 file changed, 234 insertions(+), 196 deletions(-) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index c8742507b03..6d5a5b05eaa 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -15,69 +15,79 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; - NOTES :text :note + /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* First the code that has to be first for bootstrapping */ + *(.text.head) + _stext = .; + /* Then the rest */ + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 - . 
= ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + NOTES :text :note - RODATA + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ + RODATA + + /* Align data segment to page size boundary */ + . = ALIGN(PAGE_SIZE); + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + /* End of data section */ + _edata = .; } :data - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + *(.data.cacheline_aligned) + } + + . 
= ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -85,37 +95,53 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; #undef VSYSCALL_ADDR #undef VSYSCALL_PHYS_ADDR @@ -125,156 +151,168 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init + /* init_task */ + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + . = ALIGN(THREAD_SIZE); + *(.data.init_task) + } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + . 
= ALIGN(PAGE_SIZE); + *(.data.page_aligned) + } + + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + . = ALIGN(PAGE_SIZE); + __smp_alt_begin = .; + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + __smp_alt_end = .; + } + + /* Init code and data */ . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + __initdata_begin = .; + INIT_DATA + __initdata_end = .; + } - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + . = ALIGN(16); + __setup_start = .; + *(.init.setup) + __setup_end = .; + } - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . 
= ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + . 
= ALIGN(8); + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } #ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif #ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) #else - PERCPU(PAGE_SIZE) + PERCPU(PAGE_SIZE) #endif - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . 
= ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ + __init_end = .; - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __bss_start = .; /* BSS */ + *(.bss.page_aligned) + *(.bss) + __bss_stop = .; } - STABS_DEBUG + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } - DWARF_DEBUG + _end = . ; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG } - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. 
+ */ #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); From 51b26ada79b605ed709ddcedbb6012e8f8e0ebed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Apr 2009 10:12:47 -0700 Subject: [PATCH 263/900] x86: unify arch/x86/boot/compressed/vmlinux_*.lds Look at the: diff -u arch/x86/boot/compressed/vmlinux_*.lds output and realize that they're basically exactly the same except for trivial naming differences, and the fact that the 64-bit version has a "pgtable" thing. So unify them. There's some trivial cleanup there (make the output format a Kconfig thing rather than doing #ifdef's for it, and unify both 32-bit and 64-bit BSS end to "_ebss", where 32-bit used to use the traditional "_end"), but other than that it's really very mindless and straight conversion. For example, I think we should aim to remove "startup_32" vs "startup_64", and just call it "startup", and get rid of one more difference. I didn't do that.
Also, notice the comment in the unified vmlinux.lds.S talks about "head_64" and "startup_32" which is an odd and incorrect mix, but that was actually what the old 64-bit only lds file had, so the confusion isn't new, and now that mixing is arguably more accurate thanks to the vmlinux.lds.S file being shared between the two cases ;) [ Impact: cleanup, unification ] Signed-off-by: Linus Torvalds Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 +++ arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/head_32.S | 8 ++-- .../{vmlinux_64.lds => vmlinux.lds.S} | 11 ++++- arch/x86/boot/compressed/vmlinux_32.lds | 43 ------------------- 5 files changed, 20 insertions(+), 49 deletions(-) rename arch/x86/boot/compressed/{vmlinux_64.lds => vmlinux.lds.S} (78%) delete mode 100644 arch/x86/boot/compressed/vmlinux_32.lds diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bc25b9f5e4c..039c3f04aac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,11 @@ config X86 select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA +config OUTPUT_FORMAT + string + default "elf32-i386" if X86_32 + default "elf64-x86-64" if X86_64 + config ARCH_DEFCONFIG string default "arch/x86/configs/i386_defconfig" if X86_32 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 65551c9f857..0f4b5e2abd3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3a8a866fb2e..85bd3285706 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S 
@@ -88,9 +88,9 @@ ENTRY(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal _end(%ebp), %esi - leal _end(%ebx), %edi - movl $(_end - startup_32), %ecx + leal _ebss(%ebp), %esi + leal _ebss(%ebx), %edi + movl $(_ebss - startup_32), %ecx std rep movsb @@ -121,7 +121,7 @@ relocated: */ xorl %eax,%eax leal _edata(%ebx),%edi - leal _end(%ebx), %ecx + leal _ebss(%ebx), %ecx subl %edi,%ecx cld rep diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S similarity index 78% rename from arch/x86/boot/compressed/vmlinux_64.lds rename to arch/x86/boot/compressed/vmlinux.lds.S index bef1ac891bc..ffcb19134bf 100644 --- a/arch/x86/boot/compressed/vmlinux_64.lds +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,6 +1,13 @@ -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) +#else +OUTPUT_ARCH(i386) +ENTRY(startup_32) +#endif + SECTIONS { /* Be careful parts of head_64.S assume startup_32 is at @@ -38,11 +45,13 @@ SECTIONS *(.bss) *(.bss.*) *(COMMON) +#ifdef CONFIG_X86_64 . = ALIGN(8); _end_before_pgt = . ; . = ALIGN(4096); pgtable = . ; . = . + 4096 * 6; +#endif _ebss = .; } } diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds deleted file mode 100644 index bb3c48379c4..00000000000 --- a/arch/x86/boot/compressed/vmlinux_32.lds +++ /dev/null @@ -1,43 +0,0 @@ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(startup_32) -SECTIONS -{ - /* Be careful parts of head_32.S assume startup_32 is at - * address 0. - */ - . = 0; - .text.head : { - _head = . ; - *(.text.head) - _ehead = . ; - } - .rodata.compressed : { - *(.rodata.compressed) - } - .text : { - _text = .; /* Text */ - *(.text) - *(.text.*) - _etext = . ; - } - .rodata : { - _rodata = . 
; - *(.rodata) /* read-only data */ - *(.rodata.*) - _erodata = . ; - } - .data : { - _data = . ; - *(.data) - *(.data.*) - _edata = . ; - } - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - _end = . ; - } -} From b2ba83ff4f4405cebc10884121ee71338a1a6c94 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 26 Apr 2009 23:38:08 -0700 Subject: [PATCH 264/900] x86: apic: Remove duplicated macros XAPIC_DEST_* is a duplicate of the one in apicdef.h [ Impact: cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49F552D0.5050505@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/summit_32.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d8..344eee4ac0a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT 4 -#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) - #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) static const struct cpumask *summit_target_cpus(void) From e0e42142bab96404de535cceb85d6533d5ad7942 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 26 Apr 2009 23:39:38 -0700 Subject: [PATCH 265/900] x86: Use dmi check in apic_is_clustered() on 64-bit to mark the TSC unstable We will have systems with 2 and more sockets 8cores/2thread, but we treat them as multi chassis - while they could have a stable TSC domain. Use DMI check instead.
[ Impact: do not turn possibly stable TSCs off incorrectly ] Signed-off-by: Yinghai Lu Cc: Ravikiran Thirumalai LKML-Reference: <49F5532A.5000802@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 86 +++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1386dbec552..28f747d61d7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2138,31 +2138,14 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void) { int i, clusters, zeros; unsigned id; u16 *bios_cpu_apicid; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... 
- */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2198,18 +2181,67 @@ __cpuinit int apic_is_clustered_box(void) ++zeros; } - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) + return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void __cpuinit dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) return 1; + if (!is_vsmp_box()) + return 0; + /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. 
+ * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards */ - return (clusters > 2); + if (apic_cluster_num() > 1) + return 1; + + return 0; } #endif From 9ec4fa271faf2db3b8e1419c998da1ca6b094eb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:57:18 -0700 Subject: [PATCH 266/900] irq, cpumask: correct CPUMASKS_OFFSTACK typo and fix fallout CPUMASKS_OFFSTACK is not defined anywhere (it is CPUMASK_OFFSTACK). It is a typo and init_alloc_desc_masks() is called before it set affinity to all cpus... Split init_alloc_desc_masks() into alloc_desc_masks() and init_desc_masks(). Also use CPUMASK_OFFSTACK in alloc_desc_masks(). [ Impact: fix smp_affinity copying/setup when moving irq_desc between CPUs ] Signed-off-by: Yinghai Lu Acked-by: Rusty Russell Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" LKML-Reference: <49F6546E.3040406@kernel.org> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 27 ++++++++++++++++++--------- kernel/irq/handle.c | 9 ++++++--- kernel/irq/numa_migrate.c | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index b7cbeed972e..c4953cf27e5 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -424,27 +424,25 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #ifdef CONFIG_SMP /** - * init_alloc_desc_masks - allocate cpumasks for irq_desc + * alloc_desc_masks - allocate cpumasks for irq_desc * @desc: pointer to irq_desc struct * @cpu: cpu which will be handling the cpumasks * @boot: true if need bootmem * * Allocates affinity and pending_mask cpumask if required. * Returns true if successful (or not required). - * Side effect: affinity has all bits set, pending_mask has all bits clear.
*/ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, bool boot) { +#ifdef CONFIG_CPUMASK_OFFSTACK int node; if (boot) { alloc_bootmem_cpumask_var(&desc->affinity); - cpumask_setall(desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ alloc_bootmem_cpumask_var(&desc->pending_mask); - cpumask_clear(desc->pending_mask); #endif return true; } @@ -453,18 +451,25 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) return false; - cpumask_setall(desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { free_cpumask_var(desc->affinity); return false; } - cpumask_clear(desc->pending_mask); +#endif #endif return true; } +static inline void init_desc_masks(struct irq_desc *desc) +{ + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + /** * init_copy_desc_masks - copy cpumasks for irq_desc * @old_desc: pointer to old irq_desc struct @@ -478,7 +483,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { -#ifdef CONFIG_CPUMASKS_OFFSTACK +#ifdef CONFIG_CPUMASK_OFFSTACK cpumask_copy(new_desc->affinity, old_desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -499,12 +504,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc, #else /* !CONFIG_SMP */ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, bool boot) { return true; } +static inline void init_desc_masks(struct irq_desc *desc) +{ +} + static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 
d82142be8dd..882c7980010 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -115,10 +115,11 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } + init_desc_masks(desc); arch_init_chip_data(desc, cpu); } @@ -169,7 +170,8 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -256,7 +258,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d..5760d725162 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -37,7 +37,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:58:23 -0700 Subject: [PATCH 267/900] x86/irq: remove leftover code from NUMA_MIGRATE_IRQ_DESC The original feature of migrating irq_desc dynamic was too fragile and was causing problems: it caused crashes on systems with lots of cards with MSI-X when user-space irq-balancer was 
enabled. We now have new patches that create irq_desc according to device numa node. This patch removes the leftover bits of the dynamic balancer. [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F654AF.8000808@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 ------ arch/x86/configs/x86_64_defconfig | 1 - arch/x86/kernel/apic/io_apic.c | 56 +++---------------------------- include/linux/irq.h | 10 ------ kernel/irq/Makefile | 2 +- kernel/irq/chip.c | 12 ++----- kernel/irq/handle.c | 9 ++--- kernel/irq/numa_migrate.c | 2 -- 8 files changed, 9 insertions(+), 93 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index df9e885eee1..e1b2543f8ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -274,16 +274,6 @@ config SPARSE_IRQ If you don't know what to do here, say N. -config NUMA_MIGRATE_IRQ_DESC - bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && NUMA - depends on BROKEN - default n - ---help--- - This enables moving irq_desc to cpu/node that irq will use handled. - - If you don't know what to do here, say N. 
- config X86_MPPARSE bool "Enable MPS table" if ACPI default y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9fe5d212ab4..27b8ce0f590 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -195,7 +195,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y # CONFIG_X86_ELAN is not set diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e..9fbf0f7ec7e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -148,9 +148,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -254,8 +251,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { @@ -356,19 +352,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +362,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -592,9 +569,6 @@ 
set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - cpumask_copy(desc->affinity, mask); return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); @@ -2393,8 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return; - set_extra_move_desc(desc, mask); - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); irte.vector = cfg->vector; @@ -2491,34 +2463,14 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} diff --git a/include/linux/irq.h b/include/linux/irq.h index c4953cf27e5..2a34cd6281d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -212,16 +212,6 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); -static inline struct irq_desc * -irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) 
-{ -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - return irq_to_desc(irq); -#else - return desc; -#endif -} - /* * Migration helpers for obsolete names, they will go away: */ diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f5296..2f065277f8e 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f..13c68e71b72 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ 
-582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 882c7980010..3e0cbc44bd7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -458,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -473,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 5760d725162..ce72bc3f4ce 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -97,9 +97,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; From d5dedd4507d307eb3f35f21b6e16f336fdc0d82a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:59:21 -0700 Subject: [PATCH 268/900] irq: change ->set_affinity() to return status according to Ingo, change set_affinity() in irq_chip should return int, because that way we can handle failure cases in a much cleaner way, in the genirq layer. 
v2: fix two typos [ Impact: extend API ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: linux-arch@vger.kernel.org LKML-Reference: <49F654E9.4070809@kernel.org> Signed-off-by: Ingo Molnar --- arch/alpha/kernel/sys_dp264.c | 8 +++- arch/alpha/kernel/sys_titan.c | 4 +- arch/arm/common/gic.c | 4 +- arch/cris/arch-v32/kernel/irq.c | 4 +- arch/ia64/hp/sim/hpsim_irq.c | 3 +- arch/ia64/kernel/iosapic.c | 10 +++-- arch/ia64/kernel/msi_ia64.c | 16 ++++--- arch/ia64/sn/kernel/irq.c | 4 +- arch/ia64/sn/kernel/msi_sn.c | 8 ++-- arch/mips/cavium-octeon/octeon-irq.c | 8 +++- arch/mips/include/asm/irq.h | 2 +- arch/mips/kernel/irq-gic.c | 5 ++- arch/mips/mti-malta/malta-smtc.c | 4 +- arch/mips/sibyte/bcm1480/irq.c | 8 ++-- arch/mips/sibyte/sb1250/irq.c | 8 ++-- arch/parisc/kernel/irq.c | 6 ++- arch/powerpc/platforms/pseries/xics.c | 12 ++--- arch/powerpc/sysdev/mpic.c | 4 +- arch/sparc/kernel/irq_64.c | 12 +++-- arch/x86/kernel/apic/io_apic.c | 64 +++++++++++++++++---------- drivers/parisc/iosapic.c | 6 ++- drivers/xen/events.c | 12 ++--- include/linux/irq.h | 2 +- 23 files changed, 140 insertions(+), 74 deletions(-) diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index 9c9d1fd4155..5bd5259324b 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } } -static void +static int dp264_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); cpu_set_irq_affinity(irq, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); + + return 0; } -static void +static int clipper_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); cpu_set_irq_affinity(irq - 16, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); + + return 0; } static struct hw_interrupt_type 
dp264_irq_type = { diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index 27f840a4ad3..8dd239ebdb9 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } -static void +static int titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&titan_irq_lock); titan_cpu_set_irq_affinity(irq - 16, *affinity); titan_update_irq_hw(titan_cached_irq_mask); spin_unlock(&titan_irq_lock); + + return 0; } static void diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index c6884ba1d5e..90f6b7f52d4 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) +static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) { void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3); unsigned int shift = (irq % 4) * 8; @@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) val |= 1 << (cpu + shift); writel(val, reg); spin_unlock(&irq_controller_lock); + + return 0; } #endif diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c index df3925cb1c7..d70b445f4a8 100644 --- a/arch/cris/arch-v32/kernel/irq.c +++ b/arch/cris/arch-v32/kernel/irq.c @@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq) { } -void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) +int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) { unsigned long flags; spin_lock_irqsave(&irq_lock, flags); irq_allocations[irq - FIRST_IRQ].mask = *dest; spin_unlock_irqrestore(&irq_lock, flags); + + return 0; } static struct irq_chip crisv32_irq_type = { diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c index 
cc0a3182db3..acb5047ab57 100644 --- a/arch/ia64/hp/sim/hpsim_irq.c +++ b/arch/ia64/hp/sim/hpsim_irq.c @@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq) { } -static void +static int hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b) { + return 0; } static struct hw_interrupt_type irq_type_hp_sim = { diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 166e0d839fa..f92cef47bf8 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -329,7 +329,7 @@ unmask_irq (unsigned int irq) } -static void +static int iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) { #ifdef CONFIG_SMP @@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) cpu = cpumask_first_and(cpu_online_mask, mask); if (cpu >= nr_cpu_ids) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; dest = cpu_physical_id(cpu); if (!iosapic_intr_info[irq].count) - return; /* not an IOSAPIC interrupt */ + return -1; /* not an IOSAPIC interrupt */ set_irq_affinity_info(irq, dest, redir); @@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32); } + #endif + return 0; } /* diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 2b15e233f7f..0f8ade9331b 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -12,7 +12,7 @@ static struct irq_chip ia64_msi_chip; #ifdef CONFIG_SMP -static void ia64_set_msi_irq_affinity(unsigned int irq, +static int ia64_set_msi_irq_affinity(unsigned int irq, const cpumask_t *cpu_mask) { struct msi_msg msg; @@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, int cpu = first_cpu(*cpu_mask); if (!cpu_online(cpu)) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; read_msi_msg(irq, &msg); @@ -39,6 +39,8 @@ static void 
ia64_set_msi_irq_affinity(unsigned int irq, write_msi_msg(irq, &msg); cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu)); + + return 0; } #endif /* CONFIG_SMP */ @@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_cfg *cfg = irq_cfg + irq; struct msi_msg msg; int cpu = cpumask_first(mask); if (!cpu_online(cpu)) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; dmar_msi_read(irq, &msg); @@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dmar_msi_write(irq, &msg); cpumask_copy(irq_desc[irq].affinity, mask); + + return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index 66fd705e82c..764f26abac0 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -227,7 +227,7 @@ finish_up: return new_irq_info; } -static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) +static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; nasid_t nasid; @@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, sn_irq_lh[irq], list) (void)sn_retarget_vector(sn_irq_info, nasid, slice); + + return 0; } #ifdef CONFIG_SMP diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c index 81e428943d7..fbbfb970120 100644 --- a/arch/ia64/sn/kernel/msi_sn.c +++ b/arch/ia64/sn/kernel/msi_sn.c @@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry) } #ifdef CONFIG_SMP -static void sn_set_msi_irq_affinity(unsigned int irq, +static int sn_set_msi_irq_affinity(unsigned int irq, const struct cpumask *cpu_mask) { 
struct msi_msg msg; @@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpu = cpumask_first(cpu_mask); sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) - return; + return -1; /* * Release XIO resources for the old MSI PCI address @@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); sn_msi_info[irq].sn_irq_info = new_irq_info; if (new_irq_info == NULL) - return; + return -1; /* * Map the xio address into bus space @@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq, write_msi_msg(irq, &msg); cpumask_copy(irq_desc[irq].affinity, cpu_mask); + + return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index 1c19af8daa6..d3a0c8154be 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq) } #ifdef CONFIG_SMP -static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) +static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) { int cpu; int bit = irq - OCTEON_IRQ_WORKQ0; /* Bit 0-63 of EN0 */ @@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask */ cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2)); write_unlock(&octeon_irq_ciu0_rwlock); + + return 0; } #endif @@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq) } #ifdef CONFIG_SMP -static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) +static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) { int cpu; int bit = irq - OCTEON_IRQ_WDOG0; /* Bit 0-63 of EN1 */ @@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask */ 
cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1)); write_unlock(&octeon_irq_ciu1_rwlock); + + return 0; } #endif diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 3214ade02d1..4f1eed107b0 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq) #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF #include -extern void plat_set_irq_affinity(unsigned int irq, +extern int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity); extern void smtc_forward_irq(unsigned int irq); diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c index 87deb8f6c45..3f43c2e3aa5 100644 --- a/arch/mips/kernel/irq-gic.c +++ b/arch/mips/kernel/irq-gic.c @@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq) static DEFINE_SPINLOCK(gic_lock); -static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) +static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { cpumask_t tmp = CPU_MASK_NONE; unsigned long flags; @@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_and(&tmp, cpumask, cpu_online_mask); if (cpus_empty(tmp)) - return; + return -1; /* Assumption : cpumask refers to a single CPU */ spin_lock_irqsave(&gic_lock, flags); @@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(irq_desc[irq].affinity, cpumask); spin_unlock_irqrestore(&gic_lock, flags); + return 0; } #endif diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c index 5ba31888fef..499ffe5475d 100644 --- a/arch/mips/mti-malta/malta-smtc.c +++ b/arch/mips/mti-malta/malta-smtc.c @@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = { */ -void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) +int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { cpumask_t tmask; 
int cpu = 0; @@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) /* Do any generic SMTC IRQ affinity setup */ smtc_set_irq_affinity(irq, tmask); + + return 0; } #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */ diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c index 352352b3cb2..4f256a131bf 100644 --- a/arch/mips/sibyte/bcm1480/irq.c +++ b/arch/mips/sibyte/bcm1480/irq.c @@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq); static void disable_bcm1480_irq(unsigned int irq); static void ack_bcm1480_irq(unsigned int irq); #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); +static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_PCI @@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) +static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on, k; u64 cur_ints; @@ -119,7 +119,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) if (cpumask_weight(mask) != 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); - return; + return -1; } i = cpumask_first(mask); @@ -155,6 +155,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) } spin_unlock(&bcm1480_imr_lock); spin_unlock_irqrestore(&desc->lock, flags); + + return 0; } #endif diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c index c08ff582da6..e389507f1f9 100644 --- a/arch/mips/sibyte/sb1250/irq.c +++ b/arch/mips/sibyte/sb1250/irq.c @@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq); static void disable_sb1250_irq(unsigned int irq); static void ack_sb1250_irq(unsigned int irq); #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, const struct cpumask 
*mask); +static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_SIBYTE_HAS_LDT @@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) +static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on; u64 cur_ints; @@ -114,7 +114,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) if (cpumask_weight(mask) > 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); - return; + return -1; } /* Convert logical CPU to physical CPU */ @@ -146,6 +146,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) } spin_unlock(&sb1250_imr_lock); spin_unlock_irqrestore(&desc->lock, flags); + + return 0; } #endif diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 4ea4229d765..8007f1e6572 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest) return cpu_dest; } -static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) +static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { int cpu_dest; cpu_dest = cpu_check_affinity(irq, dest); if (cpu_dest < 0) - return; + return -1; cpumask_copy(&irq_desc[irq].affinity, dest); + + return 0; } #endif diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index 80b513449f4..be3581a8c29 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq) lpar_xirr_info_set((0xff << 24) | irq); } -static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) +static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) { unsigned int irq; int status; 
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) irq = (unsigned int)irq_map[virq].hwirq; if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - return; + return -1; status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); if (status) { printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", __func__, irq, status); - return; + return -1; } /* @@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) printk(KERN_WARNING "%s: No online cpus in the mask %s for irq %d\n", __func__, cpulist, virq); - return; + return -1; } status = rtas_call(ibm_set_xive, 3, 1, NULL, @@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) if (status) { printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", __func__, irq, status); - return; + return -1; } + + return 0; } static struct irq_chip xics_pic_direct = { diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 21b95670159..f4cbd15cf22 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq) #endif /* CONFIG_SMP */ -void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); @@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), mpic_physmask(cpus_addr(tmp)[0])); } + + return 0; } static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type) diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 5deabe921a4..e5e78f9cfc9 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq) } } -static void sun4u_set_affinity(unsigned int 
virt_irq, +static int sun4u_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { sun4u_irq_enable(virt_irq); + + return 0; } /* Don't do anything. The desc->status check for IRQ_DISABLED in @@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq) ino, err); } -static void sun4v_set_affinity(unsigned int virt_irq, +static int sun4v_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { unsigned int ino = virt_irq_table[virt_irq].dev_ino; @@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq, if (err != HV_EOK) printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): " "err(%d)\n", ino, cpuid, err); + + return 0; } static void sun4v_irq_disable(unsigned int virt_irq) @@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq) dev_handle, dev_ino, err); } -static void sun4v_virt_set_affinity(unsigned int virt_irq, +static int sun4v_virt_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { unsigned long cpuid, dev_handle, dev_ino; @@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq, printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): " "err(%d)\n", dev_handle, dev_ino, cpuid, err); + + return 0; } static void sun4v_virq_disable(unsigned int virt_irq) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fbf0f7ec7e..5c7630b40a5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -574,13 +574,14 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); } -static void +static int set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; unsigned int irq; + int ret = -1; irq = desc->irq; cfg = desc->chip_data; @@ -591,18 +592,21 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) /* Only the high 8 bits are valid. 
*/ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; } -static void +static int set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; desc = irq_to_desc(irq); - set_ioapic_affinity_irq_desc(desc, mask); + return set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -2348,24 +2352,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. */ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2381,27 +2386,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. 
*/ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -3318,7 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3327,7 +3335,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3339,13 +3347,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. 
*/ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3354,11 +3364,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3375,6 +3385,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3528,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3537,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3549,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3582,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3591,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3603,6 +3617,8 @@ static void 
hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3659,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3667,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 73348c4047e..4a9cc92d4d1 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void iosapic_set_affinity_irq(unsigned int irq, +static int iosapic_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { struct vector_info *vi = iosapic_get_vector(irq); @@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq, dest_cpu = cpu_check_affinity(irq, dest); if (dest_cpu < 0) - return; + return -1; cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu)); vi->txn_addr = txn_affinity_addr(irq, dest_cpu); @@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq, iosapic_set_irt_data(vi, &dummy_d0, &d1); iosapic_wr_irt_entry(vi, d0, d1); spin_unlock_irqrestore(&iosapic_lock, flags); + + return 0; } #endif diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 30963af5dba..33389880279 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -688,13 +688,13 @@ void rebind_evtchn_irq(int evtchn, int irq) } /* Rebind an evtchn so that it gets 
delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) - return; + return -1; /* Send future instances of this interrupt to other vcpu. */ bind_vcpu.port = evtchn; @@ -707,13 +707,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) */ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); + + return 0; } - -static void set_affinity_irq(unsigned irq, const struct cpumask *dest) +static int set_affinity_irq(unsigned irq, const struct cpumask *dest) { unsigned tcpu = cpumask_first(dest); - rebind_irq_to_cpu(irq, tcpu); + + return rebind_irq_to_cpu(irq, tcpu); } int resend_irq_on_evtchn(unsigned int irq) diff --git a/include/linux/irq.h b/include/linux/irq.h index 2a34cd6281d..8e4c18b2915 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -117,7 +117,7 @@ struct irq_chip { void (*eoi)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, + int (*set_affinity)(unsigned int irq, const struct cpumask *dest); int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); From 57b150cce8e004ddd36330490a68bfb59b7271e9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:59:53 -0700 Subject: [PATCH 269/900] irq: only update affinity if ->set_affinity() is successful irq_set_affinity() and move_masked_irq() try to assign affinity before calling chip set_affinity(). Some archs are assigning it in ->set_affinity() again. We do something like: cpumask_cpy(desc->affinity, mask); desc->chip->set_affinity(mask); But in the failure path, affinity should not be touched - otherwise we'll end up with a different affinity mask despite the failure to migrate the IRQ. So try to update the affinity only if set_affinity returns with 0.
Also call irq_set_thread_affinity accordingly. v2: update after "irq, x86: Remove IRQ_DISABLED check in process context IRQ move" v3: according to Ingo, change set_affinity() in irq_chip should return int. v4: update comments by removing moving irq_desc code. [ Impact: fix /proc/irq/*/smp_affinity setting corner case bug ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F65509.60307@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 17 +++++++++++------ kernel/irq/migration.c | 14 +++++++++----- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38..de5f412f6a9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca5924..aaf5c9d0577 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -static void +void irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) { struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) - desc->chip->set_affinity(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } + } else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(desc->affinity, 
cpumask); - desc->chip->set_affinity(irq, cpumask); + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } #endif - irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b..cfe767ca154 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,5 +1,8 @@ #include +#include + +#include "internals.h" void move_masked_irq(int irq) { @@ -39,11 +42,12 @@ void move_masked_irq(int irq) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - cpumask_and(desc->affinity, - desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, desc->affinity); - } + < nr_cpu_ids)) + if (!desc->chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->affinity, desc->pending_mask); + irq_set_thread_affinity(desc, desc->pending_mask); + } + cpumask_clear(desc->pending_mask); } From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:00:38 -0700 Subject: [PATCH 270/900] x86/irq: change irq_desc_alloc() to take node instead of cpu This simplifies the node awareness of the code. All our allocators only deal with a NUMA node ID locality not with CPU ids anyway - so there's no need to maintain (and transform) a CPU id all across the IRQ layer. v2: keep move_irq_desc related [ Impact: cleanup, prepare IRQ code to be NUMA-aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W.
Biederman" Cc: Rusty Russell Cc: Jeremy Fitzhardinge LKML-Reference: <49F65536.2020300@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++------------------- arch/x86/lguest/boot.c | 2 +- drivers/pci/intr_remapping.c | 15 ++++----- drivers/xen/events.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/irq.h | 16 ++++------ kernel/irq/handle.c | 28 +++++++--------- kernel/irq/internals.h | 2 +- kernel/irq/numa_migrate.c | 36 +++++++-------------- kernel/softirq.c | 2 +- 10 files changed, 66 insertions(+), 97 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c7630b40a5..560b887ba27 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -129,12 +129,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -209,12 +206,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -235,13 +229,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -253,7 +247,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) /* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct 
irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -262,7 +256,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -272,7 +266,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -312,12 +306,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -328,7 +322,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -615,13 +609,13 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -641,7 +635,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -650,7 +644,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -670,7 +664,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? 
*/ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -1612,7 +1606,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1647,13 +1641,13 @@ static void __init setup_IO_APIC_irqs(void) apic->multi_timer_check(apic_id, irq)) continue; - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + add_pin_to_irq_node(cfg, node, apic_id, pin); setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); @@ -2863,7 +2857,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2929,7 +2923,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? 
*/ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -2966,7 +2960,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3185,7 +3179,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3194,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3968,7 +3962,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p { struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3976,7 +3970,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p return -EINVAL; } - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); return 0; @@ -3987,7 +3981,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + add_pin_to_irq_node(cfg, node, ioapic, pin); } 
setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ca7ec44bafc..45acbcf2568 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -636,7 +636,7 @@ static void __init lguest_init_IRQ(void) void lguest_setup_irq(unsigned int irq) { - irq_to_desc_alloc_cpu(irq, 0); + irq_to_desc_alloc_node(irq, 0); set_irq_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f5e0ea724a6..9eff36a293e 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -23,15 +23,12 @@ struct irq_2_iommu { }; #ifdef CONFIG_GENERIC_HARDIRQS -static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) +static struct irq_2_iommu *get_one_free_irq_2_iommu(int node) { struct irq_2_iommu *iommu; - int node; - - node = cpu_to_node(cpu); iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node); - printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node); + printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node); return iommu; } @@ -48,7 +45,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq) return desc->irq_2_iommu; } -static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) +static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; struct irq_2_iommu *irq_iommu; @@ -56,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) /* * alloc irq desc if not allocated already. 
*/ - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); return NULL; @@ -65,14 +62,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) irq_iommu = desc->irq_2_iommu; if (!irq_iommu) - desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu); + desc->irq_2_iommu = get_one_free_irq_2_iommu(node); return desc->irq_2_iommu; } static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { - return irq_2_iommu_alloc_cpu(irq, boot_cpu_id); + return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id)); } #else /* !CONFIG_SPARSE_IRQ */ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 33389880279..be437c2bc94 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -335,7 +335,7 @@ static int find_unbound_irq(void) if (irq == nr_irqs) panic("No available IRQ to bind to: increase nr_irqs!\n"); - desc = irq_to_desc_alloc_cpu(irq, 0); + desc = irq_to_desc_alloc_node(irq, 0); if (WARN_ON(desc == NULL)) return -1; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 91bb76f44f1..ff374ceface 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -566,6 +566,6 @@ struct irq_desc; extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); -extern int arch_init_chip_data(struct irq_desc *desc, int cpu); +extern int arch_init_chip_data(struct irq_desc *desc, int node); #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 8e4c18b2915..a09baf8f9d9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -187,7 +187,7 @@ struct irq_desc { spinlock_t lock; #ifdef CONFIG_SMP cpumask_var_t affinity; - unsigned int cpu; + unsigned int node; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif @@ -201,16 +201,16 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; extern void arch_init_copy_chip_data(struct irq_desc 
*old_desc, - struct irq_desc *desc, int cpu); + struct irq_desc *desc, int node); extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; #else /* CONFIG_SPARSE_IRQ */ -extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); #endif /* CONFIG_SPARSE_IRQ */ -extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); /* * Migration helpers for obsolete names, they will go away: @@ -422,12 +422,10 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); * Allocates affinity and pending_mask cpumask if required. * Returns true if successful (or not required). */ -static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, bool boot) { #ifdef CONFIG_CPUMASK_OFFSTACK - int node; - if (boot) { alloc_bootmem_cpumask_var(&desc->affinity); @@ -437,8 +435,6 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, return true; } - node = cpu_to_node(cpu); - if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) return false; @@ -494,7 +490,7 @@ static inline void free_desc_masks(struct irq_desc *old_desc, #else /* !CONFIG_SMP */ -static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, bool boot) { return true; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3e0cbc44bd7..a6368db2618 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -81,12 +81,10 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void 
*ptr; - node = cpu_to_node(cpu); ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); /* @@ -94,33 +92,32 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } init_desc_masks(desc); - arch_init_chip_data(desc, cpu); + arch_init_chip_data(desc, node); } /* @@ -189,11 +186,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -212,15 +208,13 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - 
init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -270,7 +264,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index de5f412f6a9..73468253143 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ce72bc3f4ce..2f69bee57bf 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; 
+ desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -107,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..f674f332a02 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -828,7 +828,7 @@ int __init __weak 
arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } From a2f809b08ae4dddc1015c7dcd8659e5729e45b3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:20 -0700 Subject: [PATCH 271/900] irq: change ACPI GSI APIs to also take a device argument We want to use dev_to_node() later on, to be aware of the 'home node' of the GSI in question. [ Impact: cleanup, prepare the IRQ code to be more NUMA aware ] Signed-off-by: Yinghai Lu Acked-by: Len Brown Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Len Brown Cc: Bjorn Helgaas Cc: Tony Luck Cc: linux-acpi@vger.kernel.org Cc: linux-ia64@vger.kernel.org LKML-Reference: <49F65560.20904@kernel.org> Signed-off-by: Ingo Molnar --- arch/ia64/kernel/acpi.c | 5 +++-- arch/x86/include/asm/io_apic.h | 4 ++-- arch/x86/include/asm/mpspec.h | 4 +++- arch/x86/kernel/acpi/boot.c | 8 ++++---- arch/x86/kernel/apic/io_apic.c | 3 ++- drivers/acpi/pci_irq.c | 5 +++-- drivers/char/hpet.c | 4 ++-- drivers/pnp/pnpacpi/rsparser.c | 2 +- include/linux/acpi.h | 2 +- 9 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 5510317db37..baec6f00f7f 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) return gsi; @@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) fadt = (struct acpi_table_fadt *)fadt_header; - acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); + acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, + 
ACPI_ACTIVE_LOW); return 0; } diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9d826e43601..07f2913ba5d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -154,8 +154,8 @@ extern int timer_through_8259; extern int io_apic_get_unique_id(int ioapic, int apic_id); extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); -extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, - int edge_level, int active_high_low); +extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, + int irq, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ extern int (*ioapic_renumber_irq)(int ioapic, int irq); diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 642fc7fc8cd..3ea1f531f53 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -72,7 +72,9 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); -extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +struct device; +extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, + int active_high_low); extern int acpi_probe_gsi(void); #ifdef CONFIG_X86_IO_APIC extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f80..6ee96b5530f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -522,7 +522,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -539,7 +539,7 @@ int 
acpi_register_gsi(u32 gsi, int triggering, int polarity) #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1158,7 +1158,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; @@ -1253,7 +1253,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) } } #endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 560b887ba27..d9346622601 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3958,7 +3958,8 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) { struct irq_desc *desc; struct irq_cfg *cfg; diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 51b9f8280f8..2faa9e2ac89 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev) /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF)) { printk(" - using IRQ %d\n", dev->irq); - acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE, + acpi_register_gsi(&dev->dev, dev->irq, + ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); return 0; } else { @@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } } - rc = acpi_register_gsi(gsi, triggering, polarity); + rc = 
acpi_register_gsi(&dev->dev, gsi, triggering, polarity); if (rc < 0) { dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n", pin_name(pin)); diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 340ba4f9dc5..4a9f3492b92 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp) break; } - gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE, + gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); if (gsi > 0) break; @@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data) irqp = &res->data.extended_irq; for (i = 0; i < irqp->interrupt_count; i++) { - irq = acpi_register_gsi(irqp->interrupts[i], + irq = acpi_register_gsi(NULL, irqp->interrupts[i], irqp->triggering, irqp->polarity); if (irq < 0) return AE_ERROR; diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index adf17856bac..7f207f335be 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev, } flags = irq_flags(triggering, polarity, shareable); - irq = acpi_register_gsi(gsi, triggering, polarity); + irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity); if (irq >= 0) pcibios_penalize_isa_irq(irq, 1); else diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 88be890ee3c..51b4b0a5ce8 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num; extern int sbf_port; extern unsigned long acpi_realmode_flags; -int acpi_register_gsi (u32 gsi, int triggering, int polarity); +int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); #ifdef CONFIG_X86_IO_APIC From 024154cfdd802654cb236a18c78b6e37351e2c49 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:50 -0700 Subject: [PATCH 
272/900] irq: change io_apic_set_pci_routing() to use device parameter Make actual use of the device parameter passed down to io_apic_set_pci_routing() - to have the IRQ descriptor on the home node of the device. If no device has been passed down, we assume it's a platform device and use the boot node ID for the IRQ descriptor. [ Impact: optimization, make IO-APIC code more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F6557E.3080101@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9346622601..82376e021b5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3963,7 +3963,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, { struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3971,6 +3971,11 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, return -EINVAL; } + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); From d047f53a2ecce37e3bdf79eac5a326fbaadb3628 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:02:23 -0700 Subject: [PATCH 273/900] x86/irq: change MSI irq_desc to be more numa aware Try to get irq_desc on the home node in create_irq_nr(). v2: don't check if we can move it when sparse_irq is not used v3: use move_irq_desc(), if that node is not what we want [ Impact: optimization, make MSI IRQ descriptors more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W.
Biederman" Cc: Rusty Russell LKML-Reference: <49F6559F.7070005@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 17 +++++++++++++---- include/linux/irq.h | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82376e021b5..9cd4806cdf5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3172,14 +3172,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3197,6 +3196,13 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + +#ifdef CONFIG_NUMA_IRQ_DESC + /* different node ?*/ + if (desc_new->node != node) + desc = move_irq_desc(desc, node); +#endif + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3214,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3476,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; diff --git 
a/include/linux/irq.h b/include/linux/irq.h index a09baf8f9d9..4b95ddb5304 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -376,7 +376,7 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ -extern unsigned int create_irq_nr(unsigned int irq_want); +extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); extern void destroy_irq(unsigned int irq); From 56b581ea9591b5767b1e0204c6a06c7d0c49396e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:02:46 -0700 Subject: [PATCH 274/900] irq: make ht irq_desc more numa aware Try to get irq_desc on the same node as create_irq_nr(). [ Impact: optimization, make HT IRQs more NUMA-aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F655B6.8020109@kernel.org> Signed-off-by: Ingo Molnar --- drivers/pci/htirq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index bf7d6ce9bbb..4e9dd0fe274 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) int max_irq; int pos; int irq; + int node; pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); if (!pos) @@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; - irq = create_irq(); + node = dev_to_node(&dev->dev); + irq = create_irq_nr(0, node); if (irq <= 0) { kfree(cfg); From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Apr 2009 11:39:34 -0400 Subject: [PATCH 275/900] tracing: convert ftrace_dump spinlocks to raw ftrace_dump is used for printing out the contents of the ftrace ring buffer to the console on failure. 
Currently it uses a spinlock to synchronize the output from multiple failures on different CPUs. This spin lock currently is a normal spinlock and can cause issues with lockdep and lock tracing. This patch converts it to raw since it is for error handling only. The lock is local to the ftrace_dump and is not used by any other infrastructure. [ Impact: prevent ftrace_dump from locking up by internal tracing ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6183bc9eca..5d704a41f83 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static DEFINE_SPINLOCK(ftrace_dump_lock); + static raw_spinlock_t ftrace_dump_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing) int cnt = 0, cpu; /* only one dump */ - spin_lock_irqsave(&ftrace_dump_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing) } out: - spin_unlock_irqrestore(&ftrace_dump_lock, flags); + __raw_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); } /* By default: disable tracing after the dump */ From edc953fa4ebc0265ef3b1754fe116a9fd4264e15 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 28 Apr 2009 11:13:46 -0400 Subject: [PATCH 276/900] x86: clean up alternative.h Alternative header duplicates assembly that could be merged in one single macro. Merging this into this macro also allows to directly declare ALTERNATIVE() statements within assembly code. Uses a __stringify() of the feature bits rather than passing a "i" operand. 
Leave the old %0 operand as-is (set to 0), unused to stay compatible with API. (v2: tab alignment fixes) [ Impact: cleanup ] Signed-off-by: Mathieu Desnoyers LKML-Reference: <20090428151346.GA31212@Krystal> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 59 ++++++++++++------------------ 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f6aa18eadf7..1a37bcdc860 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -3,6 +3,7 @@ #include #include +#include #include /* @@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {} const unsigned char *const *find_nop_table(void); +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, newinstr, feature) \ + \ + "661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" + /* * Alternative instructions for different CPU types or capabilities. * @@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void); * without volatile and memory clobber. 
*/ #define alternative(oldinstr, newinstr, feature) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature) : "memory") + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") /* * Alternative inline assembly with input. @@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void); * Best is to use constraints that are fixed size (like (%1) ... "r") * If you use variable sized constraints like "m" or "g" in the * replacement make sure to pad to the worst case length. + * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) 
\ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c[feat]\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" : output : [feat] "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : output : "i" (0), ## input) /* * use this macro(s) if you need more than one output parameter From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:16:21 -0400 Subject: [PATCH 277/900] tracing: fix ref count in splice pages The pages allocated for the splice binary buffer did not initialize the ref count correctly. This caused pages not to be freed and causes a drastic memory leak. Thanks to logdev I was able to trace the tracer to find where the leak was. [ Impact: stop memory leak when using splice ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d704a41f83..9058240c85c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:23:13 -0400 Subject: [PATCH 278/900] tracing: only add splice page if entries exist The splice code allocates a page even when the ring buffer is empty. It detects the ring buffer being empty when it fails to copy anything from the ring buffer into the page.
This patch adds a check to see if there is anything in the ring buffer before allocating a page. Thanks to logdev for letting me trace the tracer to find this. [ Impact: speed up due to removing unnecessary allocation ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9058240c85c..0aeb3b93414 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int size, i; + int entries, size, i; size_t ret; if (*ppos & (PAGE_SIZE - 1)) { @@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } spd.nr_pages = i; From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:26:30 -0400 Subject: [PATCH 279/900] tracing: have splice only copy full pages Splice works with pages, it is much more efficient to use an entire page than to copy bits over several pages. Using logdev to trace the internals of the splice mechanism, I was able to see that splice can be very aggressive. When tracing is occurring, and the reader caught up to the writer, and the writer is on the reader page, the reader will copy what is there into the splice page.
Splice may iterate over several pages and if the writer is still writing to the page, the reader will keep copying bits to new pages to pass to userspace. This patch changes it to only pass data to userspace if the page is full (the writer has left the page). This has a small side effect that splice can not read a partial page, and must wait for the page to fill. This should not be an issue. If tracing has stopped, then a use of "read" will still read all of the page. [ Impact: better performance for ring buffer splice code ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0aeb3b93414..f5427e0fc98 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 0); + len, info->cpu, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 27 Apr 2009 12:37:49 -0400 Subject: [PATCH 280/900] ring-buffer: fix printk output The warning output in trace_recursive_lock uses %d for a long when it should be %ld. 
[ Impact: fix compile warning ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9692f100ec1..f4cc59040eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void) /* Disable all tracing before we do anything else */ tracing_off_permanent(); - printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" "HC[%lu]:SC[%lu]:NMI[%lu]\n", current->trace_recursion, hardirq_count() >> HARDIRQ_SHIFT, From aee6a166a5401dcfcb17fcdc055e5edf2a4f4042 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:17 +0200 Subject: [PATCH 281/900] x86: beautify vmlinux_32.lds.S Beautify vmlinux_32.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments To see actual differences use "git diff -b" which ignore 'whitespace' changes. The beautification is done to prepare a unification of the _32 and _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-1-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_32.lds.S | 374 ++++++++++++++++--------------- 1 file changed, 199 insertions(+), 175 deletions(-) diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 62ad500d55f..fffa45a1036 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -22,196 +22,220 @@ ENTRY(phys_startup_32) jiffies = jiffies_64; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . 
= LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 + /* Text and read-only data */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 + /* read-only */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 - NOTES :text :note + NOTES :text :note - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 - RODATA + RODATA - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS + /* writeable */ + . = ALIGN(PAGE_SIZE); + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS } :data - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . 
= ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . 
= ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; } - STABS_DEBUG + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } - DWARF_DEBUG + . = ALIGN(32); + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ + . 
= ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + + /* End of data section */ + _edata = .; + } + + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } + + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + } + /* will be freed after init + * Following ALIGN() is required to make sure no other data falls on the + * same page where __smp_alt_end is pointing as that page might be freed + * after boot. Always make sure that ALIGN() directive is present after + * the section which contains __smp_alt_end. + */ + . = ALIGN(PAGE_SIZE); + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(4); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + . 
= ALIGN(4); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#if defined(CONFIG_BLK_DEV_INITRD) + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif + + PERCPU(PAGE_SIZE) + + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + + /* BSS */ + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } + + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = . ; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG } /* From 17ce265d6a1789eae5eb739a3bb7fcffdb3e87c5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:18 +0200 Subject: [PATCH 282/900] x86, vmlinux.lds: unify header/footer Merge everything except PHDRS and SECTIONS into vmlinux.lds.S. 
[ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 77 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 --------------- arch/x86/kernel/vmlinux_64.lds.S | 42 ----------------- 3 files changed, 77 insertions(+), 79 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f01..d113642c134 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,82 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares + * + * Modernisation and unification done by Sam Ravnborg + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + +#ifdef CONFIG_X86_32 +#define LOAD_OFFSET __PAGE_OFFSET +#else +#define LOAD_OFFSET __START_KERNEL_map +#endif + +#include +#include +#include +#include +#include +#include + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. 
+ */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index fffa45a1036..4c985fcd9ab 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,26 +1,3 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares ; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. 
- */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include -#include -#include -#include -#include - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -237,17 +214,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6d5a5b05eaa..7f1cc3d5fef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,19 +1,3 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares ; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include -#include -#include - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -308,29 +292,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. 
- */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif From afb8095a7eab32e5760613fa73d2f80a39cc45bf Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:19 +0200 Subject: [PATCH 283/900] x86, vmlinux.lds: unify PHDRS PHDRS are not equal for the two - so use ifdefs to cover up for that. On the assumption that they may become equal the ifdef is inside the PHDRS definition. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-3-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 13 +++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 5 ----- arch/x86/kernel/vmlinux_64.lds.S | 11 ----------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d113642c134..1a1b303a427 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -40,6 +40,19 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 
4c985fcd9ab..4fd40dc5017 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 7f1cc3d5fef..6e7cbee0e87 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,14 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = __START_KERNEL; From 444e0ae4831f99ba25062d9a5ccb7117c62841a0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:20 +0200 Subject: [PATCH 284/900] x86, vmlinux.lds: unify start/end of SECTIONS [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-4-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 14 ++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 9 --------- arch/x86/kernel/vmlinux_64.lds.S | 9 --------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1a1b303a427..845776fe529 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -54,12 +54,26 @@ PHDRS { note PT_NOTE FLAGS(0); /* ___ */ } +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . 
= __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + STABS_DEBUG + DWARF_DEBUG +} + #ifdef CONFIG_X86_32 ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4fd40dc5017..3d3d49c31b0 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - /* Text and read-only data */ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { _text = .; @@ -205,7 +200,3 @@ SECTIONS *(.exitcall.exit) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6e7cbee0e87..2d7fa2016c3 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; @@ -277,7 +272,3 @@ SECTIONS *(.eh_frame) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} From dfc20895d944cfa81d8ff00809b68ecb8f72cbb0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:21 +0200 Subject: [PATCH 285/900] x86, vmlinux.lds: unify .text output sections 32 bit x86 had a dedicated .text.head output section, whereas 64 bit had it all in a single output section. In the unified version the dedicated .text.head output section was kept to have full control over the head code. 32 bit: - Moved definition of _stext to the linker script. The definition is located _after_ .text.page_aligned as this is what 32 bit did before. The ALIGN(8) was introduced so we hit the exact same address (on the tested config) before and after the move. 
I assume that it is a bug that _stext did not cover the .text.page_aligned section - if this is true it can be fixed in a follow-up patch (and the ugly ALIGN() can be dropped). [ Impact: 64-bit: cleanup, 32-bit: use the 64-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-5-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 7 ------- arch/x86/kernel/vmlinux.lds.S | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 24 ------------------------ arch/x86/kernel/vmlinux_64.lds.S | 20 -------------------- 4 files changed, 31 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0c..dc5ed4bdd88 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,13 +608,6 @@ ignore_int: ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 845776fe529..a7c88bb4365 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -64,6 +64,37 @@ SECTIONS phys_startup_64 = startup_64 - LOAD_OFFSET; #endif + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . 
= ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 3d3d49c31b0..854009288ec 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,27 +1,3 @@ - /* Text and read-only data */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - /* not really needed, already page aligned */ - . = ALIGN(PAGE_SIZE); - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 2d7fa2016c3..b5d43670d80 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,23 +1,3 @@ - /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . 
= ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { From 448bc3ab0d03e77fee8e4264de0d001fc87bc164 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:22 +0200 Subject: [PATCH 286/900] x86, vmlinux.lds: unify exception table [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-6-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 10 ++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 10 ---------- arch/x86/kernel/vmlinux_64.lds.S | 10 ---------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a7c88bb4365..67164f6f092 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -94,6 +94,16 @@ SECTIONS NOTES :text :note + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 854009288ec..920cc6989cc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* writeable */ . = ALIGN(PAGE_SIZE); /* Data */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index b5d43670d80..641f3f991a0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . 
= ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* Align data segment to page size boundary */ . = ALIGN(PAGE_SIZE); /* Data */ From 1f6397bac55040cd520d9eaf299e155a7aa01d5f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:23 +0200 Subject: [PATCH 287/900] x86, vmlinux.lds: unify data output sections For 64 bit the following functional changes are introduced: - .data.page_aligned has moved - .data.cacheline_aligned has moved - .data.read_mostly has moved - ALIGN() moved out of output section for .data.cacheline_aligned - ALIGN() moved out of output section for .data.page_aligned Note that 32 bit and 64 bit have different locations for _edata. .data_nosave is 32 bit only as 64 bit is special due to PERCPU. [ Impact: 32-bit: cleanup, 64-bit: use 32-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-7-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 55 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 --------------------- arch/x86/kernel/vmlinux_64.lds.S | 28 ---------------- 3 files changed, 55 insertions(+), 65 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 67164f6f092..067bdb012da 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -104,6 +104,61 @@ SECTIONS RODATA + /* Data */ + . = ALIGN(PAGE_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + +#ifdef CONFIG_X86_64 + /* End of data section */ + _edata = .; +#endif + } :data + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } +#endif + + . 
= ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + +#ifdef CONFIG_X86_32 + /* End of data section */ + _edata = .; +#endif + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 920cc6989cc..8ade84687b2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,40 +1,3 @@ - /* writeable */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - - /* End of data section */ - _edata = .; - } - /* init_task */ . 
= ALIGN(THREAD_SIZE); .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 641f3f991a0..826270147b5 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,26 +1,3 @@ - /* Align data segment to page size boundary */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - /* End of data section */ - _edata = .; - } :data - - - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - #define VSYSCALL_ADDR (-10*1024*1024) #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ SIZEOF(.data.read_mostly) + 4095) & ~(4095)) @@ -95,11 +72,6 @@ *(.data.init_task) } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { /* might get freed after init */ . 
= ALIGN(PAGE_SIZE); From ff6f87e1626e10beef675084c9b5384a9477e3d5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:24 +0200 Subject: [PATCH 288/900] x86, vmlinux.lds: move vsyscall output sections [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-8-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 71 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_64.lds.S | 68 ------------------------------ 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 067bdb012da..b3106c2a037 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -159,6 +159,77 @@ SECTIONS #endif } +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 826270147b5..013aa0e1dd3 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,71 +1,3 @@ -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { - *(.vsyscall_0) - } :user - - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { - *(.vsyscall_fn) - } - - . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { - *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { - *(.vsyscall_2) - } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { . = ALIGN(THREAD_SIZE); From e58bdaa8f810332e5c1760ce496b01e07d51642c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:25 +0200 Subject: [PATCH 289/900] x86, vmlinux.lds: unify first part of initdata 32-bit: - Move definition of __init_begin outside output_section because it covers more than one section - Move ALIGN() for end-of-section inside .smp_locks output section. Same effect but the intent is better documented that we need both start and end aligned. 64-bit: - Move ALIGN() outside output section in .init.setup - Deleted unused __smp_alt_* symbols None of the above should result in any functional change. 
[ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-9-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 61 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 60 ------------------------------- arch/x86/kernel/vmlinux_64.lds.S | 59 ------------------------------ 3 files changed, 61 insertions(+), 119 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b3106c2a037..8b203c4ced9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -231,6 +231,67 @@ SECTIONS #endif /* CONFIG_X86_64 */ + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } +#ifdef CONFIG_X86_64 + :data.init +#endif + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . 
= ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 8ade84687b2..d8ba5394af0 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,63 +1,3 @@ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . 
= ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { __alt_instructions = .; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 013aa0e1dd3..0e8054e0c5c 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,62 +1,3 @@ - /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); - *(.data.init_task) - } :data.init - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - /* Init code and data */ - . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . 
= ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; From ae61836289a415351caa524d328110aaeae100d4 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:26 +0200 Subject: [PATCH 290/900] x86, vmlinux.lds: unify parainstructions 32 bit: - increase alignment from 4 to 8 for .parainstructions - increase alignment from 4 to 8 for .altinstructions 64 bit: - move ALIGN() outside output section for .altinstructions None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-10-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 18 ++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 18 ------------------ arch/x86/kernel/vmlinux_64.lds.S | 18 ------------------ 3 files changed, 18 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8b203c4ced9..c8dd71ecb56 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,6 +291,24 @@ SECTIONS SECURITY_INIT + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . 
= ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index d8ba5394af0..5df9647bb5d 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 0e8054e0c5c..9ef70966985 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . 
= ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame From bf6a57418d5445c98047edbec022c9e54d1526e6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:27 +0200 Subject: [PATCH 291/900] x86, vmlinux.lds: unify .exit.* and .init.ramfs [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-11-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 20 ++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 21 --------------------- arch/x86/kernel/vmlinux_64.lds.S | 21 --------------------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8dd71ecb56..1ab62a5fa1a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,26 @@ SECTIONS *(.altinstr_replacement) } + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . 
= ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 5df9647bb5d..36c8fbd3c76 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 9ef70966985..1aa53622333 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #ifdef CONFIG_SMP /* * percpu offsets are zero-based on SMP. 
PERCPU_VADDR() changes the From 9d16e78318f174fd4b07916a93e41749d5199267 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:28 +0200 Subject: [PATCH 292/900] x86, vmlinux.lds: unify percpu 32 bit: - move __init_end outside the .bss output section It really did not belong in there [ Impact: 64-bit: cleanup, 32-bit: refactor linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-12-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 6 ------ arch/x86/kernel/vmlinux_64.lds.S | 26 -------------------------- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ab62a5fa1a..1ea2b8571e1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -330,6 +330,36 @@ SECTIONS } #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) +#else + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + __init_end = .; + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . 
= ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 36c8fbd3c76..d23ee2c15c2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,11 +1,5 @@ - PERCPU(PAGE_SIZE) - - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - /* BSS */ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; __bss_start = .; *(.bss.page_aligned) *(.bss) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 1aa53622333..a53936696a0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,29 +1,3 @@ -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { . 
= ALIGN(PAGE_SIZE); __bss_start = .; /* BSS */ From 091e52c3551d3031343df24b573b770b4c6c72b6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:29 +0200 Subject: [PATCH 293/900] x86, vmlinux.lds: unify remaining parts 32 bit: - explicit page align .bss - move ALIGN() out of .brk output section - discard *(.eh_frame) 64 bit: - move ALIGN() out of .bss output section - move ALIGN() out of .brk output section - use a dedicated section to define _end [ Impact: unify and fix section alignments in linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-13-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 32 +++++++++++++++++++++++++++----- arch/x86/kernel/vmlinux_32.lds.S | 26 -------------------------- arch/x86/kernel/vmlinux_64.lds.S | 24 ------------------------ 3 files changed, 27 insertions(+), 55 deletions(-) delete mode 100644 arch/x86/kernel/vmlinux_32.lds.S delete mode 100644 arch/x86/kernel/vmlinux_64.lds.S diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ea2b8571e1..ef3e4f1042b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -359,12 +359,34 @@ SECTIONS /* use another section data.init2, see PERCPU_VADDR() above */ #endif + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } -#ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" -#else -# include "vmlinux_64.lds.S" -#endif + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . 
+= 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } STABS_DEBUG DWARF_DEBUG diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index d23ee2c15c2..00000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,26 +0,0 @@ - /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __bss_start = .; - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index a53936696a0..00000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,24 +0,0 @@ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - _end = . 
; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } From 91fd7fe809bdf4d8aa56559d17b9f25a1a6fe732 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 10:58:38 +0200 Subject: [PATCH 294/900] x86, vmlinux.lds: add copyright Acked-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ef3e4f1042b..0bdbaa57969 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -3,7 +3,8 @@ * * Historic 32-bit version written by Martin Mares * - * Modernisation and unification done by Sam Ravnborg + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg * * * Don't define absolute symbols until and unless you know that symbol From 0492e1bb8fe7d122901c9f3af75e537d4129712e Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:49 +0100 Subject: [PATCH 295/900] tracing: x86, mmiotrace: code consistency/legibility improvement kmmio_probe being *p and kmmio_fault_page being sometimes *f and sometimes *p is not helpful. 
[ Impact: cleanup ] Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-3-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 4f115e00486..869181a917d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -97,13 +97,13 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { struct list_head *head; - struct kmmio_fault_page *p; + struct kmmio_fault_page *f; page &= PAGE_MASK; head = kmmio_page_list(page); - list_for_each_entry_rcu(p, head, list) { - if (p->page == page) - return p; + list_for_each_entry_rcu(f, head, list) { + if (f->page == page) + return f; } return NULL; } @@ -439,12 +439,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; - while (p) { - struct kmmio_fault_page *next = p->release_next; - BUG_ON(p->count); - kfree(p); - p = next; + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; } kfree(dr); } @@ -453,19 +453,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) { struct kmmio_delayed_release *dr = container_of(head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page *f = dr->release_list; struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; spin_lock_irqsave(&kmmio_lock, flags); - while (p) { - if (!p->count) { - list_del_rcu(&p->list); - prevp = &p->release_next; + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; } else { - *prevp = p->release_next; + *prevp = f->release_next; 
} - p = p->release_next; + f = f->release_next; } spin_unlock_irqrestore(&kmmio_lock, flags); From 46e91d00b1165b14b484aa33800e1bba0794ae1a Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:50 +0100 Subject: [PATCH 296/900] tracing: x86, mmiotrace: refactor clearing/restore of page presence * change function names to clear_* from set_*: in reality we only clear and restore page presence, and never unconditionally set present. Using clear_*({true, false}, ...) is therefore more honest than set_*({false, true}, ...) * upgrade presence storage to pteval_t: doing user-space tracing will require saving and manipulation of the _PAGE_PROTNONE bit, in addition to the existing _PAGE_PRESENT changes, and having multiple bools stored and passed around does not seem optimal [ Impact: refactor, clean up mmiotrace code ] Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-4-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 869181a917d..a769d1a2d93 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -32,7 +32,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ - bool old_presence; /* page presence prior to arming */ + pteval_t old_presence; /* page presence prior to arming */ bool armed; /* @@ -108,49 +108,51 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { pmdval_t v = pmd_val(*pmd); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= 
~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pmd(pmd, __pmd(v)); } -static void set_pte_presence(pte_t *pte, bool present, bool *old) +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pte_atomic(pte, __pte(v)); } -static int set_page_presence(unsigned long addr, bool present, bool *old) +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(addr, &level); + pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", addr); + pr_err("kmmio: no pte for page 0x%08lx\n", f->page); return -1; } switch (level) { case PG_LEVEL_2M: - set_pmd_presence((pmd_t *)pte, present, old); + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); break; case PG_LEVEL_4K: - set_pte_presence(pte, present, old); + clear_pte_presence(pte, clear, &f->old_presence); break; default: pr_err("kmmio: unexpected page level 0x%x.\n", level); return -1; } - __flush_tlb_one(addr); + __flush_tlb_one(f->page); return 0; } @@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); if (f->armed) { pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, f->old_presence); + f->page, f->count, !!f->old_presence); } - ret = set_page_presence(f->page, false, &f->old_presence); + ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); f->armed = true; return ret; @@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) /** Restore the given page to saved 
presence state. */ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - bool tmp; - int ret = set_page_presence(f->page, f->old_presence, &tmp); + int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); f->armed = false; From 0f9a623dd6c9b5b4dd00c232f29525bfc7a8ecf2 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:51 +0100 Subject: [PATCH 297/900] tracing: x86, mmiotrace: only register for die notifier when tracer active Follow up to afcfe024aebd74b0984a41af9a34e009cf5badaf in Linus' tree ("x86: mmiotrace: quieten spurious warning message") Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-5-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 27 ++++++++++++++++++++++----- arch/x86/mm/mmio-mod.c | 2 ++ include/linux/mmiotrace.h | 2 ++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index a769d1a2d93..256ce643b0b 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -311,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + /* + * debug traps without an active context are due to either + * something external causing them (f.e. 
using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("kmmio: unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -529,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p) } EXPORT_SYMBOL(unregister_kmmio_probe); -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; @@ -545,11 +550,23 @@ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -static int __init init_kmmio(void) +int kmmio_init(void) { int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); } -fs_initcall(init_kmmio); /* should be before device_initcall() */ + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index c9342ed8b40..132772a8ec5 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -451,6 +451,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); @@ -473,6 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); + kmmio_cleanup(); pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 3d1b7bde128..97491f78b08 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -30,6 +30,8 @@ extern unsigned int kmmio_count; extern int register_kmmio_probe(struct kmmio_probe *p); extern void 
unregister_kmmio_probe(struct kmmio_probe *p); +extern int kmmio_init(void); +extern void kmmio_cleanup(void); #ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ From fd0731944333db6e9e91b6954c6ef95f4b71ab04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 12:56:58 +0200 Subject: [PATCH 298/900] x86, vmlinux.lds: fix relocatable symbols __init_begin/_end symbols should be inside sections as well, otherwise the relocatable kernel gets confused when freeing init sections in the wrong place. [ Impact: fix bootup crash ] Cc: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090429105056.GA28720@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0bdbaa57969..4c85b2e2bb6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -255,8 +255,8 @@ SECTIONS /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; @@ -346,8 +346,11 @@ SECTIONS #endif . 
= ALIGN(PAGE_SIZE); + /* freed after init ends here */ - __init_end = .; + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:47 -0500 Subject: [PATCH 299/900] tracing/filters: move preds into event_filter object Create a new event_filter object, and move the pred-related members out of the call and subsystem objects and into the filter object - the details of the filter implementation don't need to be exposed in the call and subsystem in any case, and it will also help make the new parser implementation a little cleaner. [ Impact: refactor trace-filter code to prepare for new features ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905887.6416.119.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- kernel/trace/trace.h | 10 ++- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_events_filter.c | 107 ++++++++++++++++++----------- 4 files changed, 76 insertions(+), 48 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 78a9ba24cbf..46a27f2695a 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -101,8 +101,8 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; - int n_preds; - struct filter_pred **preds; + int filter_active; + void *filter; void *mod; #ifdef CONFIG_EVENT_PROFILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7d55bcf50e4..1fb7d6ccadf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -731,12 +731,16 @@ struct ftrace_event_field { int size; }; +struct event_filter { + int n_preds; + struct filter_pred **preds; +}; + struct event_subsystem { struct list_head list; const char *name; 
struct dentry *entry; - int n_preds; - struct filter_pred **preds; + void *filter; }; struct filter_pred; @@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index be4d3a437c1..1cd1f37373d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); - system->preds = NULL; - system->n_preds = 0; + system->filter = NULL; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 65418288f95..1e861eca3d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { + struct event_filter *filter = call->filter; int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < call->n_preds; i++) { - pred = call->preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; if (and_failed && !pred->or) continue; matched = pred->fn(pred, rec); @@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct filter_pred **preds, int n_preds, +static void __filter_print_preds(struct event_filter *filter, struct trace_seq *s) { - char *field_name; struct filter_pred *pred; + char 
*field_name; int i; - if (!n_preds) { + if (!filter || !filter->n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < n_preds; i++) { - pred = preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; field_name = pred->field_name; if (i) trace_seq_printf(s, pred->or ? "|| " : "&& "); @@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds, void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(call->preds, call->n_preds, s); + __filter_print_preds(call->filter, s); mutex_unlock(&filter_mutex); } @@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(system->preds, system->n_preds, s); + __filter_print_preds(system->filter, s); mutex_unlock(&filter_mutex); } @@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest, static void __filter_disable_preds(struct ftrace_event_call *call) { + struct event_filter *filter = call->filter; int i; - call->n_preds = 0; + call->filter_active = 0; + filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) - call->preds[i]->fn = filter_pred_none; + filter->preds[i]->fn = filter_pred_none; } void filter_disable_preds(struct ftrace_event_call *call) @@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call) int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; struct filter_pred *pred; int i; - call->n_preds = 0; - - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!call->preds) + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) return -ENOMEM; + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; + for (i = 0; i < MAX_FILTER_PRED; i++) { pred = 
kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; - call->preds[i] = pred; + filter->preds[i] = pred; } return 0; oom: for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) - filter_free_pred(call->preds[i]); + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } - kfree(call->preds); - call->preds = NULL; + kfree(filter->preds); + kfree(call->filter); + call->filter = NULL; return -ENOMEM; } @@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds); static void __filter_free_subsystem_preds(struct event_subsystem *system) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (system->n_preds) { - for (i = 0; i < system->n_preds; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - system->n_preds = 0; + if (filter && filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + kfree(filter); + system->filter = NULL; } list_for_each_entry(call, &ftrace_events, list) { @@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { + struct event_filter *filter = call->filter; int idx, err; - if (call->n_preds && !pred->compound) + if (filter->n_preds && !pred->compound) __filter_disable_preds(call); - if (call->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) return -ENOSPC; - idx = call->n_preds; - filter_clear_pred(call->preds[idx]); - err = filter_set_pred(call->preds[idx], pred, fn); + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); if (err) return err; - call->n_preds++; + filter->n_preds++; + call->filter_active = 1; return 0; } @@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { 
+ struct event_filter *filter = system->filter; struct ftrace_event_call *call; mutex_lock(&filter_mutex); - if (system->n_preds && !pred->compound) + if (filter && filter->n_preds && !pred->compound) { __filter_free_subsystem_preds(system); + filter = NULL; + } - if (!system->n_preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + if (!filter) { + system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!system->filter) { + mutex_unlock(&filter_mutex); + return -ENOMEM; + } + filter = system->filter; + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) { + + if (!filter->preds) { + kfree(system->filter); + system->filter = NULL; mutex_unlock(&filter_mutex); return -ENOMEM; } } - if (system->n_preds == MAX_FILTER_PRED) { + if (filter->n_preds == MAX_FILTER_PRED) { mutex_unlock(&filter_mutex); return -ENOSPC; } - system->preds[system->n_preds] = pred; - system->n_preds++; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system, err = __filter_add_pred(call, pred); if (err == -ENOMEM) { - system->preds[system->n_preds] = NULL; - system->n_preds--; + filter->preds[filter->n_preds] = NULL; + filter->n_preds--; mutex_unlock(&filter_mutex); return err; } From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:53 -0500 Subject: [PATCH 300/900] tracing/filters: distinguish between signed and unsigned fields The new filter comparison ops need to be able to distinguish between signed and unsigned field types, so add an is_signed flag/param to the event field struct/trace_define_fields(). Also define a simple macro, is_signed_type() to determine the signedness at compile time, used in the trace macros. 
If the is_signed_type() macro won't work with a specific type, a new slightly modified version of TRACE_FIELD() called TRACE_FIELD_SIGN(), allows the signedness to be set explicitly. [ Impact: extend trace-filter code for new feature ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905893.6416.120.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 7 ++++--- include/trace/ftrace.h | 16 ++++++++-------- kernel/trace/trace.h | 1 + kernel/trace/trace_event_types.h | 4 ++-- kernel/trace/trace_events.c | 3 ++- kernel/trace/trace_export.c | 29 ++++++++++++++++++++++------- 6 files changed, 39 insertions(+), 21 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 46a27f2695a..e61a7403f3d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -122,8 +122,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, struct ring_buffer_event *event); extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); + char *name, int offset, int size, int is_signed); +#define is_signed_type(type) (((type)(-1)) < 0) /* * The double __builtin_constant_p is because gcc will give us an error @@ -144,10 +145,10 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#define __common_field(type, item) \ +#define __common_field(type, item, is_signed) \ ret = trace_define_field(event_call, #type, "common_" #item, \ offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ + sizeof(field.ent.item), is_signed); \ if (ret) \ return ret; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1e681142f1d..edb02bc9f8f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -225,7 +225,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ 
- sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -234,7 +234,7 @@ ftrace_format_##call(struct trace_seq *s) \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ if (ret) \ return ret; @@ -242,7 +242,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __string(item, src) \ ret = trace_define_field(event_call, "__str_loc", #item, \ offsetof(typeof(field), __str_loc_##item), \ - sizeof(field.__str_loc_##item)); + sizeof(field.__str_loc_##item), 0); #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ @@ -253,11 +253,11 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(int, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(int, type, 1); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1fb7d6ccadf..866d0108fd2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -729,6 +729,7 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; struct event_filter { diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index cfcecc4fd86..5e32e375134 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + 
TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1cd1f37373d..bbbea747937 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0cb1a142c74..d06cf898dc8 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -50,6 +50,9 @@ extern void __bad_type_size(void); if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) 
cmd @@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ if (ret) \ return ret; @@ -173,11 +188,11 @@ ftrace_define_fields_##call(void) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:59 -0500 Subject: [PATCH 301/900] tracing/filters: a better event parser Replace the current event parser hack with a better one. 
Filters are no longer specified predicate by predicate, but all at once and can use parens and any of the following operators: numeric fields: ==, !=, <, <=, >, >= string fields: ==, != predicates can be combined with the logical operators: &&, || examples: "common_preempt_count > 4" > filter "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter If there was an error, the erroneous string along with an error message can be seen by looking at the filter e.g.: ((sig >= 10 && sig < 15) || dsig == 17) && comm != bash ^ parse_error: Field not found Currently the caret for an error always appears at the beginning of the filter; a real position should be used, but the error message should be useful even without it. To clear a filter, '0' can be written to the filter file. Filters can also be set or cleared for a complete subsystem by writing the same filter as would be written to an individual event to the filter file at the root of the subsystem. Note however, that if any event in the subsystem lacks a field specified in the filter being set, the set will fail and all filters in the subsystem are automatically cleared. This change from the previous version was made because using only the fields that happen to exist for a given event would most likely result in a meaningless filter. Because the logical operators are now implemented as predicates, the maximum number of predicates in a filter was increased from 8 to 32.
[ Impact: add new, extended trace-filter implementation ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905899.6416.121.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.h | 66 +- kernel/trace/trace_events.c | 90 +-- kernel/trace/trace_events_filter.c | 1044 +++++++++++++++++++++------- 4 files changed, 898 insertions(+), 304 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index e61a7403f3d..5fff40c9ff5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -112,7 +112,7 @@ struct ftrace_event_call { #endif }; -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 866d0108fd2..7736fe8c1b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -735,6 +735,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; struct filter_pred **preds; + char *filter_string; }; struct event_subsystem { @@ -746,7 +747,8 @@ struct event_subsystem { struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; @@ -756,23 +758,18 @@ struct filter_pred { char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_disable_preds(struct ftrace_event_call *call); 
-extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern void filter_print_subsystem_preds(struct event_subsystem *system, +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbbea747937..f789ca540fe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call, s); + print_event_filter(call, 
s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; + } + buf[cnt] = '\0'; + + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } - - if (pred->clear) { - filter_disable_preds(call); - filter_free_pred(pred); - return cnt; - } - - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - filter_free_pred(pred); *ppos += cnt; @@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_subsystem_preds(system, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - 
filter_free_pred(pred); - return err; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; - } - - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->filter = NULL; + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); - if (!entry) + if (!entry) { + kfree(system->filter); + system->filter = NULL; pr_warning("Could not create debugfs " "'%s/filter' entry\n", name); + } return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1e861eca3d0..f49486687ee 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -29,51 +29,130 @@ static DEFINE_MUTEX(filter_mutex); -static int filter_pred_64(struct filter_pred *pred, void *event) +enum filter_op_ids { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; - match = (val == *addr) ^ pred->not; +struct filter_op { + int id; + char *string; + int precedence; +}; - return match; +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, 
+}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) +{ + return val1 && val2; } -static int filter_pred_32(struct filter_pred *pred, void *event) +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; 
- int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 || val2; } -static int filter_pred_16(struct filter_pred *pred, void *event) -{ - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_8(struct filter_pred *pred, void *event) -{ - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_string(struct filter_pred *pred, void *event) +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -static int filter_pred_none(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) { return 0; } @@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event) int filter_match_preds(struct ftrace_event_call *call, void *rec) { struct event_filter *filter = call->filter; - int i, matched, and_failed = 0; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int i; for (i = 0; i < filter->n_preds; i++) { pred = filter->preds[i]; - if (and_failed && !pred->or) + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; } - if (and_failed) - return 0; - - return 1; + return stack[--top]; } EXPORT_SYMBOL_GPL(filter_match_preds); 
-static void __filter_print_preds(struct event_filter *filter, - struct trace_seq *s) +static void parse_error(struct filter_parse_state *ps, int err, int pos) { - struct filter_pred *pred; - char *field_name; - int i; - - if (!filter || !filter->n_preds) { - trace_seq_printf(s, "none\n"); - return; - } - - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_len) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } + ps->lasterr = err; + ps->lasterr_pos = pos; } -void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +static void remove_filter_string(struct event_filter *filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return; + + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > 
PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); +} + +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +{ + struct event_filter *filter = call->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(call->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } -void filter_print_subsystem_preds(struct event_subsystem *system, +void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { + struct event_filter *filter = system->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(system->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } @@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; @@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest, filter_pred_fn_t fn) { *dest = *src; - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + } dest->fn = fn; return 0; } -static void __filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; @@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void filter_disable_preds(struct ftrace_event_call *call) -{ - mutex_lock(&filter_mutex); - 
__filter_disable_preds(call); - mutex_unlock(&filter_mutex); -} - int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -258,48 +386,43 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void __filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (filter && filter->n_preds) { + if (filter->n_preds) { for (i = 0; i < filter->n_preds; i++) filter_free_pred(filter->preds[i]); kfree(filter->preds); - kfree(filter); - system->filter = NULL; + filter->preds = NULL; + filter->n_preds = 0; } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; - if (!strcmp(call->system, system->name)) - __filter_disable_preds(call); + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); + } } } -void filter_free_subsystem_preds(struct event_subsystem *system) -{ - mutex_lock(&filter_mutex); - __filter_free_subsystem_preds(system); - mutex_unlock(&filter_mutex); -} - -static int filter_add_pred_fn(struct ftrace_event_call *call, +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { struct event_filter *filter = call->filter; int idx, err; - if (filter->n_preds && !pred->compound) - __filter_disable_preds(call); - - if (filter->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; + } idx = filter->n_preds; filter_clear_pred(filter->preds[idx]); @@ -321,94 +444,132 @@ static int is_string_field(const char *type) return 0; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field->type) && (op != OP_EQ && op != 
OP_NE)) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; - field = find_event_field(call, pred->field_name); - if (!field) - return -EINVAL; - pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } + + field = find_event_field(call, pred->field_name); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); + return -EINVAL; + } + pred->offset = field->offset; + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + if (is_string_field(field->type)) { fn = filter_pred_string; pred->str_len = field->size; - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) + if (strict_strtoull(pred->str_val, 0, 
&val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; + } pred->val = val; } - switch (field->size) { - case 8: - fn = filter_pred_64; - break; - case 4: - fn = filter_pred_32; - break; - case 2: - fn = filter_pred_16; - break; - case 1: - fn = filter_pred_8; - break; - default: + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + + return filter_add_pred_fn(ps, call, pred, fn); } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) -{ - int err; - - mutex_lock(&filter_mutex); - err = __filter_add_pred(call, pred); - mutex_unlock(&filter_mutex); - - return err; -} - -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; - mutex_lock(&filter_mutex); - - if (filter && filter->n_preds && !pred->compound) { - __filter_free_subsystem_preds(system); - filter = NULL; - } - - if (!filter) { - system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!system->filter) { - mutex_unlock(&filter_mutex); - return -ENOMEM; - } - filter = system->filter; + if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!filter->preds) { - kfree(system->filter); - system->filter = NULL; - mutex_unlock(&filter_mutex); + if (!filter->preds) return -ENOMEM; - } } if (filter->n_preds == MAX_FILTER_PRED) { - mutex_unlock(&filter_mutex); + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - err = __filter_add_pred(call, 
pred); - if (err == -ENOMEM) { - filter->preds[filter->n_preds] = NULL; - filter->n_preds--; - mutex_unlock(&filter_mutex); + err = filter_add_pred(ps, call, pred); + if (err) { + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return err; } + replace_filter_string(call->filter, filter_string); } - mutex_unlock(&filter_mutex); - return 0; } -/* - * The filter format can be - * - 0, which means remove all filter preds - * - [||/&&] ==/!= - */ -int filter_parse(char **pbuf, struct filter_pred *pred) +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) { - char *tok, *val_str = NULL; - int tok_n = 0; + memset(ps, '\0', sizeof(*ps)); - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; - continue; - } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - tok_n = 2; - continue; - } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ - } - tok_n = 3; - continue; - } - if (tok_n == 3) { - val_str = tok; - break; /* done */ + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if 
(ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; + } + + return 0; +} + +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; } } - if (!val_str || !strlen(val_str) - || strlen(val_str) >= MAX_FILTER_STR_VAL) { - pred->field_name = NULL; + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int 
filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + 
return -EINVAL; + } + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); + continue; + } + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + continue; + } + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static 
int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; + } + n_normal_preds++; + } + + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; } - strcpy(pred->str_val, val_str); - pred->str_len = strlen(val_str); + return 0; +} - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) - return -ENOMEM; +static int replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + return -EINVAL; + } + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + continue; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; + } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + } return 0; } +int apply_event_filter(struct ftrace_event_call *call, char 
*filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} From a0e39ed378fb6ba916522764cd508fa7d42ad495 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Apr 2009 13:51:39 +0200 Subject: [PATCH 302/900] tracing: fix build failure on s390 "tracing: create automated trace defines" causes this compile error on s390, as reported by Sachin Sant against linux-next: 
kernel/built-in.o: In function `__do_softirq': (.text+0x1c680): undefined reference to `__tracepoint_softirq_entry' This happens because the definitions of the softirq tracepoints were moved from kernel/softirq.c to kernel/irq/handle.c. Since s390 doesn't support generic hardirqs handle.c doesn't get compiled and the definitions are missing. So move the tracepoints to softirq.c again. [ Impact: fix build failure on s390 ] Reported-by: Sachin Sant Signed-off-by: Heiko Carstens Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <20090429135139.5fac79b8@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 -- kernel/softirq.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 37c63633e78..e68bb5aebe0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,8 +18,6 @@ #include #include #include - -#define CREATE_TRACE_POINTS #include #include "internals.h" diff --git a/kernel/softirq.c b/kernel/softirq.c index 7ab9dfd8d08..d4ba347a872 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,6 +24,8 @@ #include #include #include + +#define CREATE_TRACE_POINTS #include #include From 50fa610a3b6ba7cf91d7a92229177dfaff2b81a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Apr 2009 15:01:38 +0100 Subject: [PATCH 303/900] sched: Document memory barriers implied by sleep/wake-up primitives Add a section to the memory barriers document to note the implied memory barriers of sleep primitives (set_current_state() and wrappers) and wake-up primitives (wake_up() and co.). Also extend the in-code comments on the wake_up() functions to note these implied barriers. 
[ Impact: add documentation ] Signed-off-by: David Howells Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20090428140138.1192.94723.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 129 +++++++++++++++++++++++++++++- kernel/sched.c | 23 ++++++ 2 files changed, 151 insertions(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f5b7127f54a..7f5809eddee 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -31,6 +31,7 @@ Contents: - Locking functions. - Interrupt disabling functions. + - Sleep and wake-up functions. - Miscellaneous functions. (*) Inter-CPU locking barrier effects. @@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some other means. +SLEEP AND WAKE-UP FUNCTIONS +--------------------------- + +Sleeping and waking on an event flagged in global data can be viewed as an +interaction between two pieces of data: the task state of the task waiting for +the event and the global data used to indicate the event. To make sure that +these appear to happen in the right order, the primitives to begin the process +of going to sleep, and the primitives to initiate a wake up imply certain +barriers. + +Firstly, the sleeper normally follows something like this sequence of events: + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (event_indicated) + break; + schedule(); + } + +A general memory barrier is interpolated automatically by set_current_state() +after it has altered the task state: + + CPU 1 + =============================== + set_current_state(); + set_mb(); + STORE current->state + + LOAD event_indicated + +set_current_state() may be wrapped by: + + prepare_to_wait(); + prepare_to_wait_exclusive(); + +which therefore also imply a general memory barrier after setting the state. 
+The whole sequence above is available in various canned forms, all of which +interpolate the memory barrier in the right place: + + wait_event(); + wait_event_interruptible(); + wait_event_interruptible_exclusive(); + wait_event_interruptible_timeout(); + wait_event_killable(); + wait_event_timeout(); + wait_on_bit(); + wait_on_bit_lock(); + + +Secondly, code that performs a wake up normally follows something like this: + + event_indicated = 1; + wake_up(&event_wait_queue); + +or: + + event_indicated = 1; + wake_up_process(event_daemon); + +A write memory barrier is implied by wake_up() and co. if and only if they wake +something up. The barrier occurs before the task state is cleared, and so sits +between the STORE to indicate the event and the STORE to set TASK_RUNNING: + + CPU 1 CPU 2 + =============================== =============================== + set_current_state(); STORE event_indicated + set_mb(); wake_up(); + STORE current->state + STORE current->state + LOAD event_indicated + +The available waker functions include: + + complete(); + wake_up(); + wake_up_all(); + wake_up_bit(); + wake_up_interruptible(); + wake_up_interruptible_all(); + wake_up_interruptible_nr(); + wake_up_interruptible_poll(); + wake_up_interruptible_sync(); + wake_up_interruptible_sync_poll(); + wake_up_locked(); + wake_up_locked_poll(); + wake_up_nr(); + wake_up_poll(); + wake_up_process(); + + +[!] Note that the memory barriers implied by the sleeper and the waker do _not_ +order multiple stores before the wake-up with respect to loads of those stored +values after the sleeper has called set_current_state(). 
For instance, if the +sleeper does: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) + break; + __set_current_state(TASK_RUNNING); + do_something(my_data); + +and the waker does: + + my_data = value; + event_indicated = 1; + wake_up(&event_wait_queue); + +there's no guarantee that the change to event_indicated will be perceived by +the sleeper as coming after the change to my_data. In such a circumstance, the +code on both sides must interpolate its own memory barriers between the +separate data accesses. Thus the above sleeper ought to do: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) { + smp_rmb(); + do_something(my_data); + } + +and the waker should do: + + my_data = value; + smp_wmb(); + event_indicated = 1; + wake_up(&event_wait_queue); + + MISCELLANEOUS FUNCTIONS ----------------------- @@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED? Under normal operation, memory operation reordering is generally not going to be a problem as a single-threaded linear piece of code will still appear to -work correctly, even if it's in an SMP kernel. There are, however, three +work correctly, even if it's in an SMP kernel. There are, however, four circumstances in which reordering definitely _could_ be a problem: (*) Interprocessor interaction. diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a..fd0c2cee3f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -5241,6 +5252,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5293,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5332,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5352,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
*/ void complete_all(struct completion *x) { From b1fca26631f76a5e8b18435a43f5d82b8734da4b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: [PATCH 304/900] mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function in which we take and hold a spin_lock if we drop the atomic to 0 this function takes and holds the mutex if we dec the atomic to 0. Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif From 0c8b946e3ebb3846103486420ea7430a4b5e5b1b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Apr 2009 17:48:18 +0200 Subject: [PATCH 305/900] vsprintf: introduce %pf format specifier A printf format specifier which would allow us to print a pure function name has been suggested by Andrew Morton a couple of months ago.
The current %pF is very convenient to print a function symbol, but often we only want to print the name of the function, without its asm offset. That's what %pf does in this patch. The lowercase f has been chosen for its intuitive meaning of a 'weak kind of %pF'. The support for this new format would be welcomed by the tracing code, where pure function names often need to be printed. This is also true for other parts of the kernel: $ git-grep -E "kallsyms_lookup\(.+?\)" arch/blackfin/kernel/traps.c: symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); arch/powerpc/xmon/xmon.c: name = kallsyms_lookup(pc, &size, &offset, NULL, tmpstr); arch/sh/kernel/cpu/sh5/unwind.c: sym = kallsyms_lookup(pc, NULL, &offset, NULL, namebuf); arch/x86/kernel/ftrace.c: kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); kernel/kprobes.c: sym = kallsyms_lookup((unsigned long)p->addr, NULL, kernel/lockdep.c: return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); kernel/trace/ftrace.c: kallsyms_lookup(*ptr, NULL, NULL, NULL, str); kernel/trace/trace_functions.c: kallsyms_lookup(ip, NULL, NULL, NULL, str); kernel/trace/trace_output.c: kallsyms_lookup(address, NULL, NULL, NULL, str); Changes in v2: - Add the explanation of the %pf role for vsnprintf() and bstr_printf() - Change the comments by dropping the "asm offset" notion and only define the %pf against the actual function offset notion.
Signed-off-by: Frederic Weisbecker Acked-by: Mike Frysinger Cc: Linus Torvalds Cc: Zhaolei Cc: Tom Zanussi Cc: Li Zefan Cc: Andrew Morton Cc: Steven Rostedt LKML-Reference: <20090415154817.GC5989@nowhere> Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index b56f6d039d2..756ccafa9ce 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -575,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec) } static char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec) + struct printf_spec spec, char ext) { unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - sprint_symbol(sym, value); + if (ext != 'f') + sprint_symbol(sym, value); + else + kallsyms_lookup(value, NULL, NULL, NULL, sym); return string(buf, end, sym, spec); #else spec.field_width = 2*sizeof(void *); @@ -692,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, * * Right now we handle: * - * - 'F' For symbolic function descriptor pointers + * - 'F' For symbolic function descriptor pointers with offset + * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers * - 'R' For a struct resource pointer, it prints the range of * addresses (not the name nor the flags) @@ -715,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, switch (*fmt) { case 'F': + case 'f': ptr = dereference_function_descriptor(ptr); /* Fallthrough */ case 'S': - return symbol_string(buf, end, ptr, spec); + return symbol_string(buf, end, ptr, spec, *fmt); case 'R': return resource_string(buf, end, ptr, spec); case 'm': @@ -954,7 +959,8 @@ qualifier: * * This function follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol - * %pF output the name of a function pointer + * %pF output the name of a function pointer 
with its offset + * %pf output the name of a function pointer without its offset * %pR output the address range in a struct resource * * The return value is the number of characters which would @@ -1412,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf); * * The format follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol - * %pF output the name of a function pointer + * %pF output the name of a function pointer with its offset + * %pf output the name of a function pointer without its offset * %pR output the address range in a struct resource * %n is ignored * From 23b94b967f118bef941369238f33c8140be46539 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 29 Apr 2009 21:54:51 +0100 Subject: [PATCH 306/900] locking, rtmutex.c: Documentation cleanup Two minor updates on functions documentation: - Updated documentation for function rt_mutex_unlock(), which contained an incorrect name - Removed extra '*' from comment in function rt_mutex_destroy() [ Impact: cleanup ] Signed-off-by: Luis Henriques Cc: Steven Rostedt LKML-Reference: <20090429205451.GA23154@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..013882e8349 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -864,9 +864,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller * * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) @@ -913,7 +913,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the 
mutex to be destroyed * From a511e3f968c462a55ef58697257f5347c73d306e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 29 Apr 2009 15:59:58 -0700 Subject: [PATCH 307/900] mutex: add atomic_dec_and_mutex_lock(), fix include/linux/mutex.h:136: warning: 'mutex_lock' declared inline after being called include/linux/mutex.h:136: warning: previous declaration of 'mutex_lock' was here uninline it. [ Impact: clean up and uninline, address compiler warning ] Signed-off-by: Andrew Morton Cc: Al Viro Cc: Christoph Hellwig Cc: Eric Paris Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <200904292318.n3TNIsi6028340@imap1.linux-foundation.org> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 24 +----------------------- kernel/mutex.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 93054fc3635..878cab4f5fc 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -150,28 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); - -/** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec - * @lock: the mutex to return holding if we dec to 0 - * - * return true and hold lock if we dec to 0, return false otherwise - */ -static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) -{ - /* dec if we can't possibly hit 0 */ - if (atomic_add_unless(cnt, -1, 1)) - return 0; - /* we might hit 0, so take the lock */ - mutex_lock(lock); - if (!atomic_dec_and_test(cnt)) { - /* when we actually did the dec, we didn't hit 0 */ - mutex_unlock(lock); - return 0; - } - /* we hit 0, and we hold the lock */ - return 1; -} +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..e2d25e9e62a 100644 --- 
a/kernel/mutex.c +++ b/kernel/mutex.c @@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); From 2b72394e4089643f11669d9610907a1442fe044a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 28 Apr 2009 16:00:49 +0300 Subject: [PATCH 308/900] x86: move max_pfn_mapped and max_low_pfn_mapped to setup.c This patch moves the max_pfn_mapped and max_low_pfn_mapped global variables to kernel/setup.c where they're initialized. [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1240923649.1982.21.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++++++ arch/x86/mm/init_32.c | 4 +--- arch/x86/mm/init_64.c | 8 -------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..0d77e56e821 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. 
+ */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b27120665b..a640a7f0490 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,11 +49,9 @@ #include #include #include +#include #include -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a4e7846efb1..1016ea01593 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -50,14 +50,6 @@ #include #include -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); From 9518e0e4350a5ea8ca200ce320b28d6284a7b0ce Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 28 Apr 2009 16:00:50 +0300 Subject: [PATCH 309/900] x86: move per-cpu mmu_gathers to mm/init.c [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1240923650.1982.22.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 +++ arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fedde5359a0..4d67c33a2e1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -9,6 +9,9 @@ #include #include #include +#include + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 
a640a7f0490..fef1d90d4f1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -52,7 +52,6 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1016ea01593..6a1a573e20f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,8 +52,6 @@ static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; From ba9c22f2c01cf5c88beed5a6b9e07d42e10bd358 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 20 Apr 2009 22:22:22 -0700 Subject: [PATCH 310/900] futex: remove FUTEX_REQUEUE_PI (non CMP) The new requeue PI futex op codes were modeled after the existing FUTEX_REQUEUE and FUTEX_CMP_REQUEUE calls. I was unaware at the time that FUTEX_REQUEUE was only around for compatibility reasons and shouldn't be used in new code. Ulrich Drepper elaborates on this in his Futexes are Tricky paper: http://people.redhat.com/drepper/futex.pdf. The deprecated call doesn't catch changes to the futex corresponding to the destination futex which can lead to deadlock. Therefore, I feel it best to remove FUTEX_REQUEUE_PI and leave only FUTEX_CMP_REQUEUE_PI as there are not yet any existing users of the API. This patch does change the OP code value of FUTEX_CMP_REQUEUE_PI to 12 from 13. Since my test case is the only known user of this API, I felt this was the right thing to do, rather than leave a hole in the enumeration. I chose to continue using the _CMP_ modifier in the OP code to make it explicit to the user that the test is being done. Builds, boots, and ran several hundred iterations of requeue_pi.c.
Signed-off-by: Darren Hart LKML-Reference: <49ED580E.1050502@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/futex.h | 4 +--- kernel/futex.c | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index b05519ca9e5..34956c8fdeb 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -24,8 +24,7 @@ union ktime; #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 -#define FUTEX_REQUEUE_PI 12 -#define FUTEX_CMP_REQUEUE_PI 13 +#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -43,7 +42,6 @@ union ktime; #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) -#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) diff --git a/kernel/futex.c b/kernel/futex.c index 6d2daa46f9f..aec8bf89bf4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2555,9 +2555,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, clockrt, uaddr2); break; - case FUTEX_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); - break; case FUTEX_CMP_REQUEUE_PI: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 1); @@ -2596,8 +2593,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 
*/ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); From 83c4832683bc8ebcd1687b3c0bf3ba1ab253dd4f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Thu, 30 Apr 2009 12:03:16 +0200 Subject: [PATCH 311/900] x86: boot/compressed/vmlinux.lds.S: fix build of bzImage with 64 bit compiler Jesper reported that he saw following build issue: > ld:arch/x86/boot/compressed/vmlinux.lds:9: syntax error > make[2]: *** [arch/x86/boot/compressed/vmlinux] Error 1 > make[1]: *** [arch/x86/boot/compressed/vmlinux] Error 2 > make: *** [bzImage] Error 2 CPP defines the symbol "i386" to "1". Undefine this to fix it. [ Impact: build fix with certain tool chains ] Reported-by: Jesper Dangaard Brouer Signed-off-by: Sam Ravnborg Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index ffcb19134bf..0d26c92d3c7 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,5 +1,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +#undef i386 + #ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) From 30b4ae8a4498543863501f707879b7220b649602 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:01 +0000 Subject: [PATCH 312/900] signals: split do_tkill Split out the code from do_tkill to make it reusable by the follow up patch which implements sys_rt_tgsigqueueinfo Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov --- kernel/signal.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c 
index d8034737db4..56d27acad87 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -static int do_tkill(pid_t tgid, pid_t pid, int sig) +static int +do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { - int error; - struct siginfo info; struct task_struct *p; unsigned long flags; - - error = -ESRCH; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { - error = check_kill_permission(sig, &info, p); + error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. @@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * signal is private anyway. */ if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, &info, p); + error = specific_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) return error; } +static int do_tkill(pid_t tgid, pid_t pid, int sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = task_tgid_vnr(current); + info.si_uid = current_uid(); + + return do_send_specific(tgid, pid, sig, &info); +} + /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread From 62ab4505e3efaf67784f84059e0fb9cedb1728ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:06 +0000 Subject: [PATCH 313/900] signals: implement sys_rt_tgsigqueueinfo sys_kill has the per thread counterpart sys_tgkill. sigqueueinfo is missing a thread directed counterpart. 
Such an interface is important for migrating applications from other OSes which have the per thread delivery implemented. Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: Ulrich Drepper --- include/linux/compat.h | 2 ++ include/linux/signal.h | 2 ++ kernel/compat.c | 11 +++++++++++ kernel/signal.c | 26 ++++++++++++++++++++++++++ 4 files changed, 41 insertions(+) diff --git a/include/linux/compat.h b/include/linux/compat.h index f2ded21f9a3..af931ee43dd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from); int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); +long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo); static inline int compat_timeval_compare(struct compat_timeval *lhs, struct compat_timeval *rhs) diff --git a/include/linux/signal.h b/include/linux/signal.h index 84f997f8aa5..c7552836bd9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig) extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); +extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, + siginfo_t *info); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/compat.c b/kernel/compat.c index 42d56544460..f6c204f07ea 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } +asmlinkage long 
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 56d27acad87..f79b3b9f837 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2369,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return kill_proc_info(sig, &info, pid); } +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + /* Not even root can pretend to send signals from the kernel. + Nor can they impersonate a kill(), which adds source info. */ + if (info->si_code >= 0) + return -EPERM; + info->si_signo = sig; + + return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; From 12d161147f828192b5bcc33166f468a827832767 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:10 +0000 Subject: [PATCH 314/900] x86: hookup sys_rt_tgsigqueueinfo Make the new sys_rt_tgsigqueueinfo available for x86. 
Signed-off-by: Thomas Gleixner --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 5 insertions(+) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a505202086e..dcef387ddc3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -830,4 +830,5 @@ ia32_sys_call_table: .quad sys_inotify_init1 .quad compat_sys_preadv .quad compat_sys_pwritev + .quad compat_sys_rt_tgsigqueueinfo /* 335 */ ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6e72d74cf8d..708dae61262 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -340,6 +340,7 @@ #define __NR_inotify_init1 332 #define __NR_preadv 333 #define __NR_pwritev 334 +#define __NR_rt_tgsigqueueinfo 335 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index f8182946232..4e2b0540440 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -657,6 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1) __SYSCALL(__NR_preadv, sys_preadv) #define __NR_pwritev 296 __SYSCALL(__NR_pwritev, sys_pwritev) +#define __NR_rt_tgsigqueueinfo 297 +__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #ifndef __NO_STUBS diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b49..734f92c02dd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,4 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long sys_rt_tgsigqueueinfo /* 335 */ From bf293c17b26b8854241df08b9b63f7270cbde012 Mon Sep 17 00:00:00 2001 From: Remis Lima Baima Date: Thu, 30 Apr 2009 18:36:23 +0200 Subject: [PATCH 315/900] x86: added 'ifndef _ASM_X86_IOMAP_H' to iomap.h iomap.h misses the include guards. 
[ Impact: cleanup ] Signed-off-by: Remis Lima Baima Signed-off-by: Arnd Bergmann LKML-Reference: <200904301836.23885.arnd@arndb.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iomap.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 86af26091d6..0e9fe1d9d97 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_IOMAP_H +#define _ASM_X86_IOMAP_H + /* * Copyright © 2008 Ingo Molnar * @@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); + +#endif /* _ASM_X86_IOMAP_H */ From 56afb0f8823650f53a5f0e96d69a282e8892c61b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:36 -0400 Subject: [PATCH 316/900] kerneldoc, tracing: make kernel-doc understand TRACE_EVENT() macro (take #2) Add support to kernel-doc for tracepoint comments above TRACE_EVENT() macro definitions. Paves the way for tracepoint docbook. [ Impact: extend DocBook infrastructure ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: Signed-off-by: Ingo Molnar --- scripts/kernel-doc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 0f11870116d..2b53a55fbec 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1827,6 +1827,25 @@ sub reset_state { $state = 0; } +sub tracepoint_munge($) { + my $file = shift; + my $tracepointname = 0; + my $tracepointargs = 0; + + if($prototype =~ m/TRACE_EVENT\((.*?),/) { + $tracepointname = $1; + } + if($prototype =~ m/TP_PROTO\((.*?)\)/) { + $tracepointargs = $1; + } + if (($tracepointname eq 0) || ($tracepointargs eq 0)) { + print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". 
+ "$prototype\n"; + } else { + $prototype = "static inline void trace_$tracepointname($tracepointargs)"; + } +} + sub syscall_munge() { my $void = 0; @@ -1881,6 +1900,9 @@ sub process_state3_function($$) { if ($prototype =~ /SYSCALL_DEFINE/) { syscall_munge(); } + if ($prototype =~ /TRACE_EVENT/) { + tracepoint_munge($file); + } dump_function($prototype, $file); reset_state(); } From a76f8c6da1e48fd4ef025f42c736389532ff30ba Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:42 -0400 Subject: [PATCH 317/900] tracing: add new tracepoints docbook Add tracepoint docbook. This will help us document and understand what tracepoints are in the kernel. Since there are multiple macros, and files that contain tracepoints. [ Impact: add documentation ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: <84160b6bd94aff02455da7e12bad054d34c579a0.1241107197.git.jbaron@redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/Makefile | 3 +- Documentation/DocBook/tracepoint.tmpl | 84 +++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 Documentation/DocBook/tracepoint.tmpl diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 8918a32c6b3..4c8f4d6e114 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ mac80211.xml debugobjects.xml sh.xml regulator.xml \ - alsa-driver-api.xml writing-an-alsa-driver.xml + alsa-driver-api.xml writing-an-alsa-driver.xml \ + tracepoint.xml ### # The build process is as follows (targets): diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl new file mode 
100644 index 00000000000..70891bc6849 --- /dev/null +++ b/Documentation/DocBook/tracepoint.tmpl @@ -0,0 +1,84 @@ + + + + + + The Linux Kernel Tracepoint API + + + + Jason + Baron + +
+ jbaron@redhat.com +
+
+
+
+ + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + Introduction + + Tracepoints are static probe points that are located in strategic points + throughout the kernel. 'Probes' register/unregister with tracepoints + via a callback mechanism. The 'probes' are strictly typed functions that + are passed a unique set of parameters defined by each tracepoint. + + + + From this simple callback mechanism, 'probes' can be used to profile, debug, + and understand kernel behavior. There are a number of tools that provide a + framework for using 'probes'. These tools include Systemtap, ftrace, and + LTTng. + + + + Tracepoints are defined in a number of header files via various macros. Thus, + the purpose of this document is to provide a clear accounting of the available + tracepoints. The intention is to understand not only what tracepoints are + available but also to understand where future tracepoints might be added. + + + + The API presented has functions of the form: + trace_tracepointname(function parameters). These are the + tracepoints callbacks that are found throughout the code. Registering and + unregistering probes with these callback sites is covered in the + Documentation/trace/* directory. + + + +
From 9ee1983c9aa18f12388ef660d0c76a23dc112959 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:47 -0400 Subject: [PATCH 318/900] tracing: add irq tracepoint documentation Document irqs for the newly created docbook. [ Impact: add documentation ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: <73ff42be3420157667ec548e9b0e409c3cfad05f.1241107197.git.jbaron@redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/tracepoint.tmpl | 5 +++ include/trace/events/irq.h | 46 ++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl index 70891bc6849..b0756d0fd57 100644 --- a/Documentation/DocBook/tracepoint.tmpl +++ b/Documentation/DocBook/tracepoint.tmpl @@ -81,4 +81,9 @@ + + IRQ +!Iinclude/trace/events/irq.h + + diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 76868646751..32a9f7ef432 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,8 +7,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -/* - * Tracepoint for entry of interrupt handler: +/** + * irq_handler_entry - called immediately before the irq action handler + * @irq: irq number + * @action: pointer to struct irqaction + * + * The struct irqaction pointed to by @action contains various + * information about the handler, including the device name, + * @action->name, and the device id, @action->dev_id. When used in + * conjunction with the irq_handler_exit tracepoint, we can figure + * out irq handler latencies. 
*/ TRACE_EVENT(irq_handler_entry, @@ -29,8 +37,16 @@ TRACE_EVENT(irq_handler_entry, TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name)) ); -/* - * Tracepoint for return of an interrupt handler: +/** + * irq_handler_exit - called immediately after the irq action handler returns + * @irq: irq number + * @action: pointer to struct irqaction + * @ret: return value + * + * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding + * @action->handler successfully handled this irq. Otherwise, the irq might be + * a shared irq line, or the irq was not handled successfully. Can be used in + * conjunction with the irq_handler_entry to understand irq handler latencies. */ TRACE_EVENT(irq_handler_exit, @@ -52,6 +68,17 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? "handled" : "unhandled") ); +/** + * softirq_entry - called immediately before the softirq handler + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the struct softirq_action + * which has a pointer to the action handler that is called. By subtracting + * the @vec pointer from the @h pointer, we can determine the softirq + * number. Also, when used in combination with the softirq_exit tracepoint + * we can determine the softirq latency. + */ TRACE_EVENT(softirq_entry, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), @@ -71,6 +98,17 @@ TRACE_EVENT(softirq_entry, TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) ); +/** + * softirq_exit - called immediately after the softirq handler returns + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the struct softirq_action + * that has handled the softirq. By subtracting the @vec pointer from + * the @h pointer, we can determine the softirq number.
Also, when used in + * combination with the softirq_entry tracepoint we can determine the softirq + * latency. + */ TRACE_EVENT(softirq_exit, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Apr 2009 01:17:50 -0700 Subject: [PATCH 319/900] x86/irq: use move_irq_desc() in create_irq_nr() move_irq_desc() will try to move irq_desc to the home node if the allocated one is not correct, in create_irq_nr(). ( This can happen on devices that are on different nodes that are using MSI, when drivers are loaded and unloaded randomly. ) v2: fix non-smp build v3: add NUMA_IRQ_DESC to eliminate #ifdefs [ Impact: improve irq descriptor locality on NUMA systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F95EAE.2050903@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/kernel/apic/io_apic.c | 6 +----- include/linux/irq.h | 11 +++++++++-- kernel/irq/Makefile | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e1b2543f8ed..674e21e9f0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -274,6 +274,10 @@ config SPARSE_IRQ If you don't know what to do here, say N. 
+config NUMA_IRQ_DESC + def_bool y + depends on SPARSE_IRQ && NUMA + config X86_MPPARSE bool "Enable MPS table" if ACPI default y diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9cd4806cdf5..e583291fe6c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3197,11 +3197,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (cfg_new->vector != 0) continue; -#ifdef CONFIG_NUMA_IRQ_DESC - /* different node ?*/ - if (desc_new->node != node) - desc = move_irq_desc(desc, node); -#endif + desc_new = move_irq_desc(desc_new, node); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; diff --git a/include/linux/irq.h b/include/linux/irq.h index 4b95ddb5304..eedbb8e5e0c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -206,9 +206,16 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; -#else /* CONFIG_SPARSE_IRQ */ +#endif + +#ifdef CONFIG_NUMA_IRQ_DESC extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); -#endif /* CONFIG_SPARSE_IRQ */ +#else +static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) +{ + return desc; +} +#endif extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2f065277f8e..7d047808419 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o From 6f0aced639d346e5f54eea9fcb2784b633493d09 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 1 May 2009 23:54:25 +0400 Subject: [PATCH 320/900] 
x86, apic: use pr_ macro Replace recently appeared printk with pr_ macro (the file already uses a lot of them). [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090501195425.GB4633@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 28f747d61d7..e258bedce7c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2191,7 +2191,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d) { if (multi) return 0; - printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident); + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); multi = 1; return 0; } From a454ab3110175d710f4f9a96226a26ce4d5d5de2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 3 May 2009 10:09:03 +0200 Subject: [PATCH 321/900] x86, mm: fault.c, use printk_once() in is_errata93() Andrew pointed out that the 'once' variable has a needlessly function-global scope. We can in fact eliminate it completely, via the use of printk_once().
[ Impact: cleanup ] Reported-by: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 24a36a6426a..b9ca6d767db 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -514,8 +514,6 @@ bad: static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int once; - if (address != regs->ip) return 0; @@ -525,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!once) { - printk(errata93_warning); - once = 1; - } + printk_once(errata93_warning); regs->ip = address; return 1; } From 1cbac972ba28e706fa9ce4d4c81830040bc811ee Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 2 May 2009 13:39:56 +0400 Subject: [PATCH 322/900] x86: uv io-apic - use BUILD_BUG_ON instead of BUG_ON The expression is known to be true/false at compilation time so we're allowed to use build-time instead of run-time check. Also align 'entry' items assignment. 
[ Impact: shrink kernel a bit, cleanup ] Signed-off-by: Cyrill Gorcunov Cc: Jack Steiner LKML-Reference: <20090502093956.GB4791@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8aef5f9d947..a80335ba12c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3749,6 +3749,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3762,15 +3764,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3788,10 +3788,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = 
uv_blade_to_pnode(mmr_blade); From 3969c52d4d2fef5a4b9e3ab0e51b3901e1cc8b83 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 3 May 2009 11:11:35 +0530 Subject: [PATCH 323/900] x86: cpufeature.h fix name for X86_FEATURE_MCE X86_FEATURE_MCE = Machine Check Exception X86_FEATURE_MCA = Machine Check Architecture [ Impact: cleanup ] Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1241329295.6321.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bb83b1c397a..ccc1061b8b2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,7 +22,7 @@ #define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ #define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ #define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ +#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ #define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ #define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ From 9a8709d44139748fe2e0ab56d20d8c384c8b65ad Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 2 May 2009 00:25:11 +0400 Subject: [PATCH 324/900] x86: uv - prevent NULL dereference in uv_system_init() We may reach NULL dereference oops if kmalloc failed. Prevent it with explicit BUG_ON. 
[ Impact: more controlled assert in 'impossible' scenario ] Signed-off-by: Cyrill Gorcunov Acked-by: Jack Steiner LKML-Reference: <20090501202511.GE4633@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 873bf7121e8..9d9e2281a82 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -569,15 +569,18 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); memset(uv_node_to_blade, 255, bytes); bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); memset(uv_cpu_to_blade, 255, bytes); blade = 0; From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 01:15:24 -0400 Subject: [PATCH 325/900] ring-buffer: export symbols I'm adding a module to do a series of tests on the ring buffer as well as benchmarks. This module needs to have more of the ring buffer API exported. There's nothing wrong with reading the ring buffer from a module. 
[ Impact: allow modules to read pages from the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4cc59040eb..3e86da9b2a0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer @@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 13:43:37 -0400 Subject: [PATCH 326/900] ring-buffer: add counters for commit overrun and nmi dropped entries The WARN_ON in the ring buffer when a commit is preempted and the buffer is filled by preceding writes can happen in normal operations. The WARN_ON makes it look like a bug, not to mention, because it does not stop tracing and calls printk which can also recurse, this is prone to deadlock (the WARN_ON is not in a position to recurse). This patch removes the WARN_ON and replaces it with a counter that can be retrieved by a tracer. This counter is called commit_overrun. While at it, I added a nmi_dropped counter to count any time an NMI entry is dropped because the NMI could not take the spinlock. 
[ Impact: prevent deadlock by printing normal case warning ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 52 ++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1c2f80911fb..f1345828c7c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -153,6 +153,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3e86da9b2a0..26e1359fe19 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; unsigned long entries; u64 write_stamp; @@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * simply fail. */ if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; goto out_reset; + } } else __raw_spin_lock(&cpu_buffer->lock); @@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. 
*/ if (unlikely(next_page == commit_page)) { - /* This can easily happen on small ring buffers */ - WARN_ON_ONCE(buffer->pages > 2); + cpu_buffer->commit_overrun++; goto out_reset; } @@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); +/** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer @@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 18:03:45 -0400 Subject: [PATCH 327/900] tracing: export stats of ring buffers to userspace This patch adds stats 
to the ftrace ring buffers: # cat /debugfs/tracing/per_cpu/cpu0/stats entries: 42360 overrun: 30509326 commit overrun: 0 nmi dropped: 0 Where entries are the total number of data entries in the buffer. overrun is the number of entries not consumed and were overwritten by the writer. commit overrun is the number of entries dropped due to nested writers wrapping the buffer before the initial writer finished the commit. nmi dropped is the number of entries dropped due to the ring buffer lock being held when an nmi was going to write to the ring buffer. Note, this field will be meaningless and will go away when the ring buffer becomes lockless. [ Impact: let userspace know what is happening in the ring buffers ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5427e0fc98..74df029056b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = { .llseek = no_llseek, }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_ATOMIC); + if (!s) + return ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); + trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct 
file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu) trace_create_file("trace_pipe_raw", 0444, d_cpu, (void *) cpu, &tracing_buffers_fops); + + trace_create_file("stats", 0444, d_cpu, + (void *) cpu, &tracing_stats_fops); } #ifdef CONFIG_FTRACE_SELFTEST From 60aa605dfce2976e54fa76e805ab0f221372d4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:21 +0200 Subject: [PATCH 328/900] sched: rt: document the risk of small values in the bandwidth settings Thomas noted that we should disallow sysctl_sched_rt_runtime == 0 for (!RT_GROUP) since the root group always has some RT tasks in it. Further, update the documentation to inspire clue. [ Impact: exclude corner-case sysctl_sched_rt_runtime value ] Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: <20090505155436.863098054@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-rt-group.txt | 18 ++++++++++++++++++ kernel/sched.c | 7 +++++++ 2 files changed, 25 insertions(+) diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 5ba4d3fc625..eb74b014a3f 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -4,6 +4,7 @@ CONTENTS ======== +0. WARNING 1. Overview 1.1 The problem 1.2 The solution @@ -14,6 +15,23 @@ CONTENTS 3. Future plans +0. WARNING +========== + + Fiddling with these settings can result in an unstable system, the knobs are + root only and assumes root knows what he is doing. + +Most notable: + + * very small values in sched_rt_period_us can result in an unstable + system when the period is smaller than either the available hrtimer + resolution, or the time it takes to handle the budget refresh itself. 
+ + * very small values in sched_rt_runtime_us can result in an unstable + system when the runtime is so small the system has difficulty making + forward progress (NOTE: the migration thread and kstopmachine both + are real-time processes). + 1. Overview =========== diff --git a/kernel/sched.c b/kernel/sched.c index 54d67b94f1a..2a43a581ead 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9917,6 +9917,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 20:49:44 -0400 Subject: [PATCH 329/900] ring-buffer: convert cpu buffer entries to local_t The entries counter in cpu buffer is not atomic. It can be updated by other interrupts or from another CPU (readers). But making entries into "atomic_t" causes an atomic operation that can hurt performance. Instead we convert it to a local_t that will increment a counter with a local CPU atomic operation (if the arch supports it). Instead of fighting with readers and overwrites that decrement the counter, I added a "read" counter. Every time a reader reads an entry it is incremented. We already have a overrun counter and with that, the entries counter and the read counter, we can calculate the total number of entries in the buffer with: (entries - overrun) - read As long as the total number of entries in the ring buffer is less than the word size, this will work. But since the entries counter was previously a long, this is no different than what we had before. Thanks to Andrew Morton for pointing out in the first version that atomic_t does not replace unsigned long. 
I switched to atomic_long_t even though it is signed. A negative count is most likely a bug. [ Impact: keep accurate count of cpu buffer entries ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 26e1359fe19..c792ea893b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,8 @@ struct ring_buffer_per_cpu { unsigned long nmi_dropped; unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; - cpu_buffer->entries--; } } @@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); /* Only process further if we own the commit */ if (!rb_is_commit(cpu_buffer, event)) @@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * The commit is still visible by the reader, so we * must increment entries. 
*/ - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); out: /* * If a write came in and pushed the tail page @@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; return ret; } @@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX || rb_discarded_event(event)) - cpu_buffer->entries--; + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->nmi_dropped = 0; cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, /* Only count data entries */ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; - cpu_buffer->entries--; + cpu_buffer->read++; } __raw_spin_unlock(&cpu_buffer->lock); } From 41c51c98f588edcdf6141cff1895df738e03ddd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 May 2009 23:11:18 +0200 Subject: [PATCH 330/900] rcu: rcu_sched_grace_period(): kill the bogus flush_signals() As a kernel thread, rcu_sched_grace_period() runs with all signals ignored. 
It can never receive a signal even if it sleeps in TASK_INTERRUPTIBLE, it needs the explicit allow_signal() to be visible for signals. [ Impact: reduce kernel size, remove dead code ] Signed-off-by: Oleg Nesterov Reviewed-by: Paul E. McKenney Cc: Andrew Morton LKML-Reference: <20090503211118.GA22973@redhat.com> Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ce97a4df64d..beb0e659adc 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg) rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; + ret = 0; /* unused */ __wait_event_interruptible(rcu_ctrlblk.sched_wq, rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, ret); - /* - * Signals would prevent us from sleeping, and we cannot - * do much with them in any case. So flush them. - */ - if (ret) - flush_signals(current); couldsleepnext = 0; } while (!kthread_should_stop()); From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 18:44:45 -0400 Subject: [PATCH 331/900] ring-buffer: record page entries in buffer page descriptor Currently, when the ring buffer writer overflows the buffer and must write over non consumed data, we increment the overrun counter by reading the entries on the page we are about to overwrite. This reads the entries one by one. This is not very efficient. This patch adds another entry counter into each buffer page descriptor that keeps track of the number of entries on the page. Now on overwrite, the overrun counter simply needs to add the number of entries that is on the page it is about to overwrite.
[ Impact: speed up of ring buffer in overwrite mode ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 39 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c792ea893b0..342eacc4baa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -321,9 +321,10 @@ struct buffer_data_page { }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->overrun++; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* tail_page has not moved yet? 
*/ if (tail_page == cpu_buffer->tail_page) { /* count overflows */ - rb_update_overflow(cpu_buffer); + cpu_buffer->overrun += + local_read(&head_page->entries); rb_inc_page(cpu_buffer, &head_page); cpu_buffer->head_page = head_page; @@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; @@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* * If this is a commit and the tail is zero, then update * this page's time stamp. @@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; @@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); + 
local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 19:40:05 -0400 Subject: [PATCH 332/900] ring-buffer: have read page swap increment counter with page entries In the swap page ring buffer code that is used by the ftrace splice code, we scan the page to increment the counter of entries read. With the number of entries already in the page we simply need to add it. [ Impact: speed up reading page from ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 342eacc4baa..9e42a742a3f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2785,28 +2785,6 @@ out: } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage, - unsigned int offset) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = offset; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->read++; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. 
@@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* we copied everything to the beginning */ read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); bpage = reader->page; @@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; - - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); } ret = read; From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 20:26:54 -0400 Subject: [PATCH 333/900] ring-buffer: disable writers when resetting buffers As a precaution, it is best to disable writing to the ring buffers when resetting them. [ Impact: prevent weird things if write happens during reset ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9e42a742a3f..7876df00695 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); From 31b6e76e21b2ffd3cb2f6fe4149790a9fdadce2d Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Thu, 30 Apr 2009 20:06:11 -0400 Subject: [PATCH 334/900] ftrace: use .sched.text, not .text.sched in recordmcount.pl The only references in the kernel to the .text.sched section are in
recordmcount.pl. Since the code it has is intended to be example code it should refer to real kernel sections. So change it to .sched.text instead. [ Impact: consistency in comments ] Signed-off-by: Tim Abbott LKML-Reference: <1241136371-10768-1-git-send-email-tabbott@mit.edu> Acked-by: Sam Ravnborg Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 409596eca12..0fae7da0529 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -26,7 +26,7 @@ # which will also be the location of that section after final link. # e.g. # -# .section ".text.sched" +# .section ".sched.text", "ax" # .globl my_func # my_func: # [...] @@ -39,7 +39,7 @@ # [...] # # Both relocation offsets for the mcounts in the above example will be -# offset from .text.sched. If we make another file called tmp.s with: +# offset from .sched.text. If we make another file called tmp.s with: # # .section __mcount_loc # .quad my_func + 0x5 @@ -51,7 +51,7 @@ # But this gets hard if my_func is not globl (a static function). # In such a case we have: # -# .section ".text.sched" +# .section ".sched.text", "ax" # my_func: # [...] # call mcount (offset: 0x5) From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 19:22:53 -0400 Subject: [PATCH 335/900] tracing: use proper export symbol for tracing api When adding the EXPORT_SYMBOL to some of the tracing API, I accidentally used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes that mistake.
[ Impact: export the tracing code only for GPL modules ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 74df029056b..4164a344e72 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } -EXPORT_SYMBOL(trace_current_buffer_lock_reserve); +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } -EXPORT_SYMBOL(trace_current_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } -EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 21:16:11 -0400 Subject: [PATCH 336/900] ring-buffer: move big if statement down In the hot path of the ring buffer "__rb_reserve_next" there's a big if statement that does not even return back to the work flow. code; if (cross to next page) { [ lots of code ] return; } more code; The condition is even the unlikely path, although we do not denote it with an unlikely because gcc is fine with it. The condition is true when the write crosses a page boundary, and we need to start at a new page. 
Having this if statement makes it hard to read, but calling another function to do the work is also not appropriate, because we are using a lot of variables that were set before the if statement, and we do not want to send them as parameters. This patch changes it to a goto: code; if (cross to next page) goto next_page; more code; return; next_page: [ lots of code] This makes the code easier to understand, and a bit more obvious. The output from gcc is practically identical. For some reason, gcc decided to use different registers when I switched it to a goto. But other than that, the logic is the same. [ Impact: easier to read code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 224 +++++++++++++++++++------------------ 1 file changed, 114 insertions(+), 110 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7876df00695..424129eb20a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; + struct buffer_page *next_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; @@ -1173,116 +1174,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; - - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). 
- * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - - rb_inc_page(cpu_buffer, &next_page); - - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; - goto out_reset; - } - - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); - - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } - } - - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; - } - - /* - * The actual tail page has moved forward. 
- */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } - - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); - - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } - - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); - - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); - } + if (write > BUF_PAGE_SIZE) + goto next_page; /* We reserved something on the buffer */ @@ -1305,6 +1198,117 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; + next_page: + + next_page = tail_page; + + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. 
+ */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; + } + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; + + rb_inc_page(cpu_buffer, &next_page); + + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; + + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; + } + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); + + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } + + /* + * The actual tail page has moved forward. 
+ */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + rb_event_set_padding(event); + } + + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + out_reset: /* reset write */ if (tail <= BUF_PAGE_SIZE) From c898faf91b3ec6b0f6efa35831b3984fa3331db0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 5 May 2009 17:28:56 -0400 Subject: [PATCH 337/900] x86: 46 bit physical address support on 64 bits Extend the maximum addressable memory on x86-64 from 2^44 to 2^46 bytes. This requires some shuffling around of the vmalloc and virtual memmap memory areas, to keep them away from the direct mapping of up to 64TB of physical memory. This patch also introduces a guard hole between the vmalloc area and the virtual memory map space. There's really no good reason why we wouldn't have a guard hole there. [ Impact: future hardware enablement ] Signed-off-by: Rik van Riel LKML-Reference: <20090505172856.6820db22@cuia.bos.redhat.com> Signed-off-by: H. 
Peter Anvin --- Documentation/x86/x86_64/mm.txt | 9 +++++---- arch/x86/include/asm/page_64_types.h | 2 +- arch/x86/include/asm/pgtable_64_types.h | 8 ++++---- arch/x86/include/asm/sparsemem.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 29b52b14d0b..53941323584 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory -ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole -ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space -ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) +ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory +ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole +ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space +ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole +ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 3f587188ae6..6fadb020bd2 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,7 +47,7 @@ #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 48 diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index fbf42b8e038..766ea16fbbb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc20000000000, UL) -#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index e3cc3c063ec..4517d6b9318 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ +# define MAX_PHYSMEM_BITS 46 #endif #endif /* CONFIG_SPARSEMEM */ From 2feceeff1e771850e49f9074307f071964fd9e3e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 19:07:07 -0700 Subject: [PATCH 338/900] x86: fix typo in address space documentation Fix a trivial typo in Documentation/x86/x86_64/mm.txt. [ Impact: documentation only ] Signed-off-by: H. 
Peter Anvin Cc: Rik van Riel --- Documentation/x86/x86_64/mm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 53941323584..d6498e3cd71 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -6,7 +6,7 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory +ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 22:47:18 -0400 Subject: [PATCH 339/900] ring-buffer: add benchmark and tester This patch adds code that can benchmark the ring buffer as well as test it. This code can be compiled into the kernel (not recommended) or as a module. A separate ring buffer is used to not interfere with other users, like ftrace. It creates a producer and a consumer (option to disable creation of the consumer) and will run for 10 seconds, then sleep for 10 seconds and then repeat. While running, the producer will write 10 byte loads into the ring buffer with just putting in the current CPU number. The reader will continually try to read the buffer. The reader will alternate from reading the buffer via event by event, or by full pages. The output is a pr_info, thus it will fill up the syslogs.
Starting ring buffer hammer End ring buffer hammer Time: 9000349 (usecs) Overruns: 12578640 Read: 5358440 (by events) Entries: 0 Total: 17937080 Missed: 0 Hit: 17937080 Entries per millisec: 1993 501 ns per entry Sleeping for 10 secs Starting ring buffer hammer End ring buffer hammer Time: 9936350 (usecs) Overruns: 0 Read: 28146644 (by pages) Entries: 74 Total: 28146718 Missed: 0 Hit: 28146718 Entries per millisec: 2832 353 ns per entry Sleeping for 10 secs Time: is the time the test ran Overruns: the number of events that were overwritten and not read Read: the number of events read (either by pages or events) Entries: the number of entries left in the buffer (the by pages will only read full pages) Total: Entries + Read + Overruns Missed: the number of entries that failed to write Hit: the number of entries that were written The above example shows that it takes ~353 nanosecs per entry when there is a reader, reading by pages (and no overruns) The event by event reader slowed the producer down to 501 nanosecs. [ Impact: see how changes to the ring buffer affect stability and performance ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 16 ++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer_benchmark.c | 379 +++++++++++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/trace/ring_buffer_benchmark.c diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 450d3c2cfbd..50f62a296e1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -471,6 +471,22 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and bench mark it. + It creates its own ring buffer such that it will not interfer with + any other users of the ring buffer (such as ftrace). 
It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index fb9d7f96489..7c34cbfff96 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 00000000000..747244acb8f --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,379 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + 
EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + commit = local_read(&rpage->commit); + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* We don't expect any padding */ + KILL_TEST(); + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0]; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = 
read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + pr_info("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + do_gettimeofday(&end_tv); + + if (consumer && !(++cnt % wakeup_interval)) + wake_up_process(consumer); + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + pr_info("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= 1000000; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + pr_info("ERROR!\n"); + pr_info("Time: %lld (usecs)\n", time); + pr_info("Overruns: %lld\n", overruns); + if (disable_reader) + pr_info("Read: (reader disabled)\n"); + 
else + pr_info("Read: %ld (by %s)\n", read, + read_events ? "events" : "pages"); + pr_info("Entries: %lld\n", entries); + pr_info("Total: %lld\n", entries + overruns + read); + pr_info("Missed: %ld\n", missed); + pr_info("Hit: %ld\n", hit); + + do_div(time, 1000); + if (time) + hit /= (long)time; + else + pr_info("TIME IS ZERO??\n"); + + pr_info("Entries per millisec: %ld\n", hit); + + if (hit) { + avg = 1000000 / hit; + pr_info("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + pr_info("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + __set_current_state(TASK_RUNNING); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = 
kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); From b2e5d8588de0b5341eddad87dbe48d2185eaa3dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 May 2009 07:55:33 +0200 Subject: [PATCH 340/900] irq: change ->set_affinity() to return status, fix This build failure: arch/powerpc/sysdev/mpic.c:810: error: conflicting types for 'mpic_set_affinity' arch/powerpc/sysdev/mpic.h:39: error: previous declaration of 'mpic_set_affinity' was here make[2]: *** [arch/powerpc/sysdev/mpic.o] Error 1 make[2]: *** Waiting for unfinished jobs.... 
Triggers because the function prototype was not updated when the function call signature got changed by: d5dedd4: irq: change ->set_affinity() to return status [ Impact: build fix on powerpc ] Cc: Benjamin Herrenschmidt Cc: Yinghai Lu Cc: Andrew Morton Cc: Rusty Russell Cc: linux-arch@vger.kernel.org LKML-Reference: <49F654E9.4070809@kernel.org> Signed-off-by: Ingo Molnar --- arch/powerpc/sysdev/mpic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h index 3cef2af10f4..eff433c322a 100644 --- a/arch/powerpc/sysdev/mpic.h +++ b/arch/powerpc/sysdev/mpic.h @@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic) extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type); extern void mpic_set_vector(unsigned int virq, unsigned int vector); -extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask); +extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask); #endif /* _POWERPC_SYSDEV_MPIC_H */ From fd6da10a617f483348ee32bcfe53fd20c302eca1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:32:13 +0800 Subject: [PATCH 341/900] tracing/events: don't say hi when loading the trace event sample The sample is useful for testing, and I'm using it. But after loading the module, it keeps saying hi every 10 seconds, this may be disturbing. Also Steven said commenting out the "hi" helped in causing races. 
:) [ Impact: make testing a bit easier ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A00F6AD.2070008@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- samples/trace_events/trace-events-sample.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index f33b3ba744a..aabc4e97091 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -16,10 +16,6 @@ static void simple_thread_func(int cnt) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); trace_foo_bar("hello", cnt); - - if (!(cnt % 10)) - /* It is really important that I say "hi!" */ - printk(KERN_EMERG "hi!\n"); } static int simple_thread(void *arg) From 96d17980fabeb757706d2d6db5a28580a6156bfc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:32:32 +0800 Subject: [PATCH 342/900] tracing/events: make SAMPLE_TRACE_EVENTS default to n Normally a config should be default to n. This patch also makes the sample module-only, like SAMPLE_MARKERS and SAMPLE_TRACEPOINTS. [ Impact: don't build trace event sample by default ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A00F6C0.8090803@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- samples/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 93f41c05109..b75d28cba3f 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -20,9 +20,8 @@ config SAMPLE_TRACEPOINTS This build tracepoints example modules. config SAMPLE_TRACE_EVENTS - tristate "Build trace_events examples" - depends on EVENT_TRACING - default m + tristate "Build trace_events examples -- loadable modules only" + depends on EVENT_TRACING && m help This build trace event example modules. 
From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:04 +0800 Subject: [PATCH 343/900] tracing/events: fix memory leak when unloading module When unloading a module, memory allocated by init_preds() and trace_define_field() is not freed. [ Impact: fix memory leak ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_events.c | 18 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 22 +++++++++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5fff40c9ff5..662c1becf36 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -116,6 +116,7 @@ struct ftrace_event_call { #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); +extern void destroy_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern int filter_current_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f789ca540fe..f251a150e75 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,6 +60,22 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#ifdef CONFIG_MODULES + +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +#endif /* CONFIG_MODULES */ + static void ftrace_clear_events(void) { struct ftrace_event_call *call; @@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod) unregister_ftrace_event(call->event); 
debugfs_remove_recursive(call->dir); list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f49486687ee..ce07b818671 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; +} + int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call) return 0; oom: - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); - kfree(call->filter); - call->filter = NULL; + destroy_preds(call); return -ENOMEM; } From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:45 +0800 Subject: [PATCH 344/900] tracing/events: fix concurrent access to ftrace_events list A module will add/remove its trace events when it gets loaded/unloaded, so the ftrace_events list is not "const", and concurrent access needs to be protected. This patch thus fixes races between loading/unloding modules and read 'available_events' or read/write 'set_event', etc. Below shows how to reproduce the race: # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null & # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } & After a while: BUG: unable to handle kernel paging request at 0010011c IP: [] t_next+0x1b/0x2d ... Call Trace: [] ? seq_read+0x217/0x30d [] ? seq_read+0x0/0x30d [] ? vfs_read+0x8f/0x136 [] ? sys_read+0x40/0x65 [] ? 
sysenter_do_call+0x12/0x36 [ Impact: fix races when concurrent accessing ftrace_events list ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Tom Zanussi Cc: Peter Zijlstra LKML-Reference: <4A00F709.3080800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_event_profile.c | 19 ++++++++++++++----- kernel/trace/trace_events.c | 20 +++++++++++--------- kernel/trace/trace_events_filter.c | 10 +++++++--- 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7736fe8c1b7..777c6c3a0cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event, \ return match; \ } +extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 7bf2ad65eee..5b5895afecf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,21 +10,30 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; + int ret = -EINVAL; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_enable(event); + if (event->id == event_id) { + ret = event->profile_enable(event); + break; + } } + mutex_unlock(&event_mutex); - return -EINVAL; + return ret; } void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_disable(event); + if (event->id == event_id) { + event->profile_disable(event); + break; + } } + mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f251a150e75..8d579ff2361 100644 --- 
a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,7 +21,7 @@ #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); @@ -80,6 +80,7 @@ static void ftrace_clear_events(void) { struct ftrace_event_call *call; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { @@ -87,6 +88,7 @@ static void ftrace_clear_events(void) call->unregfunc(); } } + mutex_unlock(&event_mutex); } static void ftrace_event_enable_disable(struct ftrace_event_call *call, @@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return t_next(m, NULL, pos); } @@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return s_next(m, NULL, pos); } @@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static int ftrace_event_seq_open(struct inode *inode, struct file *file) { - int ret; const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && @@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) ftrace_clear_events(); seq_ops = inode->i_private; - ret = seq_open(file, seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = ftrace_events.next; - } - return ret; + return seq_open(file, seq_ops); } static ssize_t diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index ce07b818671..7ac69108527 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) 
filter->n_preds = 0; } + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) remove_filter_string(call->filter); } } + mutex_unlock(&event_mutex); } static int filter_add_pred_fn(struct filter_parse_state *ps, @@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, { struct event_filter *filter = system->filter; struct ftrace_event_call *call; + int err = 0; if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), @@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, filter->preds[filter->n_preds] = pred; filter->n_preds++; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - int err; if (!call->define_fields) continue; @@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (err) { filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; + break; } replace_filter_string(call->filter, filter_string); } + mutex_unlock(&event_mutex); - return 0; + return err; } static void parse_init(struct filter_parse_state *ps, From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 May 2009 16:49:59 +0800 Subject: [PATCH 345/900] tracepoint: trace_sched_migrate_task(): remove parameter The orig_cpu parameter in trace_sched_migrate_task() is not necessary, it can be got by using task_cpu(p) in the probe. [ Impact: micro-optimization ] Signed-off-by: Mathieu Desnoyers [ modified from Mathieu's patch. 
The original patch is at: http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ] Signed-off-by: Xiao Guangrong Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org Cc: Li Zefan Cc: zhaolei@cn.fujitsu.com Cc: laijs@cn.fujitsu.com LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 6 +++--- kernel/sched.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ffa1cab586b..dd4033cf5b0 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -180,9 +180,9 @@ TRACE_EVENT(sched_switch, */ TRACE_EVENT(sched_migrate_task, - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_PROTO(struct task_struct *p, int dest_cpu), - TP_ARGS(p, orig_cpu, dest_cpu), + TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -196,7 +196,7 @@ TRACE_EVENT(sched_migrate_task, memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; + __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), diff --git a/kernel/sched.c b/kernel/sched.c index 9f7ffd00b6e..9cdedbd181c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:27:26 -0400 Subject: [PATCH 346/900] blktrace: correct remap names This attempts to clarify names utilized during block I/O remap operations (partition, volume manager). It correctly matches up the /from/ information for both device & sector. 
This takes in the concept from Kosaki Motohiro and extends it to include better naming for the "device_from" field. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF4FAE.3000301@hp.com> Signed-off-by: Ingo Molnar --- include/linux/blktrace_api.h | 4 ++-- include/trace/block.h | 4 ++-- kernel/trace/blktrace.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 62763c95285..82b4636030e 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -116,9 +116,9 @@ struct blk_io_trace { * The remap event */ struct blk_io_trace_remap { - __be32 device; __be32 device_from; - __be64 sector; + __be32 device_to; + __be64 sector_from; }; enum { diff --git a/include/trace/block.h b/include/trace/block.h index 25b7068b819..87f6456fd32 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TP_ARGS(q, bio, dev, from, to)); + sector_t to, sector_t from), + TP_ARGS(q, bio, dev, to, from)); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c32062bd10b..f8d46d6f5d3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @from: source sector * @to: target sector + * @from: source sector * * Description: * Device mapper or raid target sometimes need to split a bio because @@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, 
sector_t to) + dev_t dev, sector_t to, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); @@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent, struct blk_io_trace_remap *r) { const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; + __u64 sector_from = __r->sector_from; - r->device = be32_to_cpu(__r->device); r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); } typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s, static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { - struct blk_io_trace_remap r = { .device = 0, }; + struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001 From: "Alan D. 
Brunelle" Date: Mon, 4 May 2009 16:35:08 -0400 Subject: [PATCH 347/900] blktrace: from-sector redundant in trace_block_remap Remove redundant from-sector parameter: it's /always/ the bio's sector passed in. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF517C.7000503@hp.com> Signed-off-by: Ingo Molnar --- block/blk-core.c | 5 ++--- drivers/md/dm.c | 3 +-- include/trace/block.h | 4 ++-- kernel/trace/blktrace.c | 8 ++++---- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 07ab75403e1..a5f747a8312 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1275,7 +1275,7 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_bdev = bdev->bd_contains; trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, bio->bi_sector, + bdev->bd_dev, bio->bi_sector - p->start_sect); } } @@ -1444,8 +1444,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, bio->bi_sector, - old_sector); + trace_block_remap(q, bio, old_dev, old_sector); trace_block_bio_queue(q, bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a994be035b..b01514afb6b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* the bio has been remapped so dispatch it */ trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, - clone->bi_sector, sector); + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { diff --git a/include/trace/block.h b/include/trace/block.h index 87f6456fd32..8ac945b7746 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, 
struct bio *bio, dev_t dev, - sector_t to, sector_t from), - TP_ARGS(q, bio, dev, to, from)); + sector_t to), + TP_ARGS(q, bio, dev, to)); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f8d46d6f5d3..e099f8cc1d1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @to: target sector * @from: source sector * * Description: @@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t to, sector_t from) + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); } /** From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 6 May 2009 15:45:45 +0530 Subject: [PATCH 348/900] tracing: trace_output.c, fix false positive compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compiler warning: CC kernel/trace/trace_output.o kernel/trace/trace_output.c: In function ‘register_ftrace_event’: kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function Is wrong as 'list' is always initialized - but GCC (4.3.2) does not recognize this relationship properly. Work around the warning by initializing the variable to NULL. 
[ Impact: fix false positive compiler warning ] Signed-off-by: Jaswinder Singh Rajput Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5fc51f0f75f..8bd9a2c1a46 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event) INIT_LIST_HEAD(&event->list); if (!event->type) { - struct list_head *list; + struct list_head *list = NULL; if (next_event_type > FTRACE_MAX_EVENT) { From 35cf723e99c0e26ddf51f037dffaa4ff2c2c9106 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 May 2009 12:33:38 +0200 Subject: [PATCH 349/900] tracing: small trave_events sample Makefile cleanup Use -I$(src) to add the current directory to the include path. [ Impact: cleanup ] Signed-off-by: Christoph Hellwig Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- samples/trace_events/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile index 06c6dea1eb8..0d428dc6728 100644 --- a/samples/trace_events/Makefile +++ b/samples/trace_events/Makefile @@ -1,8 +1,6 @@ # builds the trace events example kernel modules; # then to use one (as root): insmod -PWD := $(shell pwd) - -CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ +CFLAGS_trace-events-sample.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 10:26:45 -0400 Subject: [PATCH 350/900] ring-buffer: remove unneeded conditional in rb_reserve_next The code in __rb_reserve_next checks on page overflow if it is the original committer and then resets the page back to the original setting.
Although this is fine, and the code is correct, it is a bit fragile. Some experimental work I did breaks it easily. The better and more robust solution is to have all committers that overflow the page, simply subtract what they added. [ Impact: more robust ring buffer account management ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 424129eb20a..03ed52b67db 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_event_set_padding(event); } - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); /* * If this was a commit entry that failed, @@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + local_sub(length, &tail_page->write); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 12:40:51 -0400 Subject: [PATCH 351/900] ring-buffer: check for failed allocation in ring buffer benchmark The result of the allocation of the ring buffer read page in the ring buffer benchmark does not check the return to see if a page was actually allocated. This patch fixes that.
[ Impact: avoid NULL dereference ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 747244acb8f..dcd75e9e49f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -84,6 +84,9 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 15:30:07 -0400 Subject: [PATCH 352/900] ring-buffer: make moving the tail page a separate function Ingo Molnar thought the code would be cleaner if we used a function call instead of a goto for moving the tail page. After implementing this, it seems that gcc still inlines the result and the output is pretty much the same. Since this is considered a cleaner approach, might as well implement it. 
[ Impact: code clean up ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 89 +++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 03ed52b67db..3ae5ccf2c0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } + static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - struct buffer_page *next_page; - unsigned long tail, write; + struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; bool lock_taken = false; - - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; - - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) - goto next_page; - - /* We reserved something on the buffer */ - - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - - event = __rb_page_index(tail_page, tail); - rb_update_event(event, type, length); - - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); - - /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. 
- */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; - - return event; - - next_page: + unsigned long flags; next_page = tail_page; @@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); + + /* We reserved something on the buffer */ + + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; + + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->page->time_stamp = *ts; + + return event; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 18:36:59 -0400 Subject: [PATCH 353/900] ring-buffer: change test to be more latency friendly The ring buffer benchmark/test runs a producer for 10 seconds. This is done with preemption and interrupts enabled. But if the kernel is not compiled with CONFIG_PREEMPT, it basically stops everything but interrupts for 10 seconds. 
Although this is just a test and is not for production, this attribute can be quite annoying. It can also spawn badness elsewhere. This patch solves the issues by calling "cond_resched" when the system is not compiled with CONFIG_PREEMPT. It also keeps track of the time spent to call cond_resched such that it does not go against the time calculations. That is, if the task schedules away, the time scheduled out is removed from the test data. Note, this only works for non PREEMPT because we do not know when the task is scheduled out if we have PREEMPT enabled. [ Impact: prevent test from stopping the world for 10 seconds ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index dcd75e9e49f..a26fc67b63b 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,6 +185,35 @@ static void ring_buffer_consumer(void) complete(&read_done); } +/* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call cond_resched + * and also add any time that was lost by a rescedule. 
+ */ +#ifdef CONFIG_PREEMPT +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ +} +#else +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ + struct timeval tv; + + cond_resched(); + do_gettimeofday(&tv); + if (tv.tv_usec < end_tv->tv_usec) { + tv.tv_usec += 1000000; + tv.tv_sec--; + } + start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; + start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; + if (start_tv->tv_usec > 1000000) { + start_tv->tv_usec -= 1000000; + start_tv->tv_sec++; + } +} +#endif + static void ring_buffer_producer(void) { struct timeval start_tv; @@ -221,6 +250,8 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); + sched_if_needed(&start_tv, &end_tv); + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); From 71e1c8ac42ae4038ddb1367cce7097ab868dc532 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:20:39 -0400 Subject: [PATCH 354/900] tracing: update sample with TRACE_INCLUDE_FILE When creating trace events for ftrace, the header file with the TRACE_EVENT macros must also have a macro called TRACE_SYSTEM. This macro describes the name of the system the TRACE_EVENTS are defined for. It also doubles as a way for the define_trace.h file to include the file that included it. For example: in irq.h #define TRACE_SYSTEM irq [...] #include The define_trace will use TRACE_SYSTEM to include irq.h. But if the name of the trace system does not match the name of the trace header file, one can override it with: #define TRACE_INCLUDE_FILE foo_trace Which will change define_trace.h to include foo_trace.h instead of foo.h. The sample comments this, but people that use the sample code will more likely use the code and not read the comments. This patch changes the sample code to use the TRACE_INCLUDE_FILE to better show developers how to use it.
[ Impact: make sample less confusing to developers ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index eab46443e61..128a897687c 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -31,7 +31,7 @@ * */ #undef TRACE_SYSTEM -#define TRACE_SYSTEM trace-events-sample +#define TRACE_SYSTEM sample /* * The TRACE_EVENT macro is broken up into 5 parts. @@ -120,5 +120,10 @@ TRACE_EVENT(foo_bar, * result. */ #undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH . +/* + * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal + */ +#define TRACE_INCLUDE_FILE trace-events-sample #include From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:54:09 -0400 Subject: [PATCH 355/900] tracing: reset ring buffer when removing modules with events Li Zefan found that there's a race using the event ids of events and modules. When a module is loaded, an event id is incremented. We only have 16 bits for event ids (65536) and there is a possible (but highly unlikely) race that we could load and unload a module that registers events so many times that the event id counter overflows. When it overflows, it then restarts and goes looking for available ids. An id is available if it was added by a module and released. The race is if you have one module add an id, and then is removed. Another module loaded can use that same event id. But if the old module still had events in the ring buffer, the new module's call back would get bogus data. At best (and most likely) the output would just be garbage. But if the module for some reason used pointers (not recommended) then this could potentially crash. 
The safest thing to do is just reset the ring buffer if a module that registered events is removed. [ Impact: prevent unpredictable results of event id overflows ] Reported-by: Li Zefan LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 9 +++++++++ 3 files changed, 21 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4164a344e72..dd40d232034 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr) tracing_reset(tr, cpu); } +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + #define SAVED_CMDLINES 128 #define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 777c6c3a0cd..ba25793ffe6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -409,6 +409,8 @@ int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, mode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d579ff2361..6d2c842a024 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; + bool found = false; list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { + found = true; if (call->enabled) { call->enabled = 0; 
call->unregfunc(); @@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); } static int trace_module_notify(struct notifier_block *self, From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 22:52:15 -0400 Subject: [PATCH 356/900] tracing: add hierarchical enabling of events With the current event directory, you can only enable individual events. The file debugfs/tracing/set_event is used to be able to enable or disable several events at once. But that can still be awkward. This patch adds hierarchical enabling of events. That is, each directory in debugfs/tracing/events has an "enable" file. This file can enable or disable all events within the directory and below. # echo 1 > /debugfs/tracing/events/enable will enable all events. # echo 1 > /debugfs/tracing/events/sched/enable will enable all events in the sched subsystem. # echo 1 > /debugfs/tracing/events/enable # echo 0 > /debugfs/tracing/events/irq/enable will enable all events, but then disable just the irq subsystem events. When reading one of these enable files, there are four results: 0 - all events this file affects are disabled 1 - all events this file affects are enabled X - there is a mixture of events enabled and disabled ? 
- this file does not affect any event Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 140 ++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6d2c842a024..87feb0117ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = -1; + int all = 0; + int ret; + + if (system[0] == '*') + all = 1; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->regfunc) + continue; + + if (!all && strcmp(call->system, system) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + if (call->enabled) { + switch (set) { + case -1: + set = 1; + break; + case 0: + set = 2; + break; + } + } else { + switch (set) { + case -1: + set = 0; + break; + case 1: + set = 2; + break; + } + } + /* + * If we have a mixture, no need to look further. 
+ */ + if (set == 2) + break; + } + mutex_unlock(&event_mutex); + + buf[1] = '\n'; + switch (set) { + case 0: + buf[0] = '0'; + break; + case 1: + buf[0] = '1'; + break; + case 2: + buf[0] = 'X'; + break; + default: + buf[0] = '?'; + } + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + unsigned long val; + char *command; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + break; + + default: + return -EINVAL; + } + + command = kstrdup(system, GFP_KERNEL); + if (!command) + return -ENOMEM; + + ret = ftrace_set_clr_event(command, val); + if (ret) + goto out_free; + + ret = cnt; + + out_free: + kfree(command); + + *ppos += cnt; + + return ret; +} + extern char *__bad_type_size(void); #undef FIELD @@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_system_enable_fops = { + .open = tracing_open_generic, + .read = system_enable_read, + .write = system_enable_write, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, @@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } + entry = trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); + return system->entry; } @@ -1041,6 +1178,9 @@ static __init int event_trace_init(void) ring_buffer_print_entry_header, &ftrace_show_header_fops); + trace_create_file("enable", 0644, d_events, 
+ "*:*", &ftrace_system_enable_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) From 975e5f45500dff6d15c0001bb662e9aac0ce0076 Mon Sep 17 00:00:00 2001 From: Samuel Bronson Date: Wed, 6 May 2009 22:27:55 -0400 Subject: [PATCH 357/900] x86: use symbolic name for VM86_SIGNAL when used as vm86 default return This code has apparently used "0" and not VM86_SIGNAL since Linux 1.1.9, when Linus added VM86_SIGNAL to vm86.h. This patch changes the code to use the symbolic name. The magic 0 tripped me up in trying to extend the vm86(2) manpage to actually explain vm86()'s interface -- my greps for VM86_SIGNAL came up fruitless. [ Impact: cleanup; no object code change ] Signed-off-by: Samuel Bronson Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vm86_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..b8035a0f404 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -318,9 +318,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); From 643bec956544d376b7c2a80a3d5c3d0bf94da8d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 May 2009 09:12:50 +0200 Subject: [PATCH 358/900] x86: clean up arch/x86/kernel/tsc_sync.c a bit - remove unused define - make the lock variable definition stand out some more - convert KERN_* to pr_info() / pr_warning() [ Impact: cleanup ] LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git 
a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef..027b5b49899 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - From aa47b7e0f89b9998dad4d1667447e8cb7703ff4e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 4 May 2009 01:38:05 -0700 Subject: [PATCH 359/900] sched: emit thread info flags with stack trace When a thread is oom killed and fails to exit, it's helpful to know which threads have access to memory reserves if the machine livelocks. 
This is done by testing for the TIF_MEMDIE thread info flag and should be displayed alongside stack traces to identify tasks that have access to such reserves but are still stuck allocating pages, for instance. It would probably be helpful in other cases as well, so all thread info flags are emitted when showing a task. ( v2: fix warning reported by Stephen Rothwell ) [ Impact: extend debug printout info ] Signed-off-by: David Rientjes Cc: Peter Zijlstra Cc: Stephen Rothwell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 2a43a581ead..5aa63f50c69 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 02:48:52 +0200 Subject: [PATCH 360/900] tracing/filters: support for filters of dynamic sized arrays Currently the filtering infrastructure supports well the numeric types and fixed sized array types. But the recently added __string() field uses a specific indirect offset mechanism which requires a specific predicate. Until now it wasn't supported. This patch adds this support and implies very few changes, only a new predicate is needed, the management of this specific field can be done through the usual string helpers in the filtering infrastructure. 
[ Impact: support all kinds of strings in the tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 44 ++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7ac69108527..01c76eb3e16 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)), return val1 || val2; } +/* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event, int val1, int val2) { @@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. 
+ */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, + int val1, int val2) +{ + int str_loc = *(int *)(event + pred->offset); + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event, int val1, int val2) { @@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } +enum { + FILTER_STATIC_STRING = 1, + FILTER_DYN_STRING +}; + static int is_string_field(const char *type) { if (strchr(type, '[') && strstr(type, "char")) - return 1; + return FILTER_STATIC_STRING; + + if (!strcmp(type, "__str_loc")) + return FILTER_DYN_STRING; return 0; } @@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + int string_type; pred->fn = filter_pred_none; @@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field->type)) { - fn = filter_pred_string; + string_type = is_string_field(field->type); + if (string_type) { + if (string_type == FILTER_STATIC_STRING) + fn = filter_pred_string; + else + fn = filter_pred_strloc; pred->str_len = field->size; if (pred->op == OP_NE) pred->not = 1; From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 03:03:57 +0200 Subject: [PATCH 361/900] tracing/filters: support for operator reserved characters in strings When we set a filter for an event, such as: echo "name == my_lock_name" > \ /debug/tracing/events/lockdep/lock_acquired/filter then the following order of token type is parsed: - space - operator - parentheses - operand Because the operators and parentheses have a higher precedence than the operand characters, which is normal, then we can't use any string containing such special 
characters: ()=<>!&| To get this support and also avoid ambiguous interpretation from the parser or the human, we can do it using double quotes so that we keep the usual language habits. Then after this patch you can still declare a string condition like before: echo name == myname But if you want to compare against a string containing an operator character, you can use double quotes: echo 'name == "&myname"' Don't forget to enclose the whole expression in single quotes or the double ones will be eaten by echo. [ Impact: support strings with special characters for tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 01c76eb3e16..8c62e5bdff0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + int in_string = 0; int op, top_op; char ch; while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; + continue; + } + + if (in_string) + goto parse_operand; + if (isspace(ch)) continue; @@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps) } continue; } +parse_operand: if (append_operand_char(ps, ch)) { parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); return -EINVAL; From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 7 May 2009 15:11:15 +0800 Subject: [PATCH 362/900] tracing/events: fix concurrent access to ftrace_events list, fix In filter_add_subsystem_pred() we should release event_mutex before calling filter_free_subsystem_preds(), since both functions hold event_mutex.
[ Impact: fix deadlock when writing invalid pred into subsystem filter ] Signed-off-by: Li Zefan Cc: tzanussi@gmail.com Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <4A028993.7020509@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8c62e5bdff0..85ad6a8939a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, err = filter_add_pred(ps, call, pred); if (err) { + mutex_unlock(&event_mutex); filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - break; + goto out; } replace_filter_string(call->filter, filter_string); } mutex_unlock(&event_mutex); - +out: return err; } From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:13:42 -0400 Subject: [PATCH 363/900] ring-buffer: remove complex calculations in ring-buffer-test Ingo Molnar thought that the code to calculate the time in cond_resched is a bit too ugly and is not needed. This patch removes it and replaces it with a simple call to cond_resched. I kept the comment that explains the reason for the cond_resched. 
[ Impact: remove ugly code ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 37 ++++++---------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a26fc67b63b..f4ceb453c7d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,35 +185,6 @@ static void ring_buffer_consumer(void) complete(&read_done); } -/* - * If we are a non preempt kernel, the 10 second run will - * stop everything while it runs. Instead, we will call cond_resched - * and also add any time that was lost by a rescedule. - */ -#ifdef CONFIG_PREEMPT -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ -} -#else -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ - struct timeval tv; - - cond_resched(); - do_gettimeofday(&tv); - if (tv.tv_usec < end_tv->tv_usec) { - tv.tv_usec += 1000000; - tv.tv_sec--; - } - start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; - start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; - if (start_tv->tv_usec > 1000000) { - start_tv->tv_usec -= 1000000; - start_tv->tv_sec++; - } -} -#endif - static void ring_buffer_producer(void) { struct timeval start_tv; @@ -250,7 +221,13 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); - sched_if_needed(&start_tv, &end_tv); + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. 
+ */ + cond_resched(); } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); From 54f2c841fa0007e5fee3b7d01a911c774f0a6cda Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 7 May 2009 17:28:59 +0200 Subject: [PATCH 364/900] oprofile: fix cpu buffer size The unit of oprofile_cpu_buffer_size is in samples, but was allocated in bytes. This led to the allocation of too small cpu buffers. This patch recalculates the buffer size in bytes taking also the ring_buffer_event header size into account. Reported-by: Suravee Suthikulpanit Signed-off-by: Robert Richter --- drivers/oprofile/cpu_buffer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index f0e99d4c066..242257b1944 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -78,16 +78,20 @@ void free_cpu_buffers(void) op_ring_buffer_write = NULL; } +#define RB_EVENT_HDR_SIZE 4 + int alloc_cpu_buffers(void) { int i; unsigned long buffer_size = oprofile_cpu_buffer_size; + unsigned long byte_size = buffer_size * (sizeof(struct op_sample) + + RB_EVENT_HDR_SIZE); - op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); + op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS); if (!op_ring_buffer_read) goto fail; - op_ring_buffer_write = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); + op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS); if (!op_ring_buffer_write) goto fail; From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:49:35 -0400 Subject: [PATCH 365/900] tracing: append ":*" to internal setting of system events The system enabling of events uses the same code as the set_event file. It passes in the name of the system to the parser and that will enable all the events that has that system as a name. 
The problem is that it will also enable events with the same name as the system. Suppose you have a system named foo and a system named bar, and within the system bar there exists an event called foo. By setting the system name foo, you will also be enabling the event foo in the system bar. This is not an expected result. The solution is to pass in "foo:*", which will only enable the system foo and not events called foo. [ Impact: prevent accidental enabling of events with same name as a system ] Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87feb0117ce..8d0fae3af59 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; } - command = kstrdup(system, GFP_KERNEL); + /* +3 for the ":*\0" */ + command = kmalloc(strlen(system)+3, GFP_KERNEL); if (!command) return -ENOMEM; + sprintf(command, "%s:*", system); ret = ftrace_set_clr_event(command, val); if (ret) @@ -1179,7 +1181,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*:*", &ftrace_system_enable_fops); + "*", &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 12:49:27 -0400 Subject: [PATCH 366/900] tracing: have menu default enabled when kernel debug is configured Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is enabled it is nice to enable the trace menu as well. This patch only makes the tracing menu enabled by default; it does not make any of the tracers enabled. And the menu is only enabled by default if DEBUG_KERNEL is enabled.
[ Impact: show tracing options to those debugging the kernel ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 50f62a296e1..f61be301578 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,6 +79,7 @@ if TRACING_SUPPORT menuconfig FTRACE bool "Tracers" + default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 14:20:28 -0400 Subject: [PATCH 367/900] ring-buffer: only periodically call cond_resched to ring-buffer-benchmark Calling cond_resched at every iteration of the loop adds a bit of overhead to the benchmark. This patch does two things. 1) only calls cond-resched when CONFIG_PREEMPT is not enabled 2) only calls cond-resched after so many traces has been performed. [ Impact: less overhead to the ring-buffer-benchmark ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index f4ceb453c7d..a7c048bb446 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -218,16 +218,23 @@ static void ring_buffer_producer(void) } do_gettimeofday(&end_tv); - if (consumer && !(++cnt % wakeup_interval)) + cnt++; + if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); +#ifndef CONFIG_PREEMPT /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. 
*/ - cond_resched(); + if (cnt % wakeup_interval) + cond_resched(); +#endif } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); From d3584183d2f40f40371e288ceef187d04da213b5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 7 May 2009 15:36:13 -0700 Subject: [PATCH 368/900] sparc64: Fix SET_PERSONALITY to not clip bits outside of PER_MASK. Signed-off-by: David S. Miller --- arch/sparc/include/asm/elf_64.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h index 425c2f9be6d..d42e393078c 100644 --- a/arch/sparc/include/asm/elf_64.h +++ b/arch/sparc/include/asm/elf_64.h @@ -208,8 +208,9 @@ do { unsigned long new_flags = current_thread_info()->flags; \ else \ clear_thread_flag(TIF_ABI_PENDING); \ /* flush_thread will update pgd cache */ \ - if (current->personality != PER_LINUX32) \ - set_personality(PER_LINUX); \ + if (personality(current->personality) != PER_LINUX32) \ + set_personality(PER_LINUX | \ + (current->personality & (~PER_MASK))); \ } while (0) #endif /* !(__ASM_SPARC64_ELF_H) */ From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:52:20 -0400 Subject: [PATCH 369/900] ring-buffer: add total count in ring-buffer-benchmark It is nice to see the overhead of the benchmark test when tracing is disabled. That is, we turn off the ring buffer just to see what the cost of running the loop that calls into the ring buffer is. Currently, if no entries were made, we get 0. This is not informative. This patch changes it to check if we had any "missed" (non recorded) events. If so, a total count is also reported.
[ Impact: evaluate the over head of the ring buffer benchmark test ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a7c048bb446..a21aa7b3d05 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -285,6 +285,17 @@ static void ring_buffer_producer(void) avg = 1000000 / hit; pr_info("%ld ns per entry\n", avg); } + + + if (missed) { + if (time) + missed /= (long)time; + + pr_info("Total iterations per millisec: %ld\n", hit + missed); + + avg = 1000000 / (hit + missed); + pr_info("%ld ns per entry\n", avg); + } } static void wait_to_die(void) From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:58:55 -0400 Subject: [PATCH 370/900] ring-buffer: change WARN_ON from checking preempt_count to preemptible There's a WARN_ON in the ring buffer code that makes sure preemption is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not enabled, preempt_count() is always zero, and this will trigger the warning. [ Impact: prevent false warning on non preemptible kernels ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ae5ccf2c0f..361170609bd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * committed yet. Thus we can assume that preemption * is still disabled. 
*/ - RB_WARN_ON(buffer, !preempt_count()); + RB_WARN_ON(buffer, preemptible()); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; From 6b2e8523df148c15ea5abf13075026fb8bdb3f86 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 May 2009 11:56:49 -0700 Subject: [PATCH 371/900] xen: reserve Xen start_info rather than e820 reserving Use reserve_early rather than e820 reservations for Xen start info and mfn->pfn table, so that the memory use is a bit more self-documenting. [ Impact: cleanup ] Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Cc: Linus Torvalds LKML-Reference: <4A032EF1.6070708@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 15c6c68db6a..ad0047f47cd 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -61,9 +61,9 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in */ - e820_add_region(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list, - E820_RESERVED); + reserve_early(__pa(xen_start_info->mfn_list), + __pa(xen_start_info->pt_base), + "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); From 1dcdb5a9e7c235e6e80f1f4d5b8247b3e5347e48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 17:44:11 +0200 Subject: [PATCH 372/900] oprofile: re-add force_arch_perfmon option This re-adds the force_arch_perfmon option that was in the original arch perfmon patchkit. Originally this was rejected in favour of a generalized perfmon=name option, but it turned out implementing the later in a reliable way is hard (and it would have been easy to crash the kernel if a user gets it wrong) But now Atom and Core i7 support being readded a user would need to update their oprofile userland to beyond 0.9.4 to use oprofile again on Atom or Core i7. 
To avoid this problem readd the force_arch_perfmon option. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- Documentation/kernel-parameters.txt | 6 ++++++ arch/x86/oprofile/nmi_int.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 90b3924071b..9b9566bf330 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1650,6 +1650,12 @@ and is between 256 and 4096 characters. It is defined in the file oprofile.timer= [HW] Use timer interrupt instead of performance counters + oprofile.force_arch_perfmon=1 [X86] + Force use of architectural perfmon instead of + the CPU specific event set. + This might be useful if you have older oprofile + userland or if you want common events over Intel CPUs. + osst= [HW,SCSI] SCSI Tape Driver Format: , See also Documentation/scsi/st.txt. diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a..e5171c99e15 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -389,10 +389,16 @@ static int __init p4_init(char **cpu_type) return 0; } +int force_arch_perfmon; +module_param(force_arch_perfmon, int, 0); + static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + if (force_arch_perfmon && cpu_has_arch_perfmon) + return 0; + switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; From 1f3d7b60691993d8d368d8dd7d5d85871d41e8f5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 17:44:12 +0200 Subject: [PATCH 373/900] oprofile: remove undocumented oprofile.p4force option There are no new P4s and the oprofile code knows about all existing ones, so we don't really need the p4force option anymore. Remove it. 
Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e5171c99e15..f472c0c48a3 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -356,14 +356,11 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ -static int p4force; -module_param(p4force, int, 0); - static int __init p4_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - if (!p4force && (cpu_model > 6 || cpu_model == 5)) + if (cpu_model > 6 || cpu_model == 5) return 0; #ifndef CONFIG_SMP From 6adf406f0a0eaf37251018d15f51e93f5b538ee6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 17:44:13 +0200 Subject: [PATCH 374/900] oprofile: add support for Core i7 and Atom The registers are about the same as other Family 6 CPUs so we only need to add detection. I'm not completely happy with calling Nehalem Core i7 because there will be undoubtedly other Nehalem based CPUs in the future with different marketing names, but it's the best we got for now. Requires updated oprofile userland for the new event files. 
If you don't want to update right now you can also use oprofile.force_arch_perfmon=1 (added in the next patch) with 0.9.4 Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f472c0c48a3..3308147182a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -417,6 +417,13 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 26: + arch_perfmon_setup_counters(); + *cpu_type = "i386/core_i7"; + break; + case 28: + *cpu_type = "i386/atom"; + break; default: /* Unknown */ return 0; From 7e4e0bd50e80df2fe5501f48f872448376cdd997 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 May 2009 12:10:23 +0200 Subject: [PATCH 375/900] oprofile: introduce module_param oprofile.cpu_type This patch removes module_param oprofile.force_arch_perfmon and introduces oprofile.cpu_type=archperfmon instead. This new parameter can be reused for other models and architectures. Currently only archperfmon is supported. Cc: Andi Kleen Signed-off-by: Robert Richter --- Documentation/kernel-parameters.txt | 12 +++++++----- arch/x86/oprofile/nmi_int.c | 13 +++++++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9b9566bf330..6ce5f48859c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1650,11 +1650,13 @@ and is between 256 and 4096 characters. It is defined in the file oprofile.timer= [HW] Use timer interrupt instead of performance counters - oprofile.force_arch_perfmon=1 [X86] - Force use of architectural perfmon instead of - the CPU specific event set. - This might be useful if you have older oprofile - userland or if you want common events over Intel CPUs. 
+ oprofile.cpu_type= Force an oprofile cpu type + This might be useful if you have an older oprofile + userland or if you want common events. + Format: { archperfmon } + archperfmon: [X86] Force use of architectural + perfmon on Intel CPUs instead of the + CPU specific event set. osst= [HW,SCSI] SCSI Tape Driver Format: , diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3308147182a..3b285e656e2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -386,8 +386,17 @@ static int __init p4_init(char **cpu_type) return 0; } -int force_arch_perfmon; -module_param(force_arch_perfmon, int, 0); +static int force_arch_perfmon; +static int force_cpu_type(const char *str, struct kernel_param *kp) +{ + if (!strcmp(str, "archperfmon")) { + force_arch_perfmon = 1; + printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } + + return 0; +} +module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:31:42 +0800 Subject: [PATCH 376/900] tracing/events: clean up for ftrace_set_clr_event() Add a helper function __ftrace_set_clr_event(), and replace some ftrace_set_clr_event() calls with this helper, thus we don't need any kstrdup() or kmalloc(). As a side effect, this patch fixes an issue in self tests code, which is similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 ("tracing: append ":*" to internal setting of system events") It's a small issue and won't cause any bug in fact, but we should do things right anyway. 
[ Impact: prevent spurious event-enabling in tracing self-tests ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A03998E.3020503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 128 +++++++++++++----------------------- 1 file changed, 47 insertions(+), 81 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d0fae3af59..45f1099386b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -111,35 +111,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } -static int ftrace_set_clr_event(char *buf, int set) +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. + */ +static int __ftrace_set_clr_event(const char *match, const char *sub, + const char *event, int set) { struct ftrace_event_call *call; - char *event = NULL, *sub = NULL, *match; - int ret = -EINVAL; - - /* - * The buf format can be : - * *: means any event by that name. - * : is the same. - * - * :* means all events in that subsystem - * : means the same. - * - * (no ':') means all events in a subsystem with - * the name or any event that matches - */ - - match = strsep(&buf, ":"); - if (buf) { - sub = match; - event = buf; - match = NULL; - - if (!strlen(sub) || strcmp(sub, "*") == 0) - sub = NULL; - if (!strlen(event) || strcmp(event, "*") == 0) - event = NULL; - } + int ret; mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { @@ -167,6 +146,37 @@ static int ftrace_set_clr_event(char *buf, int set) return ret; } +static int ftrace_set_clr_event(char *buf, int set) +{ + char *event = NULL, *sub = NULL, *match; + + /* + * The buf format can be : + * *: means any event by that name. + * : is the same. + * + * :* means all events in that subsystem + * : means the same. 
+ * + * (no ':') means all events in a subsystem with + * the name or any event that matches + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } + + return __ftrace_set_clr_event(match, sub, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 @@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call; char buf[2]; int set = -1; - int all = 0; int ret; - if (system[0] == '*') - all = 1; - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; - if (!all && strcmp(call->system, system) != 0) + if (system && strcmp(call->system, system) != 0) continue; /* @@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char *command; char buf[64]; ssize_t ret; @@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - case 1: - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } - /* +3 for the ":*\0" */ - command = kmalloc(strlen(system)+3, GFP_KERNEL); - if (!command) - return -ENOMEM; - sprintf(command, "%s:*", system); - - ret = ftrace_set_clr_event(command, val); + ret = __ftrace_set_clr_event(NULL, system, NULL, val); if (ret) - goto out_free; + goto out; ret = cnt; - out_free: - kfree(command); - +out: *ppos += cnt; return ret; @@ -1181,7 +1172,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*", &ftrace_system_enable_fops); + NULL, &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ 
@@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; - char *sysname; int ret; pr_info("Running tests on trace events:\n"); @@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - /* ftrace_set_clr_event can modify the name passed in. */ - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 1); - kfree(sysname); + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) pr_warning("error disabling system %s\n", system->name); @@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - sysname = kmalloc(4, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 1); + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - kfree(sysname); pr_warning("error enabling all events\n"); return; } @@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); 
return; From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:32:05 +0800 Subject: [PATCH 377/900] tracing/events: simplify system_enable_read() A smarter way to figure out the output of an enable file. [ Impact: clean up ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 40 ++++++------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 45f1099386b..df394bc6d54 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -414,10 +414,11 @@ static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + const char set_to_char[4] = { '?', '0', '1', 'X' }; const char *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; - int set = -1; + int set = 0; int ret; mutex_lock(&event_mutex); @@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - if (call->enabled) { - switch (set) { - case -1: - set = 1; - break; - case 0: - set = 2; - break; - } - } else { - switch (set) { - case -1: - set = 0; - break; - case 1: - set = 2; - break; - } - } + set |= (1 << !!call->enabled); + /* * If we have a mixture, no need to look further. 
+ */ - if (set == 2) + if (set == 3) break; } mutex_unlock(&event_mutex); + buf[0] = set_to_char[set]; buf[1] = '\n'; - switch (set) { - case 0: - buf[0] = '0'; - break; - case 1: - buf[0] = '1'; - break; - case 2: - buf[0] = 'X'; - break; - default: - buf[0] = '?'; - } ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); From bf8b9a63c18a1a7777571650de0c9f4fd4368ca0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 8 May 2009 20:53:58 +0530 Subject: [PATCH 378/900] x86: msr-index.h remove duplicate MSR C001_0015 declaration MSRC001_0015 Hardware Configuration Register (HWCR) is already defined as MSR_K7_HWCR. And HWCR is available for >= K7. So MSR_K8_HWCR is not required and no-one is using it. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput --- arch/x86/include/asm/msr-index.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec41fc16c16..4d58d04fca8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -121,7 +121,6 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_HWCR 0xc0010015 #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:06:47 -0400 Subject: [PATCH 379/900] tracing: initialize return value for __ftrace_set_clr_event Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 tracing/events: clean up for ftrace_set_clr_event() Moved out the code for ftrace_set_clr_event into a helper function but did not initialize the return value. As a result, we do not warn about a typo in the echoing of events in set_event.
This patch restores the old warning: # echo foobar > set_event -bash: echo: write error: Invalid argument [ Impact: restore warning of invalid entries to set_event ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df394bc6d54..2eecb87e42d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, const char *event, int set) { struct ftrace_event_call *call; - int ret; + int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:27:41 -0400 Subject: [PATCH 380/900] tracing: add trace_set_clr_event to export event enabling function Other parts of the kernel may need to be able to enable or disable specific events. Especially parts that create trace events. 
[ Impact: allow enabling of trace events by those that create the event ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_events.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 662c1becf36..bae51ddfabd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -127,6 +127,8 @@ extern int trace_define_field(struct ftrace_event_call *call, char *type, #define is_signed_type(type) (((type)(-1)) < 0) +int trace_set_clr_event(const char *system, const char *event, int set); + /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2eecb87e42d..0eec0c55dd8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set) return __ftrace_set_clr_event(match, sub, event, set); } +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. 
+ */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 From 6cac5a924668a56c7ccefc345805f1fe0536a90e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 29 Mar 2009 19:56:29 -0700 Subject: [PATCH 381/900] xen/x86-64: fix breakpoints and hardware watchpoints Native x86-64 uses the IST mechanism to run int3 and debug traps on an alternative stack. Xen does not do this, and so the frames were being misinterpreted by the ptrace code. This change special-cases these two exceptions by using Xen variants which run on the normal kernel stack properly. Impact: avoid crash or bad data when IST trap is invoked under Xen Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/traps.h | 3 +++ arch/x86/kernel/entry_64.S | 5 +++++ arch/x86/xen/enlighten.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..c44e5002f2f 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,6 +13,9 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); +asmlinkage void xen_debug(void); +asmlinkage void xen_int3(void); +asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e843..bb01ce080b8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1379,6 +1379,11 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment 
do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12a3159333b..7566e13c0ca 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -428,11 +430,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { + unsigned long addr; + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; - info->address = gate_offset(*val); + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else + WARN_ON(val->ist != 0); +#endif /* CONFIG_X86_64 */ + info->address = addr; + info->cs = gate_segment(*val); info->flags = val->dpl; /* interrupt gates clear IF */ From b80119bb35a49a4e8dbfb9708872adfd5cf38dee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:22:08 -0700 Subject: [PATCH 382/900] xen/x86-64: clean up warnings about IST-using traps Ignore known IST-using traps. Aside from the debugger traps, they're low-level faults which Xen will handle for us, so the kernel needn't worry about them. Keep warning in case unknown trap starts using IST. 
Impact: suppress spurious warnings Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7566e13c0ca..e9df942aa14 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -439,14 +439,32 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = gate_offset(*val); #ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault and + * machine_check, so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ if (addr == (unsigned long)debug) addr = (unsigned long)xen_debug; else if (addr == (unsigned long)int3) addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else - WARN_ON(val->ist != 0); + else if (addr == (unsigned long)double_fault || + addr == (unsigned long)nmi) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + return 0; +#endif + } else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } #endif /* CONFIG_X86_64 */ info->address = addr; From a789ed5fb6d0256c4177c2cc27e06520ddbe4d4c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:26:50 -0700 Subject: [PATCH 383/900] xen: cache cr0 value to avoid trap'n'emulate for read_cr0 stts() is implemented in terms of read_cr0/write_cr0 to update the state of the TS bit. This happens during context switch, and so is fairly performance critical. Rather than falling back to a trap-and-emulate native read_cr0, implement our own by caching the last-written value from write_cr0 (the TS bit is the only one we really care about). 
Impact: optimise Xen context switches Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e9df942aa14..0a1700a2be9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -658,10 +658,26 @@ static void xen_clts(void) xen_mc_issue(PARAVIRT_LAZY_CPU); } +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); + +static unsigned long xen_read_cr0(void) +{ + unsigned long cr0 = percpu_read(xen_cr0_value); + + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + percpu_write(xen_cr0_value, cr0); + } + + return cr0; +} + static void xen_write_cr0(unsigned long cr0) { struct multicall_space mcs; + percpu_write(xen_cr0_value, cr0); + /* Only pay attention to cr0.TS; everything else is ignored. */ mcs = xen_mc_entry(0); @@ -847,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .clts = xen_clts, - .read_cr0 = native_read_cr0, + .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, From 0b4eb462da10f832b28d518abffa4d77805928a0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 30 Apr 2009 17:59:36 -0700 Subject: [PATCH 384/900] x86, boot: align the .bss section in the decompressor Aligning the .bss section makes it trivial to use large operation sizes for moving the initialized sections and clearing the .bss. The alignment chosen (L1 cache) is somewhat arbitrary, but should be large enough to avoid all known performance traps and small enough to not cause troubles. [ Impact: trivial performance enhancement, future patch prep ] Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 0d26c92d3c7..dbe515e13fe 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -42,6 +42,7 @@ SECTIONS *(.data.*) _edata = . ; } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .bss : { _bss = . ; *(.bss) From d3dd3b5a29bb9582957451531fed461628dfc834 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 21:17:15 -0700 Subject: [PATCH 385/900] kbuild: allow compressors (gzip, bzip2, lzma) to take multiple inputs Allow the compression commands in Kbuild (i.e. gzip, bzip2, lzma) to take multiple input files and emit the concatenated compressed output. This avoids an intermediate step when a kernel image is built from multiple components, such as the relocatable x86-32 kernel. Sam Ravnborg integrated the bin_size script into the Makefile. [ Impact: new build feature, not yet used ] Signed-off-by: H. Peter Anvin Acked-by: Sam Ravnborg --- scripts/Makefile.lib | 26 ++++++++++++++++++++------ scripts/bin_size | 10 ---------- 2 files changed, 20 insertions(+), 16 deletions(-) delete mode 100644 scripts/bin_size diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 979619574f7..f8cf938dde9 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -183,20 +183,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@ # --------------------------------------------------------------------------- quiet_cmd_gzip = GZIP $@ -cmd_gzip = gzip -f -9 < $< > $@ +cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \ + (rm -f $@ ; false) # Bzip2 # --------------------------------------------------------------------------- -# Bzip2 does not include size in file... so we have to fake that -size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size +# Bzip2 and LZMA do not include size in file... 
so we have to fake that; +# append the size as a 32-bit littleendian number as gzip does. +size_append = echo -ne $(shell \ +dec_size=0; \ +for F in $1; do \ + fsize=$$(stat -c "%s" $$F); \ + dec_size=$$(expr $$dec_size + $$fsize); \ +done; \ +printf "%08x" $$dec_size | \ + sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g' \ +) -quiet_cmd_bzip2 = BZIP2 $@ -cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false) +quiet_cmd_bzip2 = BZIP2 $@ +cmd_bzip2 = (cat $(filter-out FORCE,$^) | \ + bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) # Lzma # --------------------------------------------------------------------------- quiet_cmd_lzma = LZMA $@ -cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false) +cmd_lzma = (cat $(filter-out FORCE,$^) | \ + lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) diff --git a/scripts/bin_size b/scripts/bin_size deleted file mode 100644 index 43e1b360cee..00000000000 --- a/scripts/bin_size +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -if [ $# = 0 ] ; then - echo Usage: $0 file -fi - -size_dec=`stat -c "%s" $1` -size_hex_echo_string=`printf "%08x" $size_dec | - sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'` -/bin/echo -ne $size_hex_echo_string From 845adf7266a7ba6970bf982ffd96abc60d2018ab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 21:20:51 -0700 Subject: [PATCH 386/900] x86: add a Kconfig symbol for when relocations are needed We only need to build relocations when we are building a 32-bit relocatable kernel. Rather than unnecessarily complicating the Makefiles, make an explicit Kbuild symbol for this. [ Impact: permits future cleanup ] Signed-off-by: H. 
Peter Anvin Cc: Sam Ravnborg --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 039c3f04aac..5aee45356b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1513,6 +1513,11 @@ config RELOCATABLE it has been loaded at and the compile time physical address (CONFIG_PHYSICAL_START) is ignored. +# Relocation on x86-32 needs some additional build support +config X86_NEED_RELOCS + def_bool y + depends on X86_32 && RELOCATABLE + config PHYSICAL_ALIGN hex prompt "Alignment value to which kernel should be aligned" if X86_32 From 5f11e02019ef44f041e6e38a1363fa2fd4b8785d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 22:53:11 -0700 Subject: [PATCH 387/900] x86, boot: simplify arch/x86/boot/compressed/Makefile Simplify the arch/x86/boot/compressed/Makefile, by using the new capability of specifying multiple inputs to a compressor, and the CONFIG_X86_NEED_RELOCS Kconfig symbol. Signed-off-by: H. Peter Anvin Acked-by: Sam Ravnborg --- arch/x86/boot/compressed/Makefile | 39 +++++-------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0f4b5e2abd3..b35c3bb7090 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -29,7 +29,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_32) += relocs +hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -37,46 +37,19 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs -quiet_cmd_relocbin = BUILD $@ - cmd_relocbin = cat $(filter-out FORCE,$^) > $@ -$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE - $(call if_changed,relocbin) 
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs -ifeq ($(CONFIG_X86_32),y) - -ifdef CONFIG_RELOCATABLE -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -else -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma) -endif -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T - -else - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T -endif suffix_$(CONFIG_KERNEL_GZIP) = gz suffix_$(CONFIG_KERNEL_BZIP2) = bz2 suffix_$(CONFIG_KERNEL_LZMA) = lzma +LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE $(call if_changed,ld) From 283ab1c0bd462dd0b179393fb081a626f6687413 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 15:32:47 -0700 Subject: [PATCH 388/900] x86, boot: follow standard Kbuild style for compression suffix When generating the compression suffix in arch/x86/boot/compressed/Makefile, follow standard Kbuild conventions, that is: - Use a dash not underscore before y/m/n endings - Use := whenever possible. Requested-by: Sam Ravnborg Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index b35c3bb7090..7f24fdb584e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -46,10 +46,10 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -suffix_$(CONFIG_KERNEL_GZIP) = gz -suffix_$(CONFIG_KERNEL_BZIP2) = bz2 -suffix_$(CONFIG_KERNEL_LZMA) = lzma +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZMA) := lzma LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE $(call if_changed,ld) From bd2a36984c50bb546a7d04cb395fddcf98a1092c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 23:24:50 -0700 Subject: [PATCH 389/900] x86, boot: use BP_scratch in arch/x86/boot/compressed/head_*.S Use the BP_scratch symbol from asm-offsets.h instead of hard-coding the location. [ Impact: cleanup ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 2 +- arch/x86/boot/compressed/head_64.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 85bd3285706..e3398f3d1b3 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -53,7 +53,7 @@ ENTRY(startup_32) * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. 
*/ - leal (0x1e4+4)(%esi), %esp + leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index ed4a8294800..06cc7e59352 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -56,7 +56,7 @@ ENTRY(startup_32) * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp + leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp From 5f64ec64e7f9b246c0a94f34cdf7782f98a6e55d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 15:45:17 -0700 Subject: [PATCH 390/900] x86, boot: stylistic cleanups for boot/compressed/head_32.S Reformat arch/x86/boot/compressed/head_32.S to be closer to currently preferred kernel assembly style, that is: - opcode and operand separated by tab - operands separated by ", " - C-style comments This also makes it more similar to head_64.S. [ Impact: cleanup, no object code change ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 170 +++++++++++++++-------------- 1 file changed, 89 insertions(+), 81 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index e3398f3d1b3..7bd7766ffab 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -12,16 +12,16 @@ * the page directory. [According to comments etc elsewhere on a compressed * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] * - * Page 0 is deliberately kept safe, since System Management Mode code in + * Page 0 is deliberately kept safe, since System Management Mode code in * laptops may need to access the BIOS data stored there. This is also - * useful for future device drivers that either access the BIOS via VM86 + * useful for future device drivers that either access the BIOS via VM86 * mode. 
*/ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.text + .text #include #include @@ -29,75 +29,80 @@ #include #include -.section ".text.head","ax",@progbits + .section ".text.head","ax",@progbits ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) - jnz 1f + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ + testb $(1<<6), BP_loadflags(%esi) + jnz 1f cli - movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - movl %eax,%ss + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (BP_scratch+4)(%esi), %esp - call 1f -1: popl %ebp - subl $1b, %ebp + leal (BP_scratch+4)(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp -/* %ebp contains the address we are loaded at by the boot loader and %ebx +/* + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. 
*/ #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + movl %ebp, %ebx addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx #else - movl $LOAD_PHYSICAL_ADDR, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx + shrl $12, %eax + addl %eax, %ebx /* Add 32K + 18 bytes of extra slack */ - addl $(32768 + 18), %ebx + addl $(32768 + 18), %ebx /* Align on a 4K boundary */ - addl $4095, %ebx - andl $~4095, %ebx + addl $4095, %ebx + andl $~4095, %ebx -/* Copy the compressed kernel to the end of our buffer +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - pushl %esi - leal _ebss(%ebp), %esi - leal _ebss(%ebx), %edi - movl $(_ebss - startup_32), %ecx + pushl %esi + leal _ebss(%ebp), %esi + leal _ebss(%ebx), %edi + movl $(_ebss - startup_32), %ecx std - rep - movsb + rep movsb cld - popl %esi + popl %esi -/* Compute the kernel start address. +/* + * Compute the kernel start address. */ #ifdef CONFIG_RELOCATABLE addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp @@ -109,81 +114,84 @@ ENTRY(startup_32) /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax - jmp *%eax + leal relocated(%ebx), %eax + jmp *%eax ENDPROC(startup_32) -.section ".text" + .text relocated: /* * Clear BSS */ - xorl %eax,%eax - leal _edata(%ebx),%edi - leal _ebss(%ebx), %ecx - subl %edi,%ecx + xorl %eax, %eax + leal _edata(%ebx), %edi + leal _ebss(%ebx), %ecx + subl %edi, %ecx cld - rep - stosb + rep stosb /* * Setup the stack for the decompressor */ - leal boot_stack_end(%ebx), %esp + leal boot_stack_end(%ebx), %esp /* * Do the decompression, and jump to the new kernel.. 
*/ - movl output_len(%ebx), %eax - pushl %eax - # push arguments for decompress_kernel: - pushl %ebp # output address - movl input_len(%ebx), %eax - pushl %eax # input_len - leal input_data(%ebx), %eax - pushl %eax # input_data - leal boot_heap(%ebx), %eax - pushl %eax # heap area - pushl %esi # real mode pointer - call decompress_kernel - addl $20, %esp - popl %ecx + movl output_len(%ebx), %eax + pushl %eax + /* push arguments for decompress_kernel: */ + pushl %ebp /* output address */ + movl input_len(%ebx), %eax + pushl %eax /* input_len */ + leal input_data(%ebx), %eax + pushl %eax /* input_data */ + leal boot_heap(%ebx), %eax + pushl %eax /* heap area */ + pushl %esi /* real mode pointer */ + call decompress_kernel + addl $20, %esp + popl %ecx #if CONFIG_RELOCATABLE -/* Find the address of the relocations. +/* + * Find the address of the relocations. */ - movl %ebp, %edi - addl %ecx, %edi + movl %ebp, %edi + addl %ecx, %edi -/* Calculate the delta between where vmlinux was compiled to run +/* + * Calculate the delta between where vmlinux was compiled to run * and where it was actually loaded. */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ + movl %ebp, %ebx + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ -1: subl $4, %edi - movl 0(%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b +1: subl $4, %edi + movl (%edi), %ecx + testl %ecx, %ecx + jz 2f + addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) + jmp 1b 2: #endif /* * Jump to the decompressed kernel. */ - xorl %ebx,%ebx - jmp *%ebp + xorl %ebx, %ebx + jmp *%ebp -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: From b40d68d5b5b799caaf99d2e073e62962e6d917ce Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 8 May 2009 15:59:13 -0700 Subject: [PATCH 391/900] x86, boot: stylistic cleanups for boot/compressed/head_64.S Clean up style issues in arch/x86/boot/compressed/head_64.S. This file had a lot fewer style issues than its 32-bit cousin, but the ones it has are worth fixing, especially since it makes the two files more similar. [ Impact: cleanup, no object code change ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_64.S | 52 ++++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 06cc7e59352..26c3def43ac 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -21,8 +21,8 @@ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.code32 -.text + .code32 + .text #include #include @@ -33,12 +33,14 @@ #include #include -.section ".text.head" + .section ".text.head" .code32 ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ testb $(1<<6), BP_loadflags(%esi) jnz 1f @@ -49,7 +51,8 @@ ENTRY(startup_32) movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode @@ -70,10 +73,11 @@ ENTRY(startup_32) testl %eax, %eax jnz no_longmode -/* Compute the delta between where we were compiled to run at +/* + * Compute the delta between where we were compiled to run at * and where the code will actually run at. 
- */ -/* %ebp contains the address we are loaded at by the boot loader and %ebx + * + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ @@ -114,7 +118,7 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ - /* Initialize Page tables to 0*/ + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax movl $((4096*6)/4), %ecx @@ -155,7 +159,8 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr - /* Setup for the jump to 64bit mode + /* + * Setup for the jump to 64bit mode * * When the jump is performend we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 @@ -184,7 +189,8 @@ no_longmode: #include "../../kernel/verify_cpu_64.S" - /* Be careful here startup_64 needs to be at a predictable + /* + * Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders * should look at the ELF header to find this address, as * it may change in the future. @@ -192,7 +198,8 @@ no_longmode: .code64 .org 0x200 ENTRY(startup_64) - /* We come here either from startup_32 or directly from a + /* + * We come here either from startup_32 or directly from a * 64bit bootloader. If we come here from a bootloader we depend on * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. @@ -209,7 +216,8 @@ ENTRY(startup_64) movl $0x20, %eax ltr %ax - /* Compute the decompressed kernel start address. It is where + /* + * Compute the decompressed kernel start address. It is where * we were loaded at aligned to a 2M boundary. %rbp contains the * decompressed kernel start address. 
* @@ -241,7 +249,8 @@ ENTRY(startup_64) addq $(32768 + 18 + 4095), %rbx andq $~4095, %rbx -/* Copy the compressed kernel to the end of our buffer +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ leaq _end_before_pgt(%rip), %r8 @@ -260,7 +269,7 @@ ENTRY(startup_64) leaq relocated(%rbx), %rax jmp *%rax -.section ".text" + .text relocated: /* @@ -271,8 +280,7 @@ relocated: leaq _end_before_pgt(%rbx), %rcx subq %rdi, %rcx cld - rep - stosb + rep stosb /* Setup the stack */ leaq boot_stack_end(%rip), %rsp @@ -311,9 +319,11 @@ gdt: .quad 0x0000000000000000 /* TS continued */ gdt_end: -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: From 5b11f1cee5797b38d16b94d8745b12b6727a8373 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 16:20:34 -0700 Subject: [PATCH 392/900] x86, boot: straighten out ranges to copy/zero in compressed/head*.S Both on 32 and 64 bits, we copy all the way up to the end of bss, except that on 64 bits there is a hack to avoid copying on top of the page tables. There is no point in copying bss at all, especially since we are just about to zero it all anyway. To clean up and unify the handling, we now do: - copy from startup_32 to _bss. - zero from _bss to _ebss. - the _ebss symbol is aligned to an 8-byte boundary. - the page tables are moved to a separate section. Use _bss as the copy endpoint since _edata may be misaligned. [ Impact: cleanup, trivial performance improvement ] Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/head_32.S | 8 ++++---- arch/x86/boot/compressed/head_64.S | 18 +++++++++++++----- arch/x86/boot/compressed/vmlinux.lds.S | 19 ++++++++++++------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 7bd7766ffab..59425e157df 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -93,9 +93,9 @@ ENTRY(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal _ebss(%ebp), %esi - leal _ebss(%ebx), %edi - movl $(_ebss - startup_32), %ecx + leal _bss(%ebp), %esi + leal _bss(%ebx), %edi + movl $(_bss - startup_32), %ecx std rep movsb cld @@ -125,7 +125,7 @@ relocated: * Clear BSS */ xorl %eax, %eax - leal _edata(%ebx), %edi + leal _bss(%ebx), %edi leal _ebss(%ebx), %ecx subl %edi, %ecx cld diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 26c3def43ac..5bc9052615b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -253,9 +253,9 @@ ENTRY(startup_64) * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. 
*/ - leaq _end_before_pgt(%rip), %r8 - leaq _end_before_pgt(%rbx), %r9 - movq $_end_before_pgt /* - $startup_32 */, %rcx + leaq _bss(%rip), %r8 + leaq _bss(%rbx), %r9 + movq $_bss /* - $startup_32 */, %rcx 1: subq $8, %r8 subq $8, %r9 movq 0(%r8), %rax @@ -276,8 +276,8 @@ relocated: * Clear BSS */ xorq %rax, %rax - leaq _edata(%rbx), %rdi - leaq _end_before_pgt(%rbx), %rcx + leaq _bss(%rbx), %rdi + leaq _ebss(%rbx), %rcx subq %rdi, %rcx cld rep stosb @@ -329,3 +329,11 @@ boot_heap: boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: + +/* + * Space for page tables (not in .bss so not zeroed) + */ + .section ".pgtable","a",@nobits + .balign 4096 +pgtable: + .fill 6*4096, 1, 0 diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index dbe515e13fe..cc353e1b3ff 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -2,6 +2,8 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #undef i386 +#include + #ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) @@ -48,13 +50,16 @@ SECTIONS *(.bss) *(.bss.*) *(COMMON) -#ifdef CONFIG_X86_64 - . = ALIGN(8); - _end_before_pgt = . ; - . = ALIGN(4096); - pgtable = . ; - . = . + 4096 * 6; -#endif + . = ALIGN(8); /* For convenience during zeroing */ _ebss = .; } +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + .pgtable : { + _pgtable = . ; + *(.pgtable) + _epgtable = . ; + } +#endif + _end = .; } From 0a137736704ef9af719409933b3c33e138461786 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 16:27:41 -0700 Subject: [PATCH 393/900] x86, boot: set up the decompression stack as early as possible Set up the decompression stack as soon as we know where it needs to go. That way we have a full-service stack as soon as possible, rather than relying on the BP_scratch field. 
Note that the stack does need to be empty during bss zeroing (or else the stack needs to be moved out of the bss segment, which is also an option.) [ Impact: cleanup, minor paranoia ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 10 ++++------ arch/x86/boot/compressed/head_64.S | 16 ++++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 59425e157df..d7245cf8026 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -88,6 +88,9 @@ ENTRY(startup_32) addl $4095, %ebx andl $~4095, %ebx + /* Set up the stack */ + leal boot_stack_end(%ebx), %esp + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. @@ -122,7 +125,7 @@ ENDPROC(startup_32) relocated: /* - * Clear BSS + * Clear BSS (stack is currently empty) */ xorl %eax, %eax leal _bss(%ebx), %edi @@ -131,11 +134,6 @@ relocated: cld rep stosb -/* - * Setup the stack for the decompressor - */ - leal boot_stack_end(%ebx), %esp - /* * Do the decompression, and jump to the new kernel.. */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 5bc9052615b..a0b18426069 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -249,6 +249,13 @@ ENTRY(startup_64) addq $(32768 + 18 + 4095), %rbx andq $~4095, %rbx + /* Set up the stack */ + leaq boot_stack_end(%rbx), %rsp + + /* Zero EFLAGS */ + pushq $0 + popfq + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. 
@@ -273,7 +280,7 @@ ENTRY(startup_64) relocated: /* - * Clear BSS + * Clear BSS (stack is currently empty) */ xorq %rax, %rax leaq _bss(%rbx), %rdi @@ -282,13 +289,6 @@ relocated: cld rep stosb - /* Setup the stack */ - leaq boot_stack_end(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - /* * Do the decompression, and jump to the new kernel.. */ From 97541912785369925723b6255438ad9fce2ddf04 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 6 May 2009 17:56:51 -0700 Subject: [PATCH 394/900] x86, boot: zero EFLAGS on 32 bits The 64-bit code already clears EFLAGS as soon as it has a stack. This seems like a reasonable precaution, so do it on 32 bits as well. [ Impact: extra paranoia ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d7245cf8026..d02a4f02be1 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -91,6 +91,10 @@ ENTRY(startup_32) /* Set up the stack */ leal boot_stack_end(%ebx), %esp + /* Zero EFLAGS */ + pushl $0 + popfl + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. From 36d3793c947f1ef7ba3d24eeeddc1be41adc5ab4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 16:45:15 -0700 Subject: [PATCH 395/900] x86, boot: use appropriate rep string for move and clear In the pre-decompression code, use the appropriate largest possible rep movs and rep stos to move code and clear bss, respectively. For reverse copy, do note that the initial values are supposed to be the address of the first (highest) copy datum, not one byte beyond the end of the buffer. 
rep strings are not necessarily the fastest way to perform these operations on all current processors, but are likely to be in the future, and perhaps more importantly, we want to encourage the architecturally right thing to do here. This also fixes a couple of trivial inefficiencies on 64 bits. [ Impact: trivial performance enhancement, increase code similarity ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 11 ++++++----- arch/x86/boot/compressed/head_64.S | 26 +++++++++++++------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d02a4f02be1..6710dc78ac5 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -100,11 +100,12 @@ ENTRY(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal _bss(%ebp), %esi - leal _bss(%ebx), %edi + leal (_bss-4)(%ebp), %esi + leal (_bss-4)(%ebx), %edi movl $(_bss - startup_32), %ecx + shrl $2, %ecx std - rep movsb + rep movsl cld popl %esi @@ -135,8 +136,8 @@ relocated: leal _bss(%ebx), %edi leal _ebss(%ebx), %ecx subl %edi, %ecx - cld - rep stosb + shrl $2, %ecx + rep stosl /* * Do the decompression, and jump to the new kernel.. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index a0b18426069..723c72dfd7b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -260,15 +260,15 @@ ENTRY(startup_64) * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - leaq _bss(%rip), %r8 - leaq _bss(%rbx), %r9 + pushq %rsi + leaq (_bss-8)(%rip), %rsi + leaq (_bss-8)(%rbx), %rdi movq $_bss /* - $startup_32 */, %rcx -1: subq $8, %r8 - subq $8, %r9 - movq 0(%r8), %rax - movq %rax, 0(%r9) - subq $8, %rcx - jnz 1b + shrq $3, %rcx + std + rep movsq + cld + popq %rsi /* * Jump to the relocated address. 
@@ -282,12 +282,12 @@ relocated: /* * Clear BSS (stack is currently empty) */ - xorq %rax, %rax - leaq _bss(%rbx), %rdi - leaq _ebss(%rbx), %rcx + xorl %eax, %eax + leaq _bss(%rip), %rdi + leaq _ebss(%rip), %rcx subq %rdi, %rcx - cld - rep stosb + shrq $3, %rcx + rep stosq /* * Do the decompression, and jump to the new kernel.. From 02a884c0fe7ec8459d00d34b7d4101af21fc4a86 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 17:42:16 -0700 Subject: [PATCH 396/900] x86, boot: determine compressed code offset at compile time Determine the compressed code offset (from the kernel runtime address) at compile time. This allows some minor optimizations in arch/x86/boot/compressed/head_*.S, but more importantly it makes this value available to the build process, which will enable a future patch to export the necessary linear memory footprint into the bzImage header. [ Impact: cleanup, future patch enabling ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 11 +++- arch/x86/boot/compressed/head_32.S | 24 ++----- arch/x86/boot/compressed/head_64.S | 41 ++++-------- arch/x86/boot/compressed/mkpiggy.c | 97 ++++++++++++++++++++++++++++ arch/x86/boot/compressed/vmlinux.scr | 10 --- 5 files changed, 123 insertions(+), 60 deletions(-) create mode 100644 arch/x86/boot/compressed/mkpiggy.c delete mode 100644 arch/x86/boot/compressed/vmlinux.scr diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 7f24fdb584e..49c8a4c37d7 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,6 +19,8 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T +hostprogs-y := mkpiggy + $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -50,6 +52,9 @@ suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma -LDFLAGS_piggy.o := 
-r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE - $(call if_changed,ld) +quiet_cmd_mkpiggy = MKPIGGY $@ + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) + +targets += piggy.S +$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE + $(call if_changed,mkpiggy) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 6710dc78ac5..470474bafc4 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -75,18 +75,8 @@ ENTRY(startup_32) movl $LOAD_PHYSICAL_ADDR, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack */ - addl $(32768 + 18), %ebx - /* Align on a 4K boundary */ - addl $4095, %ebx - andl $~4095, %ebx + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -142,12 +132,10 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - movl output_len(%ebx), %eax - pushl %eax + leal z_extract_offset_negative(%ebx), %ebp /* push arguments for decompress_kernel: */ pushl %ebp /* output address */ - movl input_len(%ebx), %eax - pushl %eax /* input_len */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ leal boot_heap(%ebx), %eax @@ -155,14 +143,12 @@ relocated: pushl %esi /* real mode pointer */ call decompress_kernel addl $20, %esp - popl %ecx #if CONFIG_RELOCATABLE /* * Find the address of the relocations. 
*/ - movl %ebp, %edi - addl %ecx, %edi + leal z_output_len(%ebp), %edi /* * Calculate the delta between where vmlinux was compiled to run diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 723c72dfd7b..2b9f2510507 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -90,16 +90,8 @@ ENTRY(startup_32) movl $CONFIG_PHYSICAL_START, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addl $(32768 + 18 + 4095), %ebx - andl $~4095, %ebx + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx /* * Prepare for entering 64 bit mode @@ -224,6 +216,9 @@ ENTRY(startup_64) * If it is a relocatable kernel then decompress and run the kernel * from load address aligned to 2MB addr, otherwise decompress and * run the kernel from CONFIG_PHYSICAL_START + * + * We cannot rely on the calculation done in 32-bit mode, since we + * may have been invoked via the 64-bit entry point. */ /* Start with the delta to where the kernel will run at. */ @@ -237,17 +232,8 @@ ENTRY(startup_64) movq %rbp, %rbx #endif - /* Replace the compressed data size with the uncompressed size */ - movl input_len(%rip), %eax - subq %rax, %rbx - movl output_len(%rip), %eax - addq %rax, %rbx - /* Add 8 bytes for every 32K input block */ - shrq $12, %rax - addq %rax, %rbx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addq $(32768 + 18 + 4095), %rbx - andq $~4095, %rbx + /* Target address to relocate to for decompression */ + leaq z_extract_offset(%rbp), %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp @@ -292,13 +278,12 @@ relocated: /* * Do the decompression, and jump to the new kernel.. 
*/ - pushq %rsi # Save the real mode argument - movq %rsi, %rdi # real mode address - leaq boot_heap(%rip), %rsi # malloc area for uncompression - leaq input_data(%rip), %rdx # input_data - movl input_len(%rip), %eax - movq %rax, %rcx # input_len - movq %rbp, %r8 # output + pushq %rsi /* Save the real mode argument */ + movq %rsi, %rdi /* real mode address */ + leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ + leaq input_data(%rip), %rdx /* input_data */ + movl $z_input_len, %ecx /* input_len */ + movq %rbp, %r8 /* output target address */ call decompress_kernel popq %rsi diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c new file mode 100644 index 00000000000..bcbd36c4143 --- /dev/null +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -0,0 +1,97 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright (C) 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * H. Peter Anvin + * + * ----------------------------------------------------------------------- */ + +/* + * Compute the desired load offset from a compressed program; outputs + * a small assembly wrapper with the appropriate symbols defined. 
+ */ + +#include +#include +#include +#include + +static uint32_t getle32(const void *p) +{ + const uint8_t *cp = p; + + return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) + + ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24); +} + +int main(int argc, char *argv[]) +{ + uint32_t olen; + long ilen; + unsigned long offs; + FILE *f; + + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + return 1; + } + + /* Get the information for the compressed kernel image first */ + + f = fopen(argv[1], "r"); + if (!f) { + perror(argv[1]); + return 1; + } + + + if (fseek(f, -4L, SEEK_END)) { + perror(argv[1]); + } + fread(&olen, sizeof olen, 1, f); + ilen = ftell(f); + olen = getle32(&olen); + fclose(f); + + /* + * Now we have the input (compressed) and output (uncompressed) + * sizes, compute the necessary decompression offset... + */ + + offs = (olen > ilen) ? olen - ilen : 0; + offs += olen >> 12; /* Add 8 bytes for each 32K block */ + offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ + offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + + printf(".section \".rodata.compressed\",\"a\",@progbits\n"); + printf(".globl z_input_len\n"); + printf("z_input_len = %lu\n", ilen); + printf(".globl z_output_len\n"); + printf("z_output_len = %lu\n", (unsigned long)olen); + printf(".globl z_extract_offset\n"); + printf("z_extract_offset = 0x%lx\n", offs); + /* z_extract_offset_negative allows simplification of head_32.S */ + printf(".globl z_extract_offset_negative\n"); + printf("z_extract_offset_negative = -0x%lx\n", offs); + + printf(".globl input_data, input_data_end\n"); + printf("input_data:\n"); + printf(".incbin \"%s\"\n", argv[1]); + printf("input_data_end:\n"); + + return 0; +} diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr deleted file mode 100644 index f02382ae5c4..00000000000 --- a/arch/x86/boot/compressed/vmlinux.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .rodata.compressed : { - input_len = .; - 
LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} From 778dedae0cb76a441145f3a0c5d59fcb3ba296d5 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 9 May 2009 12:54:34 +0800 Subject: [PATCH 397/900] x86: mce: remove duplicated #include Remove duplicated #include in arch/x86/kernel/cpu/mcheck/mce_intel_64.c. [ Impact: cleanup ] Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index d6b72df89d6..aedf9766ee9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -15,7 +15,6 @@ #include #include #include -#include asmlinkage void smp_thermal_interrupt(void) { From b30505c81a9d4adea8b70ecff512b0216929b797 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 7 May 2009 15:40:14 -0700 Subject: [PATCH 398/900] futex: add requeue-pi documentation Add Documentation/futex-requeue-pi.txt describing the motivation for the newly added FUTEX_*REQUEUE_PI op codes and their implementation. 
[ Impact: add documentation ] Signed-off-by: Darren Hart Cc: Sripathi Kodi Cc: Peter Zijlstra Cc: John Stultz Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: Ulrich Drepper Cc: Eric Dumazet Cc: Jakub Jelinek LKML-Reference: <4A03634E.3080609@us.ibm.com> [ reformatted the file ] Signed-off-by: Ingo Molnar --- Documentation/futex-requeue-pi.txt | 131 +++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 Documentation/futex-requeue-pi.txt diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt new file mode 100644 index 00000000000..9dc1ff4fd53 --- /dev/null +++ b/Documentation/futex-requeue-pi.txt @@ -0,0 +1,131 @@ +Futex Requeue PI +---------------- + +Requeueing of tasks from a non-PI futex to a PI futex requires +special handling in order to ensure the underlying rt_mutex is never +left without an owner if it has waiters; doing so would break the PI +boosting logic [see rt-mutex-design.txt]. For the purposes of +brevity, this action will be referred to as "requeue_pi" throughout +this document. Priority inheritance is abbreviated throughout as +"PI". + +Motivation +---------- + +Without requeue_pi, the glibc implementation of +pthread_cond_broadcast() must resort to waking all the tasks waiting +on a pthread_condvar and letting them try to sort out which task +gets to run first in classic thundering-herd formation. An ideal +implementation would wake the highest-priority waiter, and leave the +rest to the natural wakeup inherent in unlocking the mutex +associated with the condvar. + +Consider the simplified glibc calls: + +/* caller must lock mutex */ +pthread_cond_wait(cond, mutex) +{ + lock(cond->__data.__lock); + unlock(mutex); + do { + unlock(cond->__data.__lock); + futex_wait(cond->__data.__futex); + lock(cond->__data.__lock); + } while(...) 
+ unlock(cond->__data.__lock); + lock(mutex); +} + +pthread_cond_broadcast(cond) +{ + lock(cond->__data.__lock); + unlock(cond->__data.__lock); + futex_requeue(cond->__data.__futex, cond->mutex); +} + +Once pthread_cond_broadcast() requeues the tasks, the cond->mutex +has waiters. Note that pthread_cond_wait() attempts to lock the +mutex only after it has returned to user space. This will leave the +underlying rt_mutex with waiters, and no owner, breaking the +previously mentioned PI-boosting algorithms. + +In order to support PI-aware pthread_condvar's, the kernel needs to +be able to requeue tasks to PI futexes. This support implies that +upon a successful futex_wait system call, the caller would return to +user space already holding the PI futex. The glibc implementation +would be modified as follows: + + +/* caller must lock mutex */ +pthread_cond_wait_pi(cond, mutex) +{ + lock(cond->__data.__lock); + unlock(mutex); + do { + unlock(cond->__data.__lock); + futex_wait_requeue_pi(cond->__data.__futex); + lock(cond->__data.__lock); + } while(...) + unlock(cond->__data.__lock); + /* the kernel acquired the mutex for us */ +} + +pthread_cond_broadcast_pi(cond) +{ + lock(cond->__data.__lock); + unlock(cond->__data.__lock); + futex_requeue_pi(cond->__data.__futex, cond->mutex); +} + +The actual glibc implementation will likely test for PI and make the +necessary changes inside the existing calls rather than creating new +calls for the PI cases. Similar changes are needed for +pthread_cond_timedwait() and pthread_cond_signal(). + +Implementation +-------------- + +In order to ensure the rt_mutex has an owner if it has waiters, it +is necessary for both the requeue code, as well as the waiting code, +to be able to acquire the rt_mutex before returning to user space. 
+The requeue code cannot simply wake the waiter and leave it to +acquire the rt_mutex as it would open a race window between the +requeue call returning to user space and the waiter waking and +starting to run. This is especially true in the uncontended case. + +The solution involves two new rt_mutex helper routines, +rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which +allow the requeue code to acquire an uncontended rt_mutex on behalf +of the waiter and to enqueue the waiter on a contended rt_mutex. +Two new system calls provide the kernel<->user interface to +requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI. + +FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait() +and pthread_cond_timedwait()) to block on the initial futex and wait +to be requeued to a PI-aware futex. The implementation is the +result of a high-speed collision between futex_wait() and +futex_lock_pi(), with some extra logic to check for the additional +wake-up scenarios. + +FUTEX_REQUEUE_CMP_PI is called by the waker +(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and +possibly wake the waiting tasks. Internally, this system call is +still handled by futex_requeue (by passing requeue_pi=1). Before +requeueing, futex_requeue() attempts to acquire the requeue target +PI futex on behalf of the top waiter. If it can, this waiter is +woken. futex_requeue() then proceeds to requeue the remaining +nr_wake+nr_requeue tasks to the PI futex, calling +rt_mutex_start_proxy_lock() prior to each requeue to prepare the +task as a waiter on the underlying rt_mutex. It is possible that +the lock can be acquired at this stage as well, if so, the next +waiter is woken to finish the acquisition of the lock. + +FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but +their sum is all that really matters. futex_requeue() will wake or +requeue up to nr_wake + nr_requeue tasks. 
It will wake only as many +tasks as it can acquire the lock for, which in the majority of cases +should be 0 as good programming practice dictates that the caller of +either pthread_cond_broadcast() or pthread_cond_signal() acquire the +mutex prior to making the call. FUTEX_REQUEUE_PI requires that +nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for +signal. From 45fbe3ee01b8e463b28c2751b5dcc0cbdc142d90 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 May 2009 08:06:44 -0700 Subject: [PATCH 399/900] x86, e820, pci: reserve extra free space near end of RAM The point is to take all RAM resources we have, and _after_ we've added all the resources we've seen in the E820 tree, we then _also_ try to add fake reserved entries for any "round up to X" at the end of the RAM resources. [ Impact: improve PCI mem-resource allocation robustness, protect "stolen RAM" ] Reported-by: Yannick Roehlly Acked-by: Jesse Barnes Signed-off-by: Yinghai Lu Cc: Ivan Kokshaysky Cc: Andrew Morton Cc: yannick.roehlly@free.fr LKML-Reference: <4A01A784.2050407@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 00628130292..a2335d9de05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1371,6 +1371,23 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? 
*/ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1399,24 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + resource_size_t start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)); + if (start == end) + continue; + reserve_region_with_split(&iomem_resource, start, + end - 1, "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) From 5d423ccd7ba4285f1084e91b26805e1d0ae978ed Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 08:07:52 -0700 Subject: [PATCH 400/900] x86/pci: remove rounding quirk from e820_setup_gap() Now that the e820 code explicitly reserves 'potentially dangerous' free physical memory address space to protect ACPI stolen RAM, there's no need for the rounding quirk in the PCI allocator anymore. Also, this quirk was open-ended iteration that could end up reserving a lot of free space and potentially breaking drivers - such as the one reported by Yannick Roehlly where there's a PCI device with a large memory resource. So remove it. 
[ Impact: make more of the PCI hole available for assigning pci devices ] Reported-by: Yannick Roehlly Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4A01A7C8.5090701@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a2335d9de05..7271fa33d79 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void) #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", From b9e0353fc85dab4ef5ebcef2bd09ebc4ce6d5a7b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:05:32 -0700 Subject: [PATCH 401/900] x86/acpi: remove irq-compression trick on 32-bit We already have a per cpu vector on 32-bit via recent changes, and don't need this trick any more (which trick obfuscates the real GSI mappings and which only triggers on larger systems to begin with): On 3 ioapic system (24 per ioapic) before patch I got: ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71 IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 64 Mode:1 Active:1) pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 64 ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67 IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> 
IRQ 65 Mode:1 Active:1) pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66 IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1) pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65 IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 67 Mode:1 Active:1) pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64 IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 68 Mode:1 Active:1) pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 after the patch we get: ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71 IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 71 Mode:1 Active:1) pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 71 ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67 IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 67 Mode:1 Active:1) pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66 IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1) pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65 IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 
65 Mode:1 Active:1) pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64 IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 64 Mode:1 Active:1) pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 As can be seen, GSIs now get mapped linearly. [ Impact: simplify irq number mapping on bigger 32-bit systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C35C.7060207@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 65 ++++--------------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6ee96b5530f..fb5e88262d2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1162,22 +1162,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 - - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. 
- */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; -#endif /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,66 +1183,28 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) gsi = ioapic_renumber_irq(ioapic, gsi); #endif - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); return gsi; } + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else return gsi; -#endif } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. 
- */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; } From ee214558c2e959781a406e76c5b34364da638e1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:07:07 -0700 Subject: [PATCH 402/900] x86: fix alloc_mptable() Fix the conditions when we stop updating the mptable due to running out of slots. [ Impact: fix memory corruption / non-working update_mptable boot parameter ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C3BB.1000609@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c1..cd2a41a7c45 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -870,24 +870,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - return 0; + return ret; } static 
int __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +939,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; From a31f82057ce6f7ced578d64c07a72ccbdc7336e4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:06:15 -0700 Subject: [PATCH 403/900] x86/acpi: call mp_config_acpi_gsi() in mp_register_gsi() The patch to call mp_config_acpi_gsi() from the ACPI IRQ registration code never got mainline because there were open discussions about it. This call is needed to properly update the kernel's copy of the mptable, when the update_mptable boot parameter is needed. Now that the dust has settled with the APIC unification, and since there were no objections when the patch was re-submitted, try this again. 
[ Impact: fix the update_mptable boot parameter ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C387.7090103@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec.h | 9 ----- arch/x86/kernel/acpi/boot.c | 66 +++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3ea1f531f53..c34961a45ec 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -77,17 +77,8 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, int active_high_low); extern int acpi_probe_gsi(void); #ifdef CONFIG_X86_IO_APIC -extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity); extern int mp_find_ioapic(int gsi); extern int mp_find_ioapic_pin(int ioapic, int gsi); -#else -static inline int -mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ - return 0; -} #endif #else /* !CONFIG_ACPI: */ static inline int acpi_probe_gsi(void) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb5e88262d2..8019ecf66e9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1158,6 +1159,44 @@ void __init mp_config_acpi_legacy_irqs(void) } } +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev) + return 0; + if (dev->bus != &pci_bus_type) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + 
mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); +#endif + return 0; +} + int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; @@ -1189,6 +1228,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) ioapic_pin); return gsi; } + mp_config_acpi_gsi(dev, gsi, triggering, polarity); /* * Avoid pin reprogramming. PRTs typically include entries @@ -1208,32 +1248,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) return gsi; } -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - int ioapic; - - if (!acpi_ioapic) - return 0; - - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 
1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - save_mp_irq(&mp_irq); -#endif - return 0; -} - /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error From bdfe8ac153546537ed24de69610ea781a734f785 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:07:41 -0700 Subject: [PATCH 404/900] x86/acpi: move pin_programmed bit map to io_apic.c Prepare to call setup_io_apic_routing() in pcibios_irq_enable() also remove not needed member apic_id. [ Impact: clean up, prepare for future change ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C3DD.3050104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 18 ++---------------- arch/x86/kernel/apic/io_apic.c | 25 ++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8019ecf66e9..dcfbc3ab9e4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -904,10 +904,8 @@ extern int es7000_plat; #endif static struct { - int apic_id; int gsi_base; int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; int mp_find_ioapic(int gsi) @@ -996,7 +994,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
*/ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); @@ -1189,7 +1186,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.dstapic = mp_ioapics[ioapic].apicid; mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); save_mp_irq(&mp_irq); @@ -1224,23 +1221,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + "%d-%d\n", mp_ioapics[ioapic].apicid, ioapic_pin); return gsi; } mp_config_acpi_gsi(dev, gsi, triggering, polarity); - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; - } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 21c30e1121e..e279ae33928 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3922,7 +3922,7 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, +static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, int triggering, int polarity) { struct irq_desc *desc; @@ -3959,6 +3959,29 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, return 0; } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, ioapic, pin, irq, + triggering, polarity); +} int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { From e20c06fd6950265a899edd96a02dc2e6ae2d1ce5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:08:22 -0700 Subject: [PATCH 405/900] x86/pci: add 4 more return parameters to IO_APIC_get_PCI_irq_vector() To prepare those params for pcibios_irq_enable() to call setup_io_apic_routing(). 
[ Impact: extend function call API to prepare for new functionality ] Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Len Brown Cc: Andrew Morton LKML-Reference: <4A01C406.2040303@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 4 +- arch/x86/kernel/apic/io_apic.c | 107 ++++++++++++++++-------------- arch/x86/pci/irq.c | 24 +++++-- drivers/pci/hotplug/ibmphp_core.c | 52 ++++++++------- 4 files changed, 110 insertions(+), 77 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd7..26a40ab7013 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -63,7 +63,9 @@ extern unsigned long io_apic_irqs; extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, + int *ioapic, int *ioapic_pin, + int *trigger, int *polarity); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e279ae33928..caf9dbdde05 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -873,54 +873,6 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -/* - * Find a specific PCI IRQ entry. 
- * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1139,6 +1091,65 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +/* + * Find a specific PCI IRQ entry. 
+ * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + int *ioapic, int *ioapic_pin, + int *trigger, int *polarity) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + *ioapic = apic; + *ioapic_pin = mp_irqs[i].dstirq; + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. 
+ */ + if (best_guess < 0) { + *ioapic = apic; + *ioapic_pin = mp_irqs[i].dstirq; + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + void lock_vector_lock(void) { /* Used to the online set of cpus does not change diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index fecbce6e7d7..a2f6bde9c4e 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1051,12 +1051,16 @@ static void __init pcibios_fixup_irqs(void) */ if (io_apic_assign_pci_irqs) { int irq; + int ioapic = -1, ioapic_pin = -1; + int triggering, polarity; /* * interrupt pins are numbered starting from 1 */ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin - 1); + PCI_SLOT(dev->devfn), pin - 1, + &ioapic, &ioapic_pin, + &triggering, &polarity); /* * Busses behind bridges are typically not listed in the * MP-table. In this case we have to look up the IRQ @@ -1072,7 +1076,10 @@ static void __init pcibios_fixup_irqs(void) pin = pci_swizzle_interrupt_pin(dev, pin); bus = bridge->bus->number; irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), pin - 1); + PCI_SLOT(bridge->devfn), + pin - 1, + &ioapic, &ioapic_pin, + &triggering, &polarity); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s INT %c to " @@ -1221,8 +1228,14 @@ static int pirq_enable_irq(struct pci_dev *dev) if (io_apic_assign_pci_irqs) { int irq; + int ioapic = -1, ioapic_pin = -1; + int triggering, polarity; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), + pin - 1, + &ioapic, &ioapic_pin, + &triggering, &polarity); /* * Busses behind bridges are typically not listed in the MP-table. 
* In this case we have to look up the IRQ based on the parent bus, @@ -1235,7 +1248,10 @@ static int pirq_enable_irq(struct pci_dev *dev) pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin - 1); + PCI_SLOT(bridge->devfn), + pin - 1, + &ioapic, &ioapic_pin, + &triggering, &polarity); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c index dd18f857dfb..ef53b05a411 100644 --- a/drivers/pci/hotplug/ibmphp_core.c +++ b/drivers/pci/hotplug/ibmphp_core.c @@ -153,45 +153,49 @@ int ibmphp_init_devno(struct slot **cur_slot) return -1; } for (loop = 0; loop < len; loop++) { - if ((*cur_slot)->number == rtable->slots[loop].slot) { - if ((*cur_slot)->bus == rtable->slots[loop].bus) { + if ((*cur_slot)->number == rtable->slots[loop].slot && + (*cur_slot)->bus == rtable->slots[loop].bus) { + int ioapic = -1, ioapic_pin = -1; + int triggering, polarity; + (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn); for (i = 0; i < 4; i++) (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus, - (int) (*cur_slot)->device, i); + (int) (*cur_slot)->device, i. 
+ &ioapic, &ioapic_pin, + &triggering, &polarity); - debug("(*cur_slot)->irq[0] = %x\n", - (*cur_slot)->irq[0]); - debug("(*cur_slot)->irq[1] = %x\n", - (*cur_slot)->irq[1]); - debug("(*cur_slot)->irq[2] = %x\n", - (*cur_slot)->irq[2]); - debug("(*cur_slot)->irq[3] = %x\n", - (*cur_slot)->irq[3]); + debug("(*cur_slot)->irq[0] = %x\n", + (*cur_slot)->irq[0]); + debug("(*cur_slot)->irq[1] = %x\n", + (*cur_slot)->irq[1]); + debug("(*cur_slot)->irq[2] = %x\n", + (*cur_slot)->irq[2]); + debug("(*cur_slot)->irq[3] = %x\n", + (*cur_slot)->irq[3]); - debug("rtable->exlusive_irqs = %x\n", + debug("rtable->exlusive_irqs = %x\n", rtable->exclusive_irqs); - debug("rtable->slots[loop].irq[0].bitmap = %x\n", + debug("rtable->slots[loop].irq[0].bitmap = %x\n", rtable->slots[loop].irq[0].bitmap); - debug("rtable->slots[loop].irq[1].bitmap = %x\n", + debug("rtable->slots[loop].irq[1].bitmap = %x\n", rtable->slots[loop].irq[1].bitmap); - debug("rtable->slots[loop].irq[2].bitmap = %x\n", + debug("rtable->slots[loop].irq[2].bitmap = %x\n", rtable->slots[loop].irq[2].bitmap); - debug("rtable->slots[loop].irq[3].bitmap = %x\n", + debug("rtable->slots[loop].irq[3].bitmap = %x\n", rtable->slots[loop].irq[3].bitmap); - debug("rtable->slots[loop].irq[0].link = %x\n", + debug("rtable->slots[loop].irq[0].link = %x\n", rtable->slots[loop].irq[0].link); - debug("rtable->slots[loop].irq[1].link = %x\n", + debug("rtable->slots[loop].irq[1].link = %x\n", rtable->slots[loop].irq[1].link); - debug("rtable->slots[loop].irq[2].link = %x\n", + debug("rtable->slots[loop].irq[2].link = %x\n", rtable->slots[loop].irq[2].link); - debug("rtable->slots[loop].irq[3].link = %x\n", + debug("rtable->slots[loop].irq[3].link = %x\n", rtable->slots[loop].irq[3].link); - debug("end of init_devno\n"); - kfree(rtable); - return 0; - } + debug("end of init_devno\n"); + kfree(rtable); + return 0; } } From 5ef2183768bb7d64b85eccbfa1537a61cbefa97c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:08:50 
-0700 Subject: [PATCH 406/900] x86/acpi: move setup io apic routing out of CONFIG_ACPI scope So we could set io apic routing when ACPI is not enabled. [ Impact: prepare for new functionality ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C422.5070400@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 4 +- arch/x86/kernel/apic/io_apic.c | 122 ++++++++++++++++----------------- 2 files changed, 63 insertions(+), 63 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 27bd2fdd00a..6fd99f96eb0 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -154,10 +154,10 @@ extern int timer_through_8259; extern int io_apic_get_unique_id(int ioapic, int apic_id); extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); -extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, - int irq, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ +extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, + int irq, int edge_level, int active_high_low); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index caf9dbdde05..3a68daee0d9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3839,6 +3839,67 @@ int __init arch_probe_nr_irqs(void) } #endif +static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + 
printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); + + return 0; +} + +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, ioapic, pin, irq, + triggering, polarity); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3933,67 +3994,6 @@ int __init io_apic_get_version(int ioapic) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int node; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(boot_cpu_id); - - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, 
pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) -{ - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, ioapic, pin, irq, - triggering, polarity); -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { int i; From b9c61b70075c87a8612624736faf4a2de5b1ed30 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:10:06 -0700 Subject: [PATCH 407/900] x86/pci: update pirq_enable_irq() to setup io apic routing So we can set io apic routing only when enabling the device irq. This is advantageous for IRQ descriptor allocation affinity: if we set up the IO-APIC entry later, we have a chance to allocate the IRQ descriptor later and know which device it is on and can set affinity accordingly. 
[ Impact: standardize/enhance irq-enabling sequence for mptable irqs ] Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Len Brown Cc: Andrew Morton LKML-Reference: <4A01C46E.8000501@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 162 ++++++++++++++++----------------- arch/x86/pci/irq.c | 84 ++++++----------- 2 files changed, 110 insertions(+), 136 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3a68daee0d9..5d5f4120c74 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1480,9 +1480,13 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1490,48 +1494,53 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } - - irq = pin_2_irq(idx, apic_id, pin); - - /* - * Skip the timer IRQ 
if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; - - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, apic_id, pin); - - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + continue; } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } + + irq = pin_2_irq(idx, apic_id, pin); + + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -3876,10 +3885,6 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in return 0; } -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, int triggering, int polarity) { @@ -4023,51 +4028,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < 
nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; - - } - - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); - - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } - +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; } +#endif + + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + desc = irq_to_desc(irq); + + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); + } + } #endif diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index a2f6bde9c4e..2f3e192615c 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 0; } + if (io_apic_assign_pci_irqs) + return 0; + /* Find IRQ routing entry */ if (!pirq_table) @@ -1039,63 +1042,15 @@ static void __init pcibios_fixup_irqs(void) pirq_penalty[dev->irq]++; } 
+ if (io_apic_assign_pci_irqs) + return; + dev = NULL; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -#ifdef CONFIG_X86_IO_APIC - /* - * Recalculate IRQ numbers if we use the I/O APIC. - */ - if (io_apic_assign_pci_irqs) { - int irq; - int ioapic = -1, ioapic_pin = -1; - int triggering, polarity; - - /* - * interrupt pins are numbered starting from 1 - */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin - 1, - &ioapic, &ioapic_pin, - &triggering, &polarity); - /* - * Busses behind bridges are typically not listed in the - * MP-table. In this case we have to look up the IRQ - * based on the parent bus, parent slot, and pin number. - * The SMP code detects such bridged busses itself so we - * should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { - /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; - int bus; - - pin = pci_swizzle_interrupt_pin(dev, pin); - bus = bridge->bus->number; - irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), - pin - 1, - &ioapic, &ioapic_pin, - &triggering, &polarity); - if (irq >= 0) - dev_warn(&dev->dev, - "using bridge %s INT %c to " - "get IRQ %d\n", - pci_name(bridge), - 'A' + pin - 1, irq); - } - if (irq >= 0) { - dev_info(&dev->dev, - "PCI->APIC IRQ transform: INT %c " - "-> IRQ %d\n", - 'A' + pin - 1, irq); - dev->irq = irq; - } - } -#endif /* * Still no IRQ? Try to lookup one... */ @@ -1190,6 +1145,19 @@ int __init pcibios_irq_init(void) pcibios_enable_irq = pirq_enable_irq; pcibios_fixup_irqs(); + + if (io_apic_assign_pci_irqs && pci_routeirq) { + struct pci_dev *dev = NULL; + /* + * PCI IRQ routing is set up by pci_enable_device(), but we + * also do it here in case there are still broken drivers that + * don't use pci_enable_device(). 
+ */ + printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + for_each_pci_dev(dev) + pirq_enable_irq(dev); + } + return 0; } @@ -1220,13 +1188,17 @@ void pcibios_penalize_isa_irq(int irq, int active) static int pirq_enable_irq(struct pci_dev *dev) { u8 pin; - struct pci_dev *temp_dev; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + if (pin && !pcibios_lookup_irq(dev, 1)) { char *msg = ""; + if (!io_apic_assign_pci_irqs && dev->irq) + return 0; + if (io_apic_assign_pci_irqs) { +#ifdef CONFIG_X86_IO_APIC + struct pci_dev *temp_dev; int irq; int ioapic = -1, ioapic_pin = -1; int triggering, polarity; @@ -1261,12 +1233,16 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { + io_apic_set_pci_routing(&dev->dev, ioapic, + ioapic_pin, irq, + triggering, polarity); + dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); - dev->irq = irq; return 0; } else msg = "; probably buggy MP table"; +#endif } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else From 61fe91e1319556f32bebfd7ed2c68ef02e2c17f7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: [PATCH 408/900] x86: apic: Check rev 3 fadt correctly for physical_apic bit Impact: fix fadt version checking FADT2_REVISION_ID has value 3 aka rev 3 FADT. So need to use >= instead of >, as other places in the code do. 
[ Impact: extend scope of APIC boot quirk ] Signed-off-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6..744e6d8af27 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * regardless of how many processors are present (x86_64 ES7000 * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; From 3e0c373749d7eb5b354ac0b043f2b2cdf84eefef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: [PATCH 409/900] x86: clean up and fix setup_clear/force_cpu_cap handling setup_force_cpu_cap() only has one user (Xen guest code), but it should not reuse cleared_cpu_caps, otherwise it will have problems on SMP. Need to have a separate cpu_caps_set array too, for forced-on flags, beyond the forced-off flags. Also need to setup handling before all cpus caps are combined. [ Impact: fix the forced-set CPU feature flag logic ] Cc: H.
Peter Anvin Cc: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 4 ++-- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 17 ++++++++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ccc1061b8b2..13cc6a503a0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -192,11 +192,11 @@ extern const char * const x86_power_flags[32]; #define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) #define setup_clear_cpu_cap(bit) do { \ clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cpu_caps_cleared); \ } while (0) #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ - clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c2cceae709c..fed93fec976 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct doublefault_tss; -extern __u32 cleared_cpu_caps[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c4f667896c2..e7fd5c4935a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -292,7 +292,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -806,6 +807,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -818,10 +829,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); From 80989ce0643c1034822f3e339ed8d790b649abe1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: [PATCH 410/900] x86: clean up and and print out initial max_pfn_mapped Do this so we can check the range that is mapped before init_memory_mapping(). To be able to print out meaningful info, we first have to fix 64-bit to have max_pfn_mapped assigned before that call. This also unifies the code-path a bit. 
[ Impact: print more debug info, cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49BF0978.40605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 ++++ arch/x86/mm/init.c | 7 +++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0d77e56e821..4031d6cb3ff 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -862,12 +862,16 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped< Date: Sat, 2 May 2009 10:40:57 -0700 Subject: [PATCH 411/900] x86: read apic ID in the !acpi_lapic case Ed found that on 32-bit, boot_cpu_physical_apicid is not read right, when the mptable is broken. Interestingly, actually three paths use/set it: 1. acpi: at that time that is already read from reg 2. mptable: only read from mptable 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit so we could read the apic id for the 2/3 path. We trust the hardware register more than we trust a BIOS data structure (the mptable). We can also avoid the double set_fixmap() when acpi_lapic is used, and also need to move cpu_has_apic earlier and call apic_disable(). Also when need to update the apic id, we'd better read and set the apic version as well - so that quirks are applied precisely. v2: make path 3 with 64bit, use -1 as apic id, so could read it later. 
v3: fix whitespace problem pointed out by Ed Swierk [ Impact: get correct apic id for bsp other than acpi path ] Reported-by: Ed Swierk Signed-off-by: Yinghai Lu Acked-by: Cyrill Gorcunov LKML-Reference: <49FC85A9.2070702@kernel.org> [ v4: sanity-check in the ACPI case too ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e258bedce7c..1ee966f4ae9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1456,7 +1456,6 @@ static int __init detect_init_APIC(void) } mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; return 0; } #else @@ -1570,6 +1569,8 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { + unsigned int new_apicid; + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; @@ -1586,21 +1587,31 @@ void __init init_apic_mappings(void) } else apic_phys = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + /* lets check if we may NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + return; + } + + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); - /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). 
*/ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - - /* lets check if we may to NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); } } From b37ab91907e9002925f4217e3bbd496aa12c2fa3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 May 2009 00:36:44 -0700 Subject: [PATCH 412/900] x86: Sanity check the e820 against the SRAT table using e820 map only nodes_cover_memory() sanity checks the SRAT table by ensuring that all PXMs cover the memory reported in the e820. However, when calculating the size of the holes in the e820, it uses the early_node_map[] which contains information taken from both SRAT and e820. If the SRAT is missing an entry, then it is not detected that the SRAT table is incorrect and missing entries. This patch uses the e820 map to calculate the holes instead of early_node_map[]. comment is from Mel. [ Impact: reject incorrect SRAT tables ] Signed-off-by: Yinghai Lu Acked-by: Mel Gorman Cc: Andrew Morton LKML-Reference: <4A03E10C.60906@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 01765955baa..c7a18aaccf8 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -345,7 +345,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow a bit of slack.
*/ if ((long)(e820ram - pxmram) >= 1*1024*1024) { printk(KERN_ERR From 0964b0562bb9c93194e852b47bab2397b9e11c18 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 May 2009 00:37:34 -0700 Subject: [PATCH 413/900] x86: Allow 1MB of slack between the e820 map and SRAT, not 4GB It is expected that there might be slight differences between the e820 map and the SRAT table and the intention was that 1MB of slack be allowed. The calculation comparing e820ram and pxmram assumes the units are bytes, when they are in fact pages. This means 4GB of slack is being allowed, not 1MB. This patch makes the correct comparison. comment is from Mel. [ Impact: don't accept buggy SRATs that could dump up to 4G of RAM ] Signed-off-by: Yinghai Lu Acked-by: Mel Gorman Cc: Andrew Morton LKML-Reference: <4A03E13E.6050107@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index c7a18aaccf8..87b45bff250 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -346,8 +346,8 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) } e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { printk(KERN_ERR "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", (pxmram << PAGE_SHIFT) >> 20, From 3551f88f6439cf4da3f5a3747b320280e30500de Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 7 May 2009 15:35:41 +0300 Subject: [PATCH 414/900] x86: unify 64-bit UMA and NUMA paging_init() 64-bit UMA and NUMA versions of paging_init() are almost identical. Therefore, merge the copy in mm/numa_64.c to mm/init_64.c to remove duplicate code.
[ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1241699741.17846.30.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++++- arch/x86/mm/numa_64.c | 15 --------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6a1a573e20f..be7e1279178 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -585,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) early_res_to_bootmem(0, end_pfn< Date: Thu, 7 May 2009 15:35:42 +0300 Subject: [PATCH 415/900] x86: use sparse_memory_present_with_active_regions() on UMA There's no need to use call memory_present() manually on UMA because initmem_init() sets up early_node_map by calling e820_register_active_regions(). [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1241699742.17846.31.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index be7e1279178..52bb9519bb8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -596,11 +596,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; -#ifdef CONFIG_NUMA sparse_memory_present_with_active_regions(MAX_NUMNODES); -#else - memory_present(0, 0, max_pfn); -#endif sparse_init(); free_area_init_nodes(max_zone_pfns); } From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 14:33:23 +0800 Subject: [PATCH 416/900] blktrace: pdu_buf of pc events should be unsigned I got this: 8,0 1 305.417782332 2037 I R 32 (ffffff9e 10 00 ...) [bash] It should be: 8,0 1 305.417782332 2037 I R 32 (9e 10 00 ...) 
[bash] [ Impact: fix output of pc events ] Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e099f8cc1d1..05b4747fd87 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { - const char *pdu_buf; + const unsigned char *pdu_buf; int pdu_len; int i, end, ret; From 79c5d3ce614d8fe706545c7bca2158b63db6bb5e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 15:06:46 +0800 Subject: [PATCH 417/900] blktrace: from-sector redundant in trace_block_remap, cleanup The last argument of block_remap prober is the original sector before remap, so it should be 'from', not 'to'. [ Impact: clean up ] Signed-off-by: Li Zefan Cc: "Alan D. 
Brunelle" Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: KOSAKI Motohiro LKML-Reference: <4A07CE86.5090301@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/block.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/block.h b/include/trace/block.h index 8ac945b7746..5b12efa096b 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t to), - TP_ARGS(q, bio, dev, to)); + sector_t from), + TP_ARGS(q, bio, dev, from)); #endif From 97a52714658cd959a3cfa35c5b6f489859f0204b Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 8 May 2009 18:23:50 +0200 Subject: [PATCH 418/900] x86: display extended apic registers with print_local_APIC and cpu_debug code Both print_local_APIC (used when apic=debug kernel param is set) and cpu_debug code missed support for some extended APIC registers that I'd like to see. 
This adds support to show: - extended APIC feature register - extended APIC control register - extended LVT registers [ Impact: print more debug info ] Signed-off-by: Andreas Herrmann Cc: Jaswinder Singh Rajput Cc: Cyrill Gorcunov LKML-Reference: <20090508162350.GO29045@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 8 ++++---- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 14 +++++++++++++- arch/x86/kernel/cpu/cpu_debug.c | 14 +++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index bc9514fb3b1..7ddb36ab933 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -22,6 +22,7 @@ # define APIC_INTEGRATED(x) (1) #endif #define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_EXT_SPACE(x) ((x) & 0x80000000) #define APIC_TASKPRI 0x80 #define APIC_TPRI_MASK 0xFFu #define APIC_ARBPRI 0x90 @@ -116,7 +117,9 @@ #define APIC_TDR_DIV_32 0x8 #define APIC_TDR_DIV_64 0x9 #define APIC_TDR_DIV_128 0xA -#define APIC_EILVT0 0x500 +#define APIC_EFEAT 0x400 +#define APIC_ECTRL 0x410 +#define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) @@ -125,9 +128,6 @@ #define APIC_EILVT_MSG_NMI 0x4 #define APIC_EILVT_MSG_EXT 0x7 #define APIC_EILVT_MASKED (1 << 16) -#define APIC_EILVT1 0x510 -#define APIC_EILVT2 0x520 -#define APIC_EILVT3 0x530 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) #define APIC_BASE_MSR 0x800 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1ee966f4ae9..0e6543fafb5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -395,7 +395,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + 
APIC_EILVT0; + unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2afe145d277..65b824c9c4f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1739,7 +1739,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { - unsigned int v, ver, maxlvt; + unsigned int i, v, ver, maxlvt; u64 icr; if (apic_verbosity == APIC_QUIET) @@ -1827,6 +1827,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... 
APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..2fc4f6bb9ca 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -588,8 +588,20 @@ static void print_apic(void *arg) seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + unsigned int i, v, maxeilvt; + v = apic_read(APIC_EFEAT); + maxeilvt = (v >> 16) & 0xff; + seq_printf(seq, " EFEAT\t\t: %08x\n", v); + seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + + for (i = 0; i < maxeilvt; i++) { + v = apic_read(APIC_EILVTn(i)); + seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); + } + } +#endif /* CONFIG_X86_LOCAL_APIC */ seq_printf(seq, "\n MSR\t:\n"); } From cec6be6d1069d697beb490bbb40a290d5ff554a2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 11 May 2009 17:41:40 +0400 Subject: [PATCH 419/900] x86: apic: Fixmap apic address even if apic disabled In case if apic were disabled by boot option we still need read_apic operation. So fixmap a fake apic area if needed. 
[ Impact: fix boot crash ] Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: eswierk@aristanetworks.com LKML-Reference: <20090511134140.GH4624@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0e6543fafb5..07cffc1214c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1587,13 +1587,6 @@ void __init init_apic_mappings(void) } else apic_phys = mp_lapic_addr; - /* lets check if we may NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); - return; - } - /* * acpi lapic path already maps that address in * acpi_register_lapic_address() @@ -1602,7 +1595,15 @@ void __init init_apic_mappings(void) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + APIC_BASE, apic_phys); + + /* lets check if we may NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + return; + } + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). From d756f4adb9d8a86e347a2d5435bb5cc95744733e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:29:52 +0400 Subject: [PATCH 420/900] x86, 32-bit: ifdef out struct thread_struct::fs After commit 464d1a78fbf8cf6c7fd970e7b3e2db50a320ce28 aka "[PATCH] i386: Convert i386 PDA code to use %fs" %fs saved during context switch moved from thread_struct to pt_regs and value on thread_struct became unused. 
[ Impact: reduce thread_struct size on 32-bit ] Signed-off-by: Alexey Dobriyan Cc: containers@lists.linux-foundation.org LKML-Reference: <20090503232952.GI16631@x200.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c2cceae709c..a6732ff7b01 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -428,7 +428,9 @@ struct thread_struct { unsigned short gsindex; #endif unsigned long ip; +#ifdef CONFIG_X86_64 unsigned long fs; +#endif unsigned long gs; /* Hardware debugging registers: */ unsigned long debugreg0; @@ -874,7 +876,6 @@ static inline void spin_lock_prefetch(const void *x) .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ } /* From 0c23590f00f85467b318ad0c20c36796a5bd4c60 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:30:15 +0400 Subject: [PATCH 421/900] x86, 64-bit: ifdef out struct thread_struct::ip struct thread_struct::ip isn't used on x86_64, struct pt_regs::ip is used instead. kgdb should be reading 0 always, but I can't check it. 
[ Impact: (potentially) reduce thread_struct size on 64-bit ] Signed-off-by: Alexey Dobriyan Cc: containers@lists.linux-foundation.org LKML-Reference: <20090503233015.GJ16631@x200.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/kgdb.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a6732ff7b01..a9ba7436821 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -427,7 +427,9 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif +#ifdef CONFIG_X86_32 unsigned long ip; +#endif #ifdef CONFIG_X86_64 unsigned long fs; #endif diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index eedfaebe106..d07706f1976 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -141,7 +141,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 10:56:33 -0400 Subject: [PATCH 422/900] ring-buffer: replace constants with time macros in ring-buffer-benchmark The use of numeric constants is discouraged. It is cleaner and more descriptive to use macros for constant time conversions. This patch also removes an extra new line. 
[ Impact: more descriptive time conversions ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a21aa7b3d05..7d3aef93c49 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -253,7 +253,7 @@ static void ring_buffer_producer(void) } time = end_tv.tv_sec - start_tv.tv_sec; - time *= 1000000; + time *= USEC_PER_SEC; time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); entries = ring_buffer_entries(buffer); @@ -273,7 +273,8 @@ static void ring_buffer_producer(void) pr_info("Missed: %ld\n", missed); pr_info("Hit: %ld\n", hit); - do_div(time, 1000); + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); if (time) hit /= (long)time; else @@ -282,18 +283,19 @@ static void ring_buffer_producer(void) pr_info("Entries per millisec: %ld\n", hit); if (hit) { - avg = 1000000 / hit; + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; pr_info("%ld ns per entry\n", avg); } - if (missed) { if (time) missed /= (long)time; pr_info("Total iterations per millisec: %ld\n", hit + missed); - avg = 1000000 / (hit + missed); + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); } } From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 11:03:57 -0400 Subject: [PATCH 423/900] ring-buffer: check for divide by zero in ring-buffer-benchmark Although we check if "missed" is not zero, we divide by hit + missed, and the addition can possibly overflow and become a divide by zero. This patch checks for this case, and will report it when it happens then modify "hit" to make the calculation be non zero.
[ Impact: prevent possible divide by zero in ring-buffer-benchmark ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 7d3aef93c49..8d68e149a8b 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -294,6 +294,12 @@ static void ring_buffer_producer(void) pr_info("Total iterations per millisec: %ld\n", hit + missed); + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + pr_info("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + /* Caculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:08:09 -0400 Subject: [PATCH 424/900] ring-buffer: remove type parameter from rb_reserve_next_event The rb_reserve_next_event is only called for the data type (type = 0). There is no reason to pass in the type to the function. 
Before: text data bss dec hex filename 16554 24 12 16590 40ce kernel/trace/ring_buffer.o After: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o [ Impact: cleaner, smaller and slightly more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 361170609bd..fe40f6c3507 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length) + unsigned long length) { struct ring_buffer_event *event; u64 ts, delta; @@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Non commits have zero deltas */ delta = 0; - event = __rb_reserve_next(cpu_buffer, type, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, &ts); if (PTR_ERR(event) == -EAGAIN) goto again; @@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, 0, length); + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, 0, event_length); + event = rb_reserve_next_event(cpu_buffer, event_length); if (!event) goto out; From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:42:53 -0400 Subject: [PATCH 425/900] ring-buffer: move calculation of event length The event length is calculated and passed in to rb_reserve_next_event in two different locations. 
Having rb_reserve_next_event do the calculations directly makes only one location to do the change and causes the calculation to be inlined by gcc. Before: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o After: text data bss dec hex filename 16490 24 12 16526 408e kernel/trace/ring_buffer.o [ Impact: smaller more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fe40f6c3507..493cba46abc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + length = rb_calculate_event_length(length); again: /* * We allow for interrupts to reenter here and do a trace. 
@@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (atomic_read(&cpu_buffer->record_disabled)) goto out; - length = rb_calculate_event_length(length); - if (length > BUF_PAGE_SIZE) + if (length > BUF_MAX_DATA_SIZE) goto out; event = rb_reserve_next_event(cpu_buffer, length); @@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length; void *body; int ret = -EBUSY; int cpu, resched; @@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, event_length); + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; From 77d1a4999502c260df0eb2de437d320bf8c64b36 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 14:21:12 -0700 Subject: [PATCH 426/900] x86, boot: make symbols from the main vmlinux available Make symbols from the main vmlinux, as opposed to just compressed/vmlinux, available to header.S. Also, export a few additional symbols. This will be used in a subsequent patch to export the total memory footprint of the kernel. [ Impact: enable future enhancement ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 24 ++++++++++++++++-------- arch/x86/boot/header.S | 7 ++++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6633b6e7505..75e0301fc69 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,19 +86,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-offsets := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) . 
\(_text\|_end\)$$/\#define VO_\2 0x\1/p' -quiet_cmd_offsets = OFFSETS $@ - cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ -$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE - $(call if_changed,offsets) +targets += voffset.h +$(obj)/voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' + +quiet_cmd_zoffset = ZOFFSET $@ + cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ + +targets += zoffset.h +$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE + $(call if_changed,zoffset) -targets += offsets.h AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/offsets.h +$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 5d84d1c74e4..27285143ade 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -22,7 +22,8 @@ #include #include #include "boot.h" -#include "offsets.h" +#include "voffset.h" +#include "zoffset.h" BOOTSEG = 0x07C0 /* original address of boot-sector */ SYSSEG = 0x1000 /* historical load address >> 4 */ @@ -212,8 +213,8 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07 hardware_subarch_data: .quad 0 -payload_offset: .long input_data -payload_length: .long input_data_end-input_data +payload_offset: .long ZO_input_data +payload_length: .long ZO_z_input_len setup_data: .quad 0 # 64-bit physical pointer to # single linked list of From 40b387a8a9a821878ecdf9fb117958c426fc1385 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 14:41:55 -0700 Subject: [PATCH 427/900] x86, boot: use LOAD_PHYSICAL_ADDR on 64 bits Use LOAD_PHYSICAL_ADDR instead of CONFIG_PHYSICAL_START in the 64-bit decompression code, for equivalence with the 32-bit code. 
[ Impact: cleanup, increases code similarity ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2b9f2510507..4135d438b66 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -87,7 +87,7 @@ ENTRY(startup_32) addl $(PMD_PAGE_SIZE -1), %ebx andl $PMD_PAGE_MASK, %ebx #else - movl $CONFIG_PHYSICAL_START, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Target address to relocate to for decompression */ @@ -215,7 +215,7 @@ ENTRY(startup_64) * * If it is a relocatable kernel then decompress and run the kernel * from load address aligned to 2MB addr, otherwise decompress and - * run the kernel from CONFIG_PHYSICAL_START + * run the kernel from LOAD_PHYSICAL_ADDR * * We cannot rely on the calculation done in 32-bit mode, since we * may have been invoked via the 64-bit entry point. @@ -228,7 +228,7 @@ ENTRY(startup_64) andq $PMD_PAGE_MASK, %rbp movq %rbp, %rbx #else - movq $CONFIG_PHYSICAL_START, %rbp + movq $LOAD_PHYSICAL_ADDR, %rbp movq %rbp, %rbx #endif From 2ff799d3cff1ecb274049378b28120ee5c1c5e5f Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Mon, 11 May 2009 20:09:14 +0530 Subject: [PATCH 428/900] sched: Don't export sched_mc_power_savings on multi-socket single core system Fix to prevent sched_mc_power_saving from being exported through sysfs for multi-socket single core system. Max cores should be always greater than one (1). My earlier patch that introduced fix for not exporting 'sched_mc_power_saving' on laptops broke it on multi-socket single core system. This fix addresses issue on both laptop and multi-socket single core system. Below are the Test results: 1. Single socket - multi-core Before Patch: Does not export 'sched_mc_power_saving' After Patch: Does not export 'sched_mc_power_saving' Result: Pass 2.
Multi Socket - single core Before Patch: exports 'sched_mc_power_saving' After Patch: Does not export 'sched_mc_power_saving' Result: Pass 3. Multi Socket - Multi core Before Patch: exports 'sched_mc_power_saving' After Patch: exports 'sched_mc_power_saving' [ Impact: make the sched_mc_power_saving control available more consistently ] Signed-off-by: Mahesh Salgaonkar Cc: Suresh B Siddha Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20090511143914.GB4853@dirshya.in.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f44b49abca4..066ef590d7e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -203,7 +203,8 @@ struct pci_bus; void x86_pci_root_bus_res_quirks(struct pci_bus *b); #ifdef CONFIG_SMP -#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) +#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ + (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)) #define smt_capable() (smp_num_siblings > 1) #endif From 99aa45595f45603526513d5e29fc00f8afbf3913 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:02:10 -0700 Subject: [PATCH 429/900] x86, boot: remove dead code from boot/compressed/head_*.S Remove a couple of lines of dead code from arch/x86/boot/compressed/head_*.S; all of these update registers that are dead in the current code. [ Impact: cleanup ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 10 ---------- arch/x86/boot/compressed/head_64.S | 2 -- 2 files changed, 12 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 470474bafc4..2b8e0dfa4b2 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -99,16 +99,6 @@ ENTRY(startup_32) cld popl %esi -/* - * Compute the kernel start address. 
- */ -#ifdef CONFIG_RELOCATABLE - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp -#else - movl $LOAD_PHYSICAL_ADDR, %ebp -#endif - /* * Jump to the relocated address. */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4135d438b66..2bb500af1bd 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -226,10 +226,8 @@ ENTRY(startup_64) leaq startup_32(%rip) /* - $startup_32 */, %rbp addq $(PMD_PAGE_SIZE - 1), %rbp andq $PMD_PAGE_MASK, %rbp - movq %rbp, %rbx #else movq $LOAD_PHYSICAL_ADDR, %rbp - movq %rbp, %rbx #endif /* Target address to relocate to for decompression */ From 37ba7ab5e33cebc25c68fffe33e9f21e7c2014e8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 15:56:08 -0700 Subject: [PATCH 430/900] x86, boot: make kernel_alignment adjustable; new bzImage fields Make the kernel_alignment field adjustable; this allows us to set it to a large value (intended to be 16 MB to avoid ZONE_DMA contention, memory holes and other weirdness) while a smart bootloader can still force a loading at a lesser alignment if absolutely necessary. Also export pref_address (preferred loading address, corresponding to the link-time address) and init_size, the total amount of linear memory the kernel will require during initialization. [ Impact: allows better kernel placement, gives bootloader more info ] Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/head_32.S | 7 +++++-- arch/x86/boot/compressed/head_64.S | 14 ++++++++++---- arch/x86/boot/header.S | 15 +++++++++++++-- arch/x86/include/asm/boot.h | 15 +++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + 6 files changed, 45 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 2b8e0dfa4b2..75e4f001e70 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -69,8 +69,11 @@ ENTRY(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else movl $LOAD_PHYSICAL_ADDR, %ebx #endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2bb500af1bd..f62c284db9e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -84,8 +84,11 @@ ENTRY(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(PMD_PAGE_SIZE -1), %ebx - andl $PMD_PAGE_MASK, %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else movl $LOAD_PHYSICAL_ADDR, %ebx #endif @@ -224,8 +227,11 @@ ENTRY(startup_64) /* Start with the delta to where the kernel will run at. 
*/ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(PMD_PAGE_SIZE - 1), %rbp - andq $PMD_PAGE_MASK, %rbp + movl BP_kernel_alignment(%rsi), %eax + decl %eax + addq %rax, %rbp + notq %rax + andq %rax, %rbp #else movq $LOAD_PHYSICAL_ADDR, %rbp #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 27285143ade..a0b426978d5 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -116,7 +116,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x0209 # header version number (>= 0x0105) + .word 0x020a # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -201,7 +201,7 @@ relocatable_kernel: .byte 1 #else relocatable_kernel: .byte 0 #endif -pad2: .byte 0 +min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment pad3: .word 0 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, @@ -220,6 +220,17 @@ setup_data: .quad 0 # 64-bit physical pointer to # single linked list of # struct setup_data +pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_extract_offset) +#define VO_INIT_SIZE (VO__end - VO__text) +#if ZO_INIT_SIZE > VO_INIT_SIZE +#define INIT_SIZE ZO_INIT_SIZE +#else +#define INIT_SIZE VO_INIT_SIZE +#endif +init_size: .long INIT_SIZE # kernel initialization size + # End of setup header ##################################################### .section ".inittext", "ax" diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6ba23dd9fc9..418e632d4a8 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,11 +8,26 @@ #ifdef __KERNEL__ +#include + /* Physical address where kernel should be loaded. 
*/ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + (CONFIG_PHYSICAL_ALIGN - 1)) \ & ~(CONFIG_PHYSICAL_ALIGN - 1)) +/* Minimum kernel alignment, as a power of two */ +#ifdef CONFIG_x86_64 +#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +#else +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) +#endif +#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) + +#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ + (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)) +#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +#endif + #ifdef CONFIG_KERNEL_BZIP2 #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162..1a830cbd701 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -146,4 +146,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b..898ecc47e12 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); From d297366ba692faf1f0384811a6ff0b20c3470b1b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:06:23 -0700 Subject: [PATCH 431/900] x86: document new bzImage fields Document the new bzImage fields for kernel memory placement. [ Impact: adds documentation ] Signed-off-by: H. 
Peter Anvin --- Documentation/x86/boot.txt | 69 +++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index e0203662f9e..cf8dfc70a11 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -50,6 +50,11 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical pointer to single linked list of struct setup_data. +Protocol 2.10: (Kernel 2.6.31) A protocol for relaxed alignment + beyond the kernel_alignment added, new init_size and + pref_address fields. + + **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -173,7 +178,7 @@ Offset Proto Name Meaning 022C/4 2.03+ ramdisk_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not -0235/1 N/A pad2 Unused +0235/1 2.10+ min_alignment Minimum alignment, as a power of two 0236/2 N/A pad3 Unused 0238/4 2.06+ cmdline_size Maximum size of the kernel command line 023C/4 2.07+ hardware_subarch Hardware subarchitecture @@ -182,6 +187,8 @@ Offset Proto Name Meaning 024C/4 2.08+ payload_length Length of kernel payload 0250/8 2.09+ setup_data 64-bit physical pointer to linked list of struct setup_data +0258/8 2.10+ pref_address Preferred loading address +0260/4 2.10+ init_size Linear memory required during initialization (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -482,11 +489,19 @@ Protocol: 2.03+ 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) Field name: kernel_alignment -Type: read (reloc) +Type: read/modify (reloc) Offset/size: 0x230/4 -Protocol: 2.05+ +Protocol: 2.05+ (read), 2.10+ (modify) - Alignment unit required by the kernel (if relocatable_kernel is true.) 
+ Alignment unit required by the kernel (if relocatable_kernel is + true.) A relocatable kernel that is loaded at an alignment + incompatible with the value in this field will be realigned during + kernel initialization. + + Starting with protocol version 2.10, this reflects the kernel + alignment preferred for optimal performance; it is possible for the + loader to modify this field to permit a lesser alignment. See the + min_alignment and pref_address fields below. Field name: relocatable_kernel Type: read (reloc) @@ -498,6 +513,22 @@ Protocol: 2.05+ After loading, the boot loader must set the code32_start field to point to the loaded code, or to a boot loader hook. +Field name: min_alignment +Type: read (reloc) +Offset/size: 0x235/1 +Protocol: 2.10+ + + This field, if nonzero, indicates as a power of two the minimum + alignment required, as opposed to preferred, by the kernel to boot. + If a boot loader makes use of this field, it should update the + kernel_alignment field with the alignment unit desired; typically: + + kernel_alignment = 1 << min_alignment + + There may be a considerable performance cost with an excessively + misaligned kernel. Therefore, a loader should typically try each + power-of-two alignment from kernel_alignment down to this alignment. + Field name: cmdline_size Type: read Offset/size: 0x238/4 @@ -582,6 +613,36 @@ Protocol: 2.09+ sure to consider the case where the linked list already contains entries. +Field name: pref_address +Type: read (reloc) +Offset/size: 0x258/8 +Protocol: 2.10+ + + This field, if nonzero, represents a preferred load address for the + kernel. A relocating bootloader should attempt to load at this + address if possible. + + A non-relocatable kernel will unconditionally move itself to and run + at this address.
+ +Field name: init_size +Type: read +Offset/size: 0x260/4 + + This field indicates the amount of linear contiguous memory starting + at the kernel runtime start address that the kernel needs before it + is capable of examining its memory map. This is not the same thing + as the total amount of memory the kernel needs to boot, but it can + be used by a relocating boot loader to help select a safe load + address for the kernel. + + The kernel runtime start address is determined by the following algorithm: + + if (relocatable_kernel) + runtime_start = align_up(load_address, kernel_alignment) + else + runtime_start = pref_address + **** THE IMAGE CHECKSUM From ceefccc93932b920a8ec6f35f596db05202a12fe Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:12:16 -0700 Subject: [PATCH 432/900] x86: default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN to 16 MB Default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN each to 16 MB, so that both non-relocatable and relocatable kernels are loaded at 16 MB by a non-relocating bootloader. This is somewhat hacky, but it appears to be the only way to do this that does not break some set of existing bootloaders. We want to avoid the bottom 16 MB because of large page breakup, memory holes, and ZONE_DMA. Embedded systems may need to reduce this, or update their bootloaders to be aware of the new min_alignment field. [ Impact: performance improvement, avoids problems on some systems ] Signed-off-by: H.
Peter Anvin --- arch/x86/Kconfig | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5aee45356b5..50fbb47f529 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1455,9 +1455,7 @@ config KEXEC_JUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if X86_NUMAQ - default "0x200000" if X86_64 - default "0x100000" + default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. @@ -1476,15 +1474,15 @@ config PHYSICAL_START to be specifically compiled to run from a specific memory area (normally a reserved region) and this option comes handy. - So if you are using bzImage for capturing the crash dump, leave - the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. - Otherwise if you plan to use vmlinux for capturing the crash dump - change this value to start of the reserved region (Typically 16MB - 0x1000000). In other words, it can be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + So if you are using bzImage for capturing the crash dump, + leave the value here unchanged to 0x1000000 and set + CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux + for capturing the crash dump change this value to start of + the reserved region. In other words, it can be set based on + the "X" value as specified in the "crashkernel=YM@XM" + command line boot parameter passed to the panic-ed + kernel. Please take a look at Documentation/kdump/kdump.txt + for more details about crash dumps. Usage of bzImage for capturing the crash dump is recommended as one does not have to build two kernels. 
Same kernel can be used @@ -1521,9 +1519,8 @@ config X86_NEED_RELOCS config PHYSICAL_ALIGN hex prompt "Alignment value to which kernel should be aligned" if X86_32 - default "0x100000" if X86_32 - default "0x200000" if X86_64 - range 0x2000 0x400000 + default "0x1000000" + range 0x2000 0x1000000 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an From 26717808f93a27c22d4853c4fb17fa225f4ccc68 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 14:19:34 -0700 Subject: [PATCH 433/900] x86: make CONFIG_RELOCATABLE the default Remove the EXPERIMENTAL tag from CONFIG_RELOCATABLE and make it the default. Relocatable kernels have been used for a while now, and should now have identical semantics to non-relocatable kernels when loaded by a non-relocating bootloader. Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50fbb47f529..3e0f80a764a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1495,8 +1495,8 @@ config PHYSICAL_START Don't change this unless you know what you are doing. config RELOCATABLE - bool "Build a relocatable kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Build a relocatable kernel" + default y ---help--- This builds a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB. From c4a994645d04d5fa6bfa52a3204af87dd92168d5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:20:12 -0700 Subject: [PATCH 434/900] x86, defconfig: update to current, no material changes Update defconfigs to reflect current configuration files. No other changes. [ Impact: updates defconfigs to match what "make defconfig" generates ] Signed-off-by: H. 
Peter Anvin --- arch/x86/configs/i386_defconfig | 143 ++++++++++++++++++++++------- arch/x86/configs/x86_64_defconfig | 148 ++++++++++++++++++++++-------- 2 files changed, 223 insertions(+), 68 deletions(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 235b81d0f6f..70036a7a950 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:50:58 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:19:47 2009 # # CONFIG_64BIT is not set CONFIG_X86_32=y # CONFIG_X86_64 is not set CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf32-i386" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y CONFIG_ARCH_POPULATES_NODE_MAP=y # CONFIG_AUDIT_ARCH is not set CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_32_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y +CONFIG_X86_32_LAZY_GS=y CONFIG_KTIME_SCALAR=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y 
CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -113,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -139,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set @@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y +# CONFIG_X86_BIGSMP is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_VSMP is not set # CONFIG_X86_RDC321X is not set +# CONFIG_X86_32_NON_STANDARD is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,8 +245,10 @@ CONFIG_M686=y # 
CONFIG_GENERIC_CPU is not set CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_X86_XADD=y # CONFIG_X86_PPRO_FENCE is not set CONFIG_X86_WP_WORKS_OK=y @@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_CYRIX_32=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_32=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_CPU_SUP_TRANSMETA_32=y CONFIG_CPU_SUP_UMC_32=y CONFIG_X86_DS=y @@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_HIGHPTE=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y @@ -312,6 +332,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -363,7 +384,6 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -425,6 +445,7 @@ CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_PCI_DOMAINS=y +# CONFIG_DMAR is not set CONFIG_PCIEPORTBUS=y # CONFIG_HOTPLUG_PCI_PCIE is not set CONFIG_PCIEAER=y @@ -435,6 +456,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -481,7 +503,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y 
@@ -639,6 +660,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +718,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +729,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -789,6 +810,7 @@ CONFIG_MISC_DEVICES=y # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -842,6 +864,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +963,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +1001,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1052,7 @@ CONFIG_E1000=y CONFIG_E1000E=y # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1067,7 @@ CONFIG_BNX2=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # 
CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1077,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1087,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_IBMLS is not set @@ -1073,8 +1103,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1084,21 +1114,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1209,6 +1239,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1303,6 +1335,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# 
CONFIG_HW_RANDOM_TIMERIOMEM is not set CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y CONFIG_HW_RANDOM_GEODE=y @@ -1390,7 +1423,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1424,6 +1456,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1433,6 +1466,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1448,11 +1482,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1643,7 +1680,6 @@ CONFIG_FB_EFI=y # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_VT8623 is not set -# CONFIG_FB_CYBLA is not set # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set @@ -1652,6 +1688,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set 
CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1738,6 +1775,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1811,15 +1850,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1885,11 +1926,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1931,7 +1972,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1947,6 +1987,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1958,8 +1999,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1969,6 +2012,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # 
CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2037,6 +2084,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2071,6 +2119,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2100,6 +2149,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -2151,6 +2205,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2164,7 +2219,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2251,6 +2305,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2266,6 +2321,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y @@ -2289,13 +2345,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y 
CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2305,13 +2367,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2321,7 +2391,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y @@ -2329,7 +2398,7 @@ CONFIG_DEBUG_RODATA=y CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set CONFIG_DOUBLEFAULT=y -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2365,6 +2434,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2380,10 +2451,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y 
CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2456,6 +2529,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2467,11 +2541,13 @@ CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_GEODE is not set # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_LGUEST is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2489,7 +2565,10 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9fe5d212ab4..f3e53e21f43 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:44:16 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:19:24 2009 # CONFIG_64BIT=y # CONFIG_X86_32 is not set CONFIG_X86_64=y CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -41,14 
+43,14 @@ CONFIG_ZONE_DMA32=y CONFIG_ARCH_POPULATES_NODE_MAP=y CONFIG_AUDIT_ARCH=y CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_64_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y # CONFIG_KTIME_SCALAR is not set CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -114,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -140,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# 
CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set CONFIG_BLOCK_COMPAT=y @@ -196,11 +210,10 @@ CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y # CONFIG_NUMA_MIGRATE_IRQ_DESC is not set -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y -# CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_VSMP is not set +# CONFIG_X86_UV is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,10 +243,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_MCORE2 is not set CONFIG_GENERIC_CPU=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_TSC=y CONFIG_X86_CMPXCHG64=y @@ -242,7 +255,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_64=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_X86_DS=y CONFIG_X86_PTRACE_BTS=y CONFIG_HPET_TIMER=y @@ -269,6 +282,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y # CONFIG_I8K is not set CONFIG_MICROCODE=y CONFIG_MICROCODE_INTEL=y @@ -276,6 +290,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set CONFIG_ARCH_PHYS_ADDR_T_64BIT=y CONFIG_DIRECT_GBPAGES=y CONFIG_NUMA=y @@ -309,6 +324,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y 
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y CONFIG_X86_RESERVE_LOW_64K=y @@ -317,6 +334,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -325,9 +343,10 @@ CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +# CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 # CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set @@ -370,7 +389,6 @@ CONFIG_ACPI_NUMA=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -436,6 +454,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y CONFIG_K8_NB=y CONFIG_PCCARD=y @@ -481,7 +500,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +657,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +715,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +726,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -788,9 +806,8 @@ CONFIG_MISC_DEVICES=y # CONFIG_TIFM_CORE is not set # 
CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set -# CONFIG_SGI_XP is not set # CONFIG_HP_ILO is not set -# CONFIG_SGI_GRU is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -844,6 +861,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +958,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +996,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1047,7 @@ CONFIG_E1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1062,7 @@ CONFIG_TIGON3=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1072,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1082,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_3C359 is not set @@ -1072,8 +1097,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL 
is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1083,21 +1108,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1208,6 +1233,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1301,6 +1328,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set # CONFIG_HW_RANDOM_INTEL is not set # CONFIG_HW_RANDOM_AMD is not set CONFIG_NVRAM=y @@ -1382,7 +1410,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1416,6 +1443,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # 
CONFIG_SENSORS_I5K_AMB is not set @@ -1425,6 +1453,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1440,11 +1469,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1635,6 +1667,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1720,6 +1753,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1792,15 +1827,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1866,11 +1903,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: 
USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1912,7 +1949,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1928,6 +1964,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1939,8 +1976,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1950,6 +1989,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2018,6 +2061,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2051,6 +2095,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2081,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -2132,6 +2182,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS 
is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2145,7 +2196,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2232,6 +2282,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2247,6 +2298,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set @@ -2269,13 +2321,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2285,13 +2343,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# 
CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2301,14 +2367,13 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m # CONFIG_IOMMU_DEBUG is not set -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2344,6 +2409,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2359,10 +2426,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2414,6 +2483,7 @@ CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_AES=y # CONFIG_CRYPTO_AES_X86_64 is not set +# CONFIG_CRYPTO_AES_NI_INTEL is not set # CONFIG_CRYPTO_ANUBIS is not set CONFIG_CRYPTO_ARC4=y # CONFIG_CRYPTO_BLOWFISH is not set @@ -2435,6 +2505,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2444,10 +2515,12 @@ CONFIG_CRYPTO_DES=y CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set 
+CONFIG_BINARY_PRINTF=y # # Library routines @@ -2464,7 +2537,10 @@ CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y From fe83fcc0a14dcf71996de5eb84771b2369ba7abc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:23:16 -0700 Subject: [PATCH 435/900] x86, defconfig: update kernel position parameters Update CONFIG_RELOCATABLE, CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN to reflect the current defaults. [ Impact: make defconfig match Kconfig defaults ] Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 7 ++++--- arch/x86/configs/x86_64_defconfig | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 70036a7a950..edb992ebef9 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.30-rc2 -# Mon May 11 16:19:47 2009 +# Mon May 11 16:21:55 2009 # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -343,8 +343,9 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_RELOCATABLE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index f3e53e21f43..4ba7d4ef9aa 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.30-rc2 -# Mon May 11 16:19:24 2009 +# Mon May 11 16:22:00 2009 # CONFIG_64BIT=y # 
CONFIG_X86_32 is not set @@ -345,7 +345,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set +CONFIG_RELOCATABLE=y CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 16:54:11 -0700 Subject: [PATCH 436/900] x86: add extension fields for bootloader type and version A long ago, in days of yore, it all began with a god named Thor. There were vikings and boats and some plans for a Linux kernel header. Unfortunately, a single 8-bit field was used for bootloader type and version. This has generally worked without *too* much pain, but we're getting close to flat running out of ID fields. Add extension fields for both type and version. The type will be extended if the old field is 0xE; the version is a simple MSB extension. Keep /proc/sys/kernel/bootloader_type containing (type << 4) + (ver & 0xf) for backwards compatibility, but also add /proc/sys/kernel/bootloader_version which contains the full version number. [ Impact: new feature to support more bootloaders ] Signed-off-by: H. Peter Anvin --- Documentation/x86/boot.txt | 59 ++++++++++++++++++++++++++++---- arch/x86/boot/header.S | 6 +++- arch/x86/include/asm/bootparam.h | 3 +- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/setup.c | 10 ++++-- kernel/sysctl.c | 8 +++++ 6 files changed, 76 insertions(+), 11 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index cf8dfc70a11..8da3a795083 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -50,10 +50,9 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical pointer to single linked list of struct setup_data. 
-Protocol 2.10: (Kernel 2.6.31) A protocol for relaxed alignment +Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment beyond the kernel_alignment added, new init_size and - pref_address fields. - + pref_address fields. Added extended boot loader IDs. **** MEMORY LAYOUT @@ -173,7 +172,8 @@ Offset Proto Name Meaning 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 0224/2 2.01+ heap_end_ptr Free memory after setup end -0226/2 N/A pad1 Unused +0226/1 2.02+(3 ext_loader_ver Extended boot loader version +0227/1 2.02+(3 ext_loader_type Extended boot loader ID 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 022C/4 2.03+ ramdisk_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel @@ -197,6 +197,8 @@ Offset Proto Name Meaning field are unusable, which means the size of a bzImage kernel cannot be determined. +(3) Ignored, but safe to set, for boot protocols 2.02-2.09. + If the "HdrS" (0x53726448) magic number is not found at offset 0x202, the boot protocol version is "old". Loading an old kernel, the following parameters should be assumed: @@ -350,18 +352,32 @@ Protocol: 2.00+ 0xTV here, where T is an identifier for the boot loader and V is a version number. Otherwise, enter 0xFF here. + For boot loader IDs above T = 0xD, write T = 0xE to this field and + write the extended ID minus 0x10 to the ext_loader_type field. + Similarly, the ext_loader_ver field can be used to provide more than + four bits for the bootloader version. 
+ + For example, for T = 0x15, V = 0x234, write: + + type_of_loader <- 0xE4 + ext_loader_type <- 0x05 + ext_loader_ver <- 0x23 + Assigned boot loader ids: 0 LILO (0x00 reserved for pre-2.00 bootloader) 1 Loadlin 2 bootsect-loader (0x20, all other values reserved) - 3 SYSLINUX - 4 EtherBoot + 3 Syslinux + 4 Etherboot/gPXE 5 ELILO 7 GRUB - 8 U-BOOT + 8 U-Boot 9 Xen A Gujin B Qemu + C Arcturus Networks uCbootloader + E Extended (see ext_loader_type) + F Special (0xFF = undefined) Please contact if you need a bootloader ID value assigned. @@ -460,6 +476,35 @@ Protocol: 2.01+ Set this field to the offset (from the beginning of the real-mode code) of the end of the setup stack/heap, minus 0x0200. +Field name: ext_loader_ver +Type: write (optional) +Offset/size: 0x226/1 +Protocol: 2.02+ + + This field is used as an extension of the version number in the + type_of_loader field. The total version number is considered to be + (type_of_loader & 0x0f) + (ext_loader_ver << 4). + + The use of this field is boot loader specific. If not written, it + is zero. + + Kernels prior to 2.6.31 did not recognize this field, but it is safe + to write for protocol version 2.02 or higher. + +Field name: ext_loader_type +Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0) +Offset/size: 0x227/1 +Protocol: 2.02+ + + This field is used as an extension of the type number in + type_of_loader field. If the type in type_of_loader is 0xE, then + the actual type is (ext_loader_type + 0x10). + + This field is ignored if the type in type_of_loader is not 0xE. + + Kernels prior to 2.6.31 did not recognize this field, but it is safe + to write for protocol version 2.02 or higher. 
+ Field name: cmd_line_ptr Type: write (obligatory) Offset/size: 0x228/4 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index a0b426978d5..68c3bfbaff2 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -169,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512 # end of setup code can be used by setup # for local heap purposes. -pad1: .word 0 +ext_loader_ver: + .byte 0 # Extended boot loader version +ext_loader_type: + .byte 0 # Extended boot loader type + cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # If nonzero, a 32-bit pointer # to the kernel command line. diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 433adaebf9b..1724e8de317 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -50,7 +50,8 @@ struct setup_header { __u32 ramdisk_size; __u32 bootsect_kludge; __u16 heap_end_ptr; - __u16 _pad1; + __u8 ext_loader_ver; + __u8 ext_loader_type; __u32 cmd_line_ptr; __u32 initrd_addr_max; __u32 kernel_alignment; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fcf4d92e7e0..6384d25121c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -814,6 +814,7 @@ extern unsigned int BIOS_revision; /* Boot loader type from the setup header: */ extern int bootloader_type; +extern int bootloader_version; extern char ignore_fpu_irq; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..2b093451aec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -214,8 +214,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -706,6 +706,12 @@ void __init setup_arch(char **cmdline_p) #endif 
saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..cf91c9317b2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -727,6 +727,14 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:08:00 -0400 Subject: [PATCH 437/900] ring-buffer: small optimizations Doing some small changes in the fast path of the ring buffer recording saves over 3% in the ring-buffer-benchmark test. [ Impact: a little faster ring buffer recording ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 493cba46abc..f452de2ce49 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static int +static inline int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * also be made. 
But only the entry that did the actual * commit will be something other than zero. */ - if (cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer)) { + if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer))) { delta = ts - cpu_buffer->write_stamp; @@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(ts < cpu_buffer->write_stamp)) delta = 0; - if (test_time_stamp(delta)) { + else if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); @@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * If the timestamp was commited, make the commit our entry * now so that we will update it when needed. */ - if (commit) + if (unlikely(commit)) rb_set_commit_event(cpu_buffer, event); else if (!rb_is_commit(cpu_buffer, event)) delta = 0; From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:28:23 -0400 Subject: [PATCH 438/900] ring-buffer: use internal time stamp function The ring_buffer_time_stamp that is exported adds a little more overhead than is needed for using it internally. This patch adds an internal timestamp function that can be inlined (a single line function) and used internally for the ring buffer. 
[ Impact: a little less overhead to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f452de2ce49..a9e645a5bc1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -454,13 +454,18 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) { u64 time; preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = buffer->clock() << DEBUG_SHIFT; + time = rb_time_stamp(buffer, cpu); preempt_enable_no_resched_notrace(); return time; @@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 22:11:05 -0400 Subject: [PATCH 439/900] ring-buffer: move code around to remove some branches This is a bit of micro-optimizations. But since the ring buffer is used in tracing every function call, it is an extreme hot path. Every nanosecond counts. 
This change shows over 5% improvement in the ring-buffer-benchmark. [ Impact: more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9e645a5bc1..16b24d49604 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + u64 ts, delta = 0; int commit = 0; int nr_loops = 0; @@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && rb_page_write(cpu_buffer->tail_page) == rb_commit_index(cpu_buffer))) { + u64 diff; - delta = ts - cpu_buffer->write_stamp; + diff = ts - cpu_buffer->write_stamp; - /* make sure this delta is calculated here */ + /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? 
*/ if (unlikely(ts < cpu_buffer->write_stamp)) - delta = 0; + goto get_event; - else if (unlikely(test_time_stamp(delta))) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) return NULL; @@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, commit < 0); } - } else - /* Non commits have zero deltas */ - delta = 0; + } + get_event: event = __rb_reserve_next(cpu_buffer, 0, length, &ts); - if (PTR_ERR(event) == -EAGAIN) + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) { From 871b72dd1e12afc3f024479531d25a9339d2e3f9 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 11 May 2009 23:48:27 +0200 Subject: [PATCH 440/900] x86: microcode: use smp_call_function_single instead of set_cpus_allowed, cleanup of synchronization logic * Solve issues described in 6f66cbc63081fd70e3191b4dbb796746780e5ae1 in a way that doesn't resort to set_cpus_allowed(); * in fact, only collect_cpu_info and apply_microcode callbacks must run on a target cpu, others will do just fine on any other. smp_call_function_single() (as suggested by Ingo) is used to run these callbacks on a target cpu. * cleanup of synchronization logic of the 'microcode_core' part The generic 'microcode_core' part guarantees that only a single cpu (be it a full-fledged cpu, one of the cores or HT) is being updated at any particular moment of time. In general, there is no need for any additional sync. mechanism in arch-specific parts (the patch removes existing spinlocks). See also the "Synchronization" section in microcode_core.c. * return -EINVAL instead of -1 (which is translated into -EPERM) in microcode_write(), reload_cpu() and mc_sysdev_add(). Other suggestions for an error code? 
* use 'enum ucode_state' as return value of request_microcode_{fw, user} to gain more flexibility by distinguishing between real error cases and situations when an appropriate ucode was not found (which is not an error per-se). * some minor cleanups Thanks a lot to Hugh Dickins for review/suggestions/testing! Reference: http://marc.info/?l=linux-kernel&m=124025889012541&w=2 [ Impact: refactor and clean up microcode driver locking code ] Signed-off-by: Dmitry Adamushko Acked-by: Hugh Dickins Cc: Andrew Morton Cc: Rusty Russell Cc: Andreas Herrmann Cc: Peter Oruba Cc: Arjan van de Ven LKML-Reference: <1242078507.5560.9.camel@earth> [ did some more cleanups ] Signed-off-by: Ingo Molnar arch/x86/include/asm/microcode.h | 25 ++ arch/x86/kernel/microcode_amd.c | 58 ++---- arch/x86/kernel/microcode_core.c | 326 +++++++++++++++++++++----------------- arch/x86/kernel/microcode_intel.c | 92 +++------- 4 files changed, 261 insertions(+), 240 deletions(-) (~20 new comment lines) --- arch/x86/include/asm/microcode.h | 25 ++- arch/x86/kernel/microcode_amd.c | 58 ++---- arch/x86/kernel/microcode_core.c | 333 +++++++++++++++++------------- arch/x86/kernel/microcode_intel.c | 92 +++------ 4 files changed, 265 insertions(+), 243 deletions(-) diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c882664716c..ef51b501e22 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -9,20 +9,31 @@ struct cpu_signature { struct device; +enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; + struct microcode_ops { - int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); - int (*request_microcode_fw) (int cpu, struct device *device); + enum ucode_state (*request_microcode_user) (int cpu, + const void __user *buf, size_t size); - void (*apply_microcode) (int cpu); + enum ucode_state (*request_microcode_fw) (int cpu, + struct device *device); - int (*collect_cpu_info) (int cpu, struct cpu_signature 
*csig); void (*microcode_fini_cpu) (int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + int (*apply_microcode) (int cpu); + int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); }; struct ucode_cpu_info { - struct cpu_signature cpu_sig; - int valid; - void *mc; + struct cpu_signature cpu_sig; + int valid; + void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c..c8be20f1644 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include -#include -#include #include -#include -#include #include #include #include #include #include -#include -#include -#include -#include -#include #include -#include -#include #include #include @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(&microcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ 
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(&microcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -263,7 +247,8 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +257,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -312,28 +298,27 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - ret = request_firmware(&firmware, fw_name, 
device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +328,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d..9c4461501fc 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include -#include #include -#include +#include #include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include #include #include #include -#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. 
+ */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == 
UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(&microcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(&microcode_mutex); @@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = &microcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = &microcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +245,7 @@ static int __init microcode_dev_init(void) error = misc_register(&microcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + 
cpu; int err = 0; mutex_lock(&microcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - &microcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(&microcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(&microcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(&microcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; -} - -static int 
microcode_resume_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; - - pr_debug("microcode: CPU%d resumed\n", cpu); - if (!uci->mc) - return 1; + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; - } - - return 0; + return UCODE_OK; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - &microcode_pdev->dev); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; + + /* --dimm. Trigger a delayed update? 
*/ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; + + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); + + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; + + return ustate; } -static int microcode_init_cpu(int cpu) +static enum ucode_state microcode_update_cpu(int cpu) { - int err; - mutex_lock(&microcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(&microcode_mutex); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - return err; + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); + + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. 
+ */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +508,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +520,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(&microcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(&microcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " ," " Peter 
Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +553,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(&microcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(&microcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1a..0d334ddd0a9 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include -#include -#include #include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include +#include #include #include @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(&microcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. 
Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(&microcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(&microcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(&microcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, 
const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) - goto out; - if (leftover) { - vfree(new_mc); + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; + goto out; + } + + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int 
request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } From 9d62dcdfa6f6fc843f7d9b494bcd48f00b94f883 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 22:05:28 -0400 Subject: [PATCH 441/900] x86: merge process.c a bit Merge arch_align_stack() and arch_randomize_brk(), since they are the same. Tested on x86_64. [ Impact: cleanup ] Signed-off-by: Amerigo Wang Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 14 ++++++++++++++ arch/x86/kernel/process_32.c | 13 ------------- arch/x86/kernel/process_64.c | 13 ------------- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..2b9a8d0fb47 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -613,3 +614,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? 
: mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..a3bb049ad08 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -497,15 +496,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..34386f72bdc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -660,15 +659,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} From bf78ad69cd351798b9447a269c6bd41ce1f111f4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 23:29:09 -0400 Subject: [PATCH 442/900] x86: process.c, remove useless headers <stdarg.h> is not needed by these files, remove them. 
[ Impact: cleanup ] Signed-off-by: WANG Cong Cc: akpm@linux-foundation.org LKML-Reference: <20090512032956.5040.77055.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 -- arch/x86/kernel/process_64.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3bb049ad08..56d50b7d71d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 34386f72bdc..9d6b20e6cd8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include From ed077b58f6146684069975122b1728a9d248a501 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 12 May 2009 16:40:00 +0800 Subject: [PATCH 443/900] x86: make sparse mem work in non-NUMA mode With sparse memory, holes should not be marked present for memmap. This patch makes sure sparsemem really works on SMP mode (!NUMA). 
[ Impact: use less memory to map fragmented RAM, avoid boot-OOM/crash ] Signed-off-by: Shaohua Li Signed-off-by: Sheng Yang LKML-Reference: <1242117600.22431.0.camel@sli10-desk.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index fef1d90d4f1..949708d7a48 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -706,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn, highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memory_present(0, 0, highend_pfn); e820_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memory_present(0, 0, max_low_pfn); e820_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif From 4797f6b021a3fa399942245d07a1feb30df81bb8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 2 May 2009 10:40:57 -0700 Subject: [PATCH 444/900] x86: read apic ID in the !acpi_lapic case Ed found that on 32-bit, boot_cpu_physical_apicid is not read right, when the mptable is broken. Interestingly, actually three paths use/set it: 1. acpi: at that time that is already read from reg 2. mptable: only read from mptable 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit so we could read the apic id for the 2/3 path. We trust the hardware register more than we trust a BIOS data structure (the mptable). We can also avoid the double set_fixmap() when acpi_lapic is used, and also need to move cpu_has_apic earlier and call apic_disable(). 
Also when need to update the apic id, we'd better read and set the apic version as well - so that quirks are applied precisely. v2: make path 3 with 64bit, use -1 as apic id, so could read it later. v3: fix whitespace problem pointed out by Ed Swierk v5: fix boot crash [ Impact: get correct apic id for bsp other than acpi path ] Reported-by: Ed Swierk Signed-off-by: Yinghai Lu Acked-by: Cyrill Gorcunov LKML-Reference: <49FC85A9.2070702@kernel.org> [ v4: sanity-check in the ACPI case too ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 2 +- arch/x86/kernel/apic/apic.c | 48 ++++++++++++++++------------------ arch/x86/kernel/apic/io_apic.c | 5 ++++ 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 19e0d88b966..6a84ed166ae 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void); static inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 07cffc1214c..b0fd26442c4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -242,17 +242,24 @@ static int modern_apic(void) * bare function to substitute write operation * and it's _that_ fast :) */ -void native_apic_write_dummy(u32 reg, u32 v) +static void native_apic_write_dummy(u32 reg, u32 v) { WARN_ON_ONCE((cpu_has_apic || !disable_apic)); } +static u32 native_apic_read_dummy(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + return 0; +} + /* - * right after this call apic->write doesn't do anything + * right after this call apic->write/read doesn't do anything * note that there is no restore operation it works one way */ void apic_disable(void) { + apic->read = 
native_apic_read_dummy; apic->write = native_apic_write_dummy; } @@ -1576,32 +1583,23 @@ void __init init_apic_mappings(void) return; } - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ + /* If no local APIC can be found return early */ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() - */ - if (!acpi_lapic) - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); - - /* lets check if we may NOP'ify apic operations */ - if (!cpu_has_apic) { + /* lets NOP'ify apic operations */ pr_info("APIC: disable apic facility\n"); apic_disable(); - return; + } else { + apic_phys = mp_lapic_addr; + + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); } /* diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1f3d3669dae..74d2b480a20 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1878,6 +1878,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); From b5710ce92a8cf8e3fc0ffc230cfdbfa23463f1c8 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 12 May 2009 18:51:28 +0400 Subject: [PATCH 445/900] x86/pci: add 4 more return parameters to IO_APIC_get_PCI_irq_vector(), fix Fix trivial typo in the drivers/pci/hotplug/ibmphp_core.c changes. 
[ Impact: build fix ] Signed-off-by: Cyrill Gorcunov CC: Yinghai Lu Cc: eswierk@aristanetworks.com LKML-Reference: <20090512145128.GA10220@lenovo> Signed-off-by: Ingo Molnar --- drivers/pci/hotplug/ibmphp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c index ef53b05a411..79901a0db88 100644 --- a/drivers/pci/hotplug/ibmphp_core.c +++ b/drivers/pci/hotplug/ibmphp_core.c @@ -161,7 +161,7 @@ int ibmphp_init_devno(struct slot **cur_slot) (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn); for (i = 0; i < 4; i++) (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus, - (int) (*cur_slot)->device, i. + (int) (*cur_slot)->device, i, &ioapic, &ioapic_pin, &triggering, &polarity); From 7ed42a28b269f8682eefae27f5c11187eb56e63b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 May 2009 11:33:08 -0700 Subject: [PATCH 446/900] x86, boot: correct sanity checks in boot/compressed/misc.c arch/x86/boot/compressed/misc.c contains several sanity checks on the output address. Correct constraints that are no longer correct: - the alignment test should be MIN_KERNEL_ALIGN on both 32 and 64 bits. - the 64 bit maximum address was set to 2^40, which was the limit of one specific x86-64 implementation. Change the test to 2^46, the current Linux limit, and at least try to test the end rather than the beginning. - for non-relocatable kernels, test against LOAD_PHYSICAL_ADDR on both 32 and 64 bits. [ Impact: fix potential boot failure due to invalid tests ] Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/misc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e45be73684f..842b2a36174 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,20 +325,18 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) + error("Destination address inappropriately aligned"); #ifdef CONFIG_X86_64 - if ((unsigned long)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((unsigned long)output >= 0xffffffffffUL) + if (heap > 0x3fffffffffffUL) error("Destination address too large"); #else - if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1)) - error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) error("Destination address too large"); -#ifndef CONFIG_RELOCATABLE - if ((u32)output != LOAD_PHYSICAL_ADDR) - error("Wrong destination address"); #endif +#ifndef CONFIG_RELOCATABLE + if ((unsigned long)output != LOAD_PHYSICAL_ADDR) + error("Wrong destination address"); #endif if (!quiet) From c4f68236e41641494f9c8a418ccc0678c335bbb5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 May 2009 11:37:34 -0700 Subject: [PATCH 447/900] x86-64: align __PHYSICAL_START, remove __KERNEL_ALIGN Handle the misconfiguration where CONFIG_PHYSICAL_START is incompatible with CONFIG_PHYSICAL_ALIGN. This is a configuration error, but one which arises easily since Kconfig doesn't have the smarts to express the true relationship between these two variables. Hence, align __PHYSICAL_START the same way we align LOAD_PHYSICAL_ADDR in asm/boot.h. For non-relocatable kernels, this would cause the boot to fail. [ Impact: fix boot failures for non-relocatable kernels ] Reported-by: Ingo Molnar Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/page_64_types.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d38c91b7024..e11900f2500 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,17 +32,9 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START CONFIG_PHYSICAL_START -#define __KERNEL_ALIGN 0x200000 - -/* - * Make sure kernel is aligned to 2MB address. Catching it at compile - * time is better. Change your config file and compile the kernel - * for a 2MB aligned address (CONFIG_PHYSICAL_START) - */ -#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 -#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" -#endif +#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ + (CONFIG_PHYSICAL_ALIGN - 1)) & \ + ~(CONFIG_PHYSICAL_ALIGN - 1)) #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) From 5df6d737dd4b0fe9eccf943abb3677cfea05a6c4 Mon Sep 17 00:00:00 2001 From: Abhijeet Joglekar Date: Fri, 17 Apr 2009 18:33:26 -0700 Subject: [PATCH 448/900] [SCSI] fnic: Add new Cisco PCI-Express FCoE HBA fnic is a driver for the Cisco PCI-Express FCoE HBA Signed-off-by: Abhijeet Joglekar Signed-off-by: Joe Eykholt Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- MAINTAINERS | 8 + drivers/scsi/Kconfig | 11 + drivers/scsi/Makefile | 1 + drivers/scsi/fnic/Makefile | 15 + drivers/scsi/fnic/cq_desc.h | 78 ++ drivers/scsi/fnic/cq_enet_desc.h | 167 +++ drivers/scsi/fnic/cq_exch_desc.h | 182 +++ drivers/scsi/fnic/fcpio.h | 780 ++++++++++++ drivers/scsi/fnic/fnic.h | 265 +++++ drivers/scsi/fnic/fnic_attrs.c | 56 + drivers/scsi/fnic/fnic_fcs.c | 742 ++++++++++++ drivers/scsi/fnic/fnic_io.h | 67 ++ drivers/scsi/fnic/fnic_isr.c | 332 ++++++ drivers/scsi/fnic/fnic_main.c | 942 +++++++++++++++ drivers/scsi/fnic/fnic_res.c | 444 +++++++ 
drivers/scsi/fnic/fnic_res.h | 197 +++ drivers/scsi/fnic/fnic_scsi.c | 1850 +++++++++++++++++++++++++++++ drivers/scsi/fnic/rq_enet_desc.h | 58 + drivers/scsi/fnic/vnic_cq.c | 85 ++ drivers/scsi/fnic/vnic_cq.h | 121 ++ drivers/scsi/fnic/vnic_cq_copy.h | 62 + drivers/scsi/fnic/vnic_dev.c | 690 +++++++++++ drivers/scsi/fnic/vnic_dev.h | 161 +++ drivers/scsi/fnic/vnic_devcmd.h | 281 +++++ drivers/scsi/fnic/vnic_intr.c | 60 + drivers/scsi/fnic/vnic_intr.h | 118 ++ drivers/scsi/fnic/vnic_nic.h | 69 ++ drivers/scsi/fnic/vnic_resource.h | 61 + drivers/scsi/fnic/vnic_rq.c | 196 +++ drivers/scsi/fnic/vnic_rq.h | 235 ++++ drivers/scsi/fnic/vnic_scsi.h | 99 ++ drivers/scsi/fnic/vnic_stats.h | 68 ++ drivers/scsi/fnic/vnic_wq.c | 182 +++ drivers/scsi/fnic/vnic_wq.h | 175 +++ drivers/scsi/fnic/vnic_wq_copy.c | 117 ++ drivers/scsi/fnic/vnic_wq_copy.h | 128 ++ drivers/scsi/fnic/wq_enet_desc.h | 96 ++ 37 files changed, 9199 insertions(+) create mode 100644 drivers/scsi/fnic/Makefile create mode 100644 drivers/scsi/fnic/cq_desc.h create mode 100644 drivers/scsi/fnic/cq_enet_desc.h create mode 100644 drivers/scsi/fnic/cq_exch_desc.h create mode 100644 drivers/scsi/fnic/fcpio.h create mode 100644 drivers/scsi/fnic/fnic.h create mode 100644 drivers/scsi/fnic/fnic_attrs.c create mode 100644 drivers/scsi/fnic/fnic_fcs.c create mode 100644 drivers/scsi/fnic/fnic_io.h create mode 100644 drivers/scsi/fnic/fnic_isr.c create mode 100644 drivers/scsi/fnic/fnic_main.c create mode 100644 drivers/scsi/fnic/fnic_res.c create mode 100644 drivers/scsi/fnic/fnic_res.h create mode 100644 drivers/scsi/fnic/fnic_scsi.c create mode 100644 drivers/scsi/fnic/rq_enet_desc.h create mode 100644 drivers/scsi/fnic/vnic_cq.c create mode 100644 drivers/scsi/fnic/vnic_cq.h create mode 100644 drivers/scsi/fnic/vnic_cq_copy.h create mode 100644 drivers/scsi/fnic/vnic_dev.c create mode 100644 drivers/scsi/fnic/vnic_dev.h create mode 100644 drivers/scsi/fnic/vnic_devcmd.h create mode 100644 
drivers/scsi/fnic/vnic_intr.c create mode 100644 drivers/scsi/fnic/vnic_intr.h create mode 100644 drivers/scsi/fnic/vnic_nic.h create mode 100644 drivers/scsi/fnic/vnic_resource.h create mode 100644 drivers/scsi/fnic/vnic_rq.c create mode 100644 drivers/scsi/fnic/vnic_rq.h create mode 100644 drivers/scsi/fnic/vnic_scsi.h create mode 100644 drivers/scsi/fnic/vnic_stats.h create mode 100644 drivers/scsi/fnic/vnic_wq.c create mode 100644 drivers/scsi/fnic/vnic_wq.h create mode 100644 drivers/scsi/fnic/vnic_wq_copy.c create mode 100644 drivers/scsi/fnic/vnic_wq_copy.h create mode 100644 drivers/scsi/fnic/wq_enet_desc.h diff --git a/MAINTAINERS b/MAINTAINERS index 2b349ba4add..c7bed166ad6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1431,6 +1431,14 @@ P: Russell King M: linux@arm.linux.org.uk F: include/linux/clk.h +CISCO FCOE HBA DRIVER +P: Abhijeet Joglekar +M: abjoglek@cisco.com +P: Joe Eykholt +M: jeykholt@cisco.com +L: linux-scsi@vger.kernel.org +S: Supported + CODA FILE SYSTEM P: Jan Harkes M: jaharkes@cs.cmu.edu diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 8ed2990c826..fb2740789b6 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -628,6 +628,17 @@ config FCOE ---help--- Fibre Channel over Ethernet module +config FCOE_FNIC + tristate "Cisco FNIC Driver" + depends on PCI && X86 + select LIBFC + help + This is support for the Cisco PCI-Express FCoE HBA. + + To compile this driver as a module, choose M here and read + . + The module will be called fnic. 
+ config SCSI_DMX3191D tristate "DMX3191D SCSI support" depends on PCI && SCSI diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index e7c861ac417..a5049cfb40e 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_SCSI_DH) += device_handler/ obj-$(CONFIG_LIBFC) += libfc/ obj-$(CONFIG_LIBFCOE) += fcoe/ obj-$(CONFIG_FCOE) += fcoe/ +obj-$(CONFIG_FCOE_FNIC) += fnic/ obj-$(CONFIG_ISCSI_TCP) += libiscsi.o libiscsi_tcp.o iscsi_tcp.o obj-$(CONFIG_INFINIBAND_ISER) += libiscsi.o obj-$(CONFIG_SCSI_A4000T) += 53c700.o a4000t.o diff --git a/drivers/scsi/fnic/Makefile b/drivers/scsi/fnic/Makefile new file mode 100644 index 00000000000..37c3440bc17 --- /dev/null +++ b/drivers/scsi/fnic/Makefile @@ -0,0 +1,15 @@ +obj-$(CONFIG_FCOE_FNIC) += fnic.o + +fnic-y := \ + fnic_attrs.o \ + fnic_isr.o \ + fnic_main.o \ + fnic_res.o \ + fnic_fcs.o \ + fnic_scsi.o \ + vnic_cq.o \ + vnic_dev.o \ + vnic_intr.o \ + vnic_rq.o \ + vnic_wq_copy.o \ + vnic_wq.o diff --git a/drivers/scsi/fnic/cq_desc.h b/drivers/scsi/fnic/cq_desc.h new file mode 100644 index 00000000000..d1225cf6320 --- /dev/null +++ b/drivers/scsi/fnic/cq_desc.h @@ -0,0 +1,78 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CQ_DESC_H_ +#define _CQ_DESC_H_ + +/* + * Completion queue descriptor types + */ +enum cq_desc_types { + CQ_DESC_TYPE_WQ_ENET = 0, + CQ_DESC_TYPE_DESC_COPY = 1, + CQ_DESC_TYPE_WQ_EXCH = 2, + CQ_DESC_TYPE_RQ_ENET = 3, + CQ_DESC_TYPE_RQ_FCP = 4, +}; + +/* Completion queue descriptor: 16B + * + * All completion queues have this basic layout. The + * type_specfic area is unique for each completion + * queue type. + */ +struct cq_desc { + __le16 completed_index; + __le16 q_number; + u8 type_specfic[11]; + u8 type_color; +}; + +#define CQ_DESC_TYPE_BITS 4 +#define CQ_DESC_TYPE_MASK ((1 << CQ_DESC_TYPE_BITS) - 1) +#define CQ_DESC_COLOR_MASK 1 +#define CQ_DESC_COLOR_SHIFT 7 +#define CQ_DESC_Q_NUM_BITS 10 +#define CQ_DESC_Q_NUM_MASK ((1 << CQ_DESC_Q_NUM_BITS) - 1) +#define CQ_DESC_COMP_NDX_BITS 12 +#define CQ_DESC_COMP_NDX_MASK ((1 << CQ_DESC_COMP_NDX_BITS) - 1) + +static inline void cq_desc_dec(const struct cq_desc *desc_arg, + u8 *type, u8 *color, u16 *q_number, u16 *completed_index) +{ + const struct cq_desc *desc = desc_arg; + const u8 type_color = desc->type_color; + + *color = (type_color >> CQ_DESC_COLOR_SHIFT) & CQ_DESC_COLOR_MASK; + + /* + * Make sure color bit is read from desc *before* other fields + * are read from desc. Hardware guarantees color bit is last + * bit (byte) written. Adding the rmb() prevents the compiler + * and/or CPU from reordering the reads which would potentially + * result in reading stale values. 
+ */ + + rmb(); + + *type = type_color & CQ_DESC_TYPE_MASK; + *q_number = le16_to_cpu(desc->q_number) & CQ_DESC_Q_NUM_MASK; + *completed_index = le16_to_cpu(desc->completed_index) & + CQ_DESC_COMP_NDX_MASK; +} + +#endif /* _CQ_DESC_H_ */ diff --git a/drivers/scsi/fnic/cq_enet_desc.h b/drivers/scsi/fnic/cq_enet_desc.h new file mode 100644 index 00000000000..a9fa26f82dd --- /dev/null +++ b/drivers/scsi/fnic/cq_enet_desc.h @@ -0,0 +1,167 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _CQ_ENET_DESC_H_ +#define _CQ_ENET_DESC_H_ + +#include "cq_desc.h" + +/* Ethernet completion queue descriptor: 16B */ +struct cq_enet_wq_desc { + __le16 completed_index; + __le16 q_number; + u8 reserved[11]; + u8 type_color; +}; + +static inline void cq_enet_wq_desc_dec(struct cq_enet_wq_desc *desc, + u8 *type, u8 *color, u16 *q_number, u16 *completed_index) +{ + cq_desc_dec((struct cq_desc *)desc, type, + color, q_number, completed_index); +} + +/* Completion queue descriptor: Ethernet receive queue, 16B */ +struct cq_enet_rq_desc { + __le16 completed_index_flags; + __le16 q_number_rss_type_flags; + __le32 rss_hash; + __le16 bytes_written_flags; + __le16 vlan; + __le16 checksum_fcoe; + u8 flags; + u8 type_color; +}; + +#define CQ_ENET_RQ_DESC_FLAGS_INGRESS_PORT (0x1 << 12) +#define CQ_ENET_RQ_DESC_FLAGS_FCOE (0x1 << 13) +#define CQ_ENET_RQ_DESC_FLAGS_EOP (0x1 << 14) +#define CQ_ENET_RQ_DESC_FLAGS_SOP (0x1 << 15) + +#define CQ_ENET_RQ_DESC_RSS_TYPE_BITS 4 +#define CQ_ENET_RQ_DESC_RSS_TYPE_MASK \ + ((1 << CQ_ENET_RQ_DESC_RSS_TYPE_BITS) - 1) +#define CQ_ENET_RQ_DESC_RSS_TYPE_NONE 0 +#define CQ_ENET_RQ_DESC_RSS_TYPE_IPv4 1 +#define CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv4 2 +#define CQ_ENET_RQ_DESC_RSS_TYPE_IPv6 3 +#define CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6 4 +#define CQ_ENET_RQ_DESC_RSS_TYPE_IPv6_EX 5 +#define CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6_EX 6 + +#define CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC (0x1 << 14) + +#define CQ_ENET_RQ_DESC_BYTES_WRITTEN_BITS 14 +#define CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK \ + ((1 << CQ_ENET_RQ_DESC_BYTES_WRITTEN_BITS) - 1) +#define CQ_ENET_RQ_DESC_FLAGS_TRUNCATED (0x1 << 14) +#define CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED (0x1 << 15) + +#define CQ_ENET_RQ_DESC_FCOE_SOF_BITS 4 +#define CQ_ENET_RQ_DESC_FCOE_SOF_MASK \ + ((1 << CQ_ENET_RQ_DESC_FCOE_SOF_BITS) - 1) +#define CQ_ENET_RQ_DESC_FCOE_EOF_BITS 8 +#define CQ_ENET_RQ_DESC_FCOE_EOF_MASK \ + ((1 << CQ_ENET_RQ_DESC_FCOE_EOF_BITS) - 1) +#define CQ_ENET_RQ_DESC_FCOE_EOF_SHIFT 8 + +#define 
CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK (0x1 << 0) +#define CQ_ENET_RQ_DESC_FCOE_FC_CRC_OK (0x1 << 0) +#define CQ_ENET_RQ_DESC_FLAGS_UDP (0x1 << 1) +#define CQ_ENET_RQ_DESC_FCOE_ENC_ERROR (0x1 << 1) +#define CQ_ENET_RQ_DESC_FLAGS_TCP (0x1 << 2) +#define CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK (0x1 << 3) +#define CQ_ENET_RQ_DESC_FLAGS_IPV6 (0x1 << 4) +#define CQ_ENET_RQ_DESC_FLAGS_IPV4 (0x1 << 5) +#define CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT (0x1 << 6) +#define CQ_ENET_RQ_DESC_FLAGS_FCS_OK (0x1 << 7) + +static inline void cq_enet_rq_desc_dec(struct cq_enet_rq_desc *desc, + u8 *type, u8 *color, u16 *q_number, u16 *completed_index, + u8 *ingress_port, u8 *fcoe, u8 *eop, u8 *sop, u8 *rss_type, + u8 *csum_not_calc, u32 *rss_hash, u16 *bytes_written, u8 *packet_error, + u8 *vlan_stripped, u16 *vlan, u16 *checksum, u8 *fcoe_sof, + u8 *fcoe_fc_crc_ok, u8 *fcoe_enc_error, u8 *fcoe_eof, + u8 *tcp_udp_csum_ok, u8 *udp, u8 *tcp, u8 *ipv4_csum_ok, + u8 *ipv6, u8 *ipv4, u8 *ipv4_fragment, u8 *fcs_ok) +{ + u16 completed_index_flags = le16_to_cpu(desc->completed_index_flags); + u16 q_number_rss_type_flags = + le16_to_cpu(desc->q_number_rss_type_flags); + u16 bytes_written_flags = le16_to_cpu(desc->bytes_written_flags); + + cq_desc_dec((struct cq_desc *)desc, type, + color, q_number, completed_index); + + *ingress_port = (completed_index_flags & + CQ_ENET_RQ_DESC_FLAGS_INGRESS_PORT) ? 1 : 0; + *fcoe = (completed_index_flags & CQ_ENET_RQ_DESC_FLAGS_FCOE) ? + 1 : 0; + *eop = (completed_index_flags & CQ_ENET_RQ_DESC_FLAGS_EOP) ? + 1 : 0; + *sop = (completed_index_flags & CQ_ENET_RQ_DESC_FLAGS_SOP) ? + 1 : 0; + + *rss_type = (u8)((q_number_rss_type_flags >> CQ_DESC_Q_NUM_BITS) & + CQ_ENET_RQ_DESC_RSS_TYPE_MASK); + *csum_not_calc = (q_number_rss_type_flags & + CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ? 
1 : 0; + + *rss_hash = le32_to_cpu(desc->rss_hash); + + *bytes_written = bytes_written_flags & + CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK; + *packet_error = (bytes_written_flags & + CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ? 1 : 0; + *vlan_stripped = (bytes_written_flags & + CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) ? 1 : 0; + + *vlan = le16_to_cpu(desc->vlan); + + if (*fcoe) { + *fcoe_sof = (u8)(le16_to_cpu(desc->checksum_fcoe) & + CQ_ENET_RQ_DESC_FCOE_SOF_MASK); + *fcoe_fc_crc_ok = (desc->flags & + CQ_ENET_RQ_DESC_FCOE_FC_CRC_OK) ? 1 : 0; + *fcoe_enc_error = (desc->flags & + CQ_ENET_RQ_DESC_FCOE_ENC_ERROR) ? 1 : 0; + *fcoe_eof = (u8)((desc->checksum_fcoe >> + CQ_ENET_RQ_DESC_FCOE_EOF_SHIFT) & + CQ_ENET_RQ_DESC_FCOE_EOF_MASK); + *checksum = 0; + } else { + *fcoe_sof = 0; + *fcoe_fc_crc_ok = 0; + *fcoe_enc_error = 0; + *fcoe_eof = 0; + *checksum = le16_to_cpu(desc->checksum_fcoe); + } + + *tcp_udp_csum_ok = + (desc->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ? 1 : 0; + *udp = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_UDP) ? 1 : 0; + *tcp = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_TCP) ? 1 : 0; + *ipv4_csum_ok = + (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ? 1 : 0; + *ipv6 = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV6) ? 1 : 0; + *ipv4 = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4) ? 1 : 0; + *ipv4_fragment = + (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT) ? 1 : 0; + *fcs_ok = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_FCS_OK) ? 1 : 0; +} + +#endif /* _CQ_ENET_DESC_H_ */ diff --git a/drivers/scsi/fnic/cq_exch_desc.h b/drivers/scsi/fnic/cq_exch_desc.h new file mode 100644 index 00000000000..501660cfe22 --- /dev/null +++ b/drivers/scsi/fnic/cq_exch_desc.h @@ -0,0 +1,182 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. 
+ * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CQ_EXCH_DESC_H_ +#define _CQ_EXCH_DESC_H_ + +#include "cq_desc.h" + +/* Exchange completion queue descriptor: 16B */ +struct cq_exch_wq_desc { + u16 completed_index; + u16 q_number; + u16 exchange_id; + u8 tmpl; + u8 reserved0; + u32 reserved1; + u8 exch_status; + u8 reserved2[2]; + u8 type_color; +}; + +#define CQ_EXCH_WQ_STATUS_BITS 2 +#define CQ_EXCH_WQ_STATUS_MASK ((1 << CQ_EXCH_WQ_STATUS_BITS) - 1) + +enum cq_exch_status_types { + CQ_EXCH_WQ_STATUS_TYPE_COMPLETE = 0, + CQ_EXCH_WQ_STATUS_TYPE_ABORT = 1, + CQ_EXCH_WQ_STATUS_TYPE_SGL_EOF = 2, + CQ_EXCH_WQ_STATUS_TYPE_TMPL_ERR = 3, +}; + +static inline void cq_exch_wq_desc_dec(struct cq_exch_wq_desc *desc_ptr, + u8 *type, + u8 *color, + u16 *q_number, + u16 *completed_index, + u8 *exch_status) +{ + cq_desc_dec((struct cq_desc *)desc_ptr, type, + color, q_number, completed_index); + *exch_status = desc_ptr->exch_status & CQ_EXCH_WQ_STATUS_MASK; +} + +struct cq_fcp_rq_desc { + u16 completed_index_eop_sop_prt; + u16 q_number; + u16 exchange_id; + u16 tmpl; + u16 bytes_written; + u16 vlan; + u8 sof; + u8 eof; + u8 fcs_fer_fck; + u8 type_color; +}; + +#define CQ_FCP_RQ_DESC_FLAGS_SOP (1 << 15) +#define CQ_FCP_RQ_DESC_FLAGS_EOP (1 << 14) +#define CQ_FCP_RQ_DESC_FLAGS_PRT (1 << 12) +#define 
CQ_FCP_RQ_DESC_TMPL_MASK 0x1f +#define CQ_FCP_RQ_DESC_BYTES_WRITTEN_MASK 0x3fff +#define CQ_FCP_RQ_DESC_PACKET_ERR_SHIFT 14 +#define CQ_FCP_RQ_DESC_PACKET_ERR_MASK (1 << CQ_FCP_RQ_DESC_PACKET_ERR_SHIFT) +#define CQ_FCP_RQ_DESC_VS_STRIPPED_SHIFT 15 +#define CQ_FCP_RQ_DESC_VS_STRIPPED_MASK (1 << CQ_FCP_RQ_DESC_VS_STRIPPED_SHIFT) +#define CQ_FCP_RQ_DESC_FC_CRC_OK_MASK 0x1 +#define CQ_FCP_RQ_DESC_FCOE_ERR_SHIFT 1 +#define CQ_FCP_RQ_DESC_FCOE_ERR_MASK (1 << CQ_FCP_RQ_DESC_FCOE_ERR_SHIFT) +#define CQ_FCP_RQ_DESC_FCS_OK_SHIFT 7 +#define CQ_FCP_RQ_DESC_FCS_OK_MASK (1 << CQ_FCP_RQ_DESC_FCS_OK_SHIFT) + +static inline void cq_fcp_rq_desc_dec(struct cq_fcp_rq_desc *desc_ptr, + u8 *type, + u8 *color, + u16 *q_number, + u16 *completed_index, + u8 *eop, + u8 *sop, + u8 *fck, + u16 *exchange_id, + u16 *tmpl, + u32 *bytes_written, + u8 *sof, + u8 *eof, + u8 *ingress_port, + u8 *packet_err, + u8 *fcoe_err, + u8 *fcs_ok, + u8 *vlan_stripped, + u16 *vlan) +{ + cq_desc_dec((struct cq_desc *)desc_ptr, type, + color, q_number, completed_index); + *eop = (desc_ptr->completed_index_eop_sop_prt & + CQ_FCP_RQ_DESC_FLAGS_EOP) ? 1 : 0; + *sop = (desc_ptr->completed_index_eop_sop_prt & + CQ_FCP_RQ_DESC_FLAGS_SOP) ? 1 : 0; + *ingress_port = + (desc_ptr->completed_index_eop_sop_prt & + CQ_FCP_RQ_DESC_FLAGS_PRT) ? 
1 : 0; + *exchange_id = desc_ptr->exchange_id; + *tmpl = desc_ptr->tmpl & CQ_FCP_RQ_DESC_TMPL_MASK; + *bytes_written = + desc_ptr->bytes_written & CQ_FCP_RQ_DESC_BYTES_WRITTEN_MASK; + *packet_err = + (desc_ptr->bytes_written & CQ_FCP_RQ_DESC_PACKET_ERR_MASK) >> + CQ_FCP_RQ_DESC_PACKET_ERR_SHIFT; + *vlan_stripped = + (desc_ptr->bytes_written & CQ_FCP_RQ_DESC_VS_STRIPPED_MASK) >> + CQ_FCP_RQ_DESC_VS_STRIPPED_SHIFT; + *vlan = desc_ptr->vlan; + *sof = desc_ptr->sof; + *fck = desc_ptr->fcs_fer_fck & CQ_FCP_RQ_DESC_FC_CRC_OK_MASK; + *fcoe_err = (desc_ptr->fcs_fer_fck & CQ_FCP_RQ_DESC_FCOE_ERR_MASK) >> + CQ_FCP_RQ_DESC_FCOE_ERR_SHIFT; + *eof = desc_ptr->eof; + *fcs_ok = + (desc_ptr->fcs_fer_fck & CQ_FCP_RQ_DESC_FCS_OK_MASK) >> + CQ_FCP_RQ_DESC_FCS_OK_SHIFT; +} + +struct cq_sgl_desc { + u16 exchange_id; + u16 q_number; + u32 active_burst_offset; + u32 tot_data_bytes; + u16 tmpl; + u8 sgl_err; + u8 type_color; +}; + +enum cq_sgl_err_types { + CQ_SGL_ERR_NO_ERROR = 0, + CQ_SGL_ERR_OVERFLOW, /* data ran beyond end of SGL */ + CQ_SGL_ERR_SGL_LCL_ADDR_ERR, /* sgl access to local vnic addr illegal*/ + CQ_SGL_ERR_ADDR_RSP_ERR, /* sgl address error */ + CQ_SGL_ERR_DATA_RSP_ERR, /* sgl data rsp error */ + CQ_SGL_ERR_CNT_ZERO_ERR, /* SGL count is 0 */ + CQ_SGL_ERR_CNT_MAX_ERR, /* SGL count is larger than supported */ + CQ_SGL_ERR_ORDER_ERR, /* frames recv on both ports, order err */ + CQ_SGL_ERR_DATA_LCL_ADDR_ERR,/* sgl data buf to local vnic addr ill */ + CQ_SGL_ERR_HOST_CQ_ERR, /* host cq entry to local vnic addr ill */ +}; + +#define CQ_SGL_SGL_ERR_MASK 0x1f +#define CQ_SGL_TMPL_MASK 0x1f + +static inline void cq_sgl_desc_dec(struct cq_sgl_desc *desc_ptr, + u8 *type, + u8 *color, + u16 *q_number, + u16 *exchange_id, + u32 *active_burst_offset, + u32 *tot_data_bytes, + u16 *tmpl, + u8 *sgl_err) +{ + /* Cheat a little by assuming exchange_id is the same as completed + index */ + cq_desc_dec((struct cq_desc *)desc_ptr, type, color, q_number, + exchange_id); + *active_burst_offset = 
desc_ptr->active_burst_offset; + *tot_data_bytes = desc_ptr->tot_data_bytes; + *tmpl = desc_ptr->tmpl & CQ_SGL_TMPL_MASK; + *sgl_err = desc_ptr->sgl_err & CQ_SGL_SGL_ERR_MASK; +} + +#endif /* _CQ_EXCH_DESC_H_ */ diff --git a/drivers/scsi/fnic/fcpio.h b/drivers/scsi/fnic/fcpio.h new file mode 100644 index 00000000000..12d770d885c --- /dev/null +++ b/drivers/scsi/fnic/fcpio.h @@ -0,0 +1,780 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _FCPIO_H_ +#define _FCPIO_H_ + +#include + +/* + * This header file includes all of the data structures used for + * communication by the host driver to the fcp firmware. 
+ */ + +/* + * Exchange and sequence id space allocated to the host driver + */ +#define FCPIO_HOST_EXCH_RANGE_START 0x1000 +#define FCPIO_HOST_EXCH_RANGE_END 0x1fff +#define FCPIO_HOST_SEQ_ID_RANGE_START 0x80 +#define FCPIO_HOST_SEQ_ID_RANGE_END 0xff + +/* + * Command entry type + */ +enum fcpio_type { + /* + * Initiator request types + */ + FCPIO_ICMND_16 = 0x1, + FCPIO_ICMND_32, + FCPIO_ICMND_CMPL, + FCPIO_ITMF, + FCPIO_ITMF_CMPL, + + /* + * Target request types + */ + FCPIO_TCMND_16 = 0x11, + FCPIO_TCMND_32, + FCPIO_TDATA, + FCPIO_TXRDY, + FCPIO_TRSP, + FCPIO_TDRSP_CMPL, + FCPIO_TTMF, + FCPIO_TTMF_ACK, + FCPIO_TABORT, + FCPIO_TABORT_CMPL, + + /* + * Misc request types + */ + FCPIO_ACK = 0x20, + FCPIO_RESET, + FCPIO_RESET_CMPL, + FCPIO_FLOGI_REG, + FCPIO_FLOGI_REG_CMPL, + FCPIO_ECHO, + FCPIO_ECHO_CMPL, + FCPIO_LUNMAP_CHNG, + FCPIO_LUNMAP_REQ, + FCPIO_LUNMAP_REQ_CMPL, + FCPIO_FLOGI_FIP_REG, + FCPIO_FLOGI_FIP_REG_CMPL, +}; + +/* + * Header status codes from the firmware + */ +enum fcpio_status { + FCPIO_SUCCESS = 0, /* request was successful */ + + /* + * If a request to the firmware is rejected, the original request + * header will be returned with the status set to one of the following: + */ + FCPIO_INVALID_HEADER, /* header contains invalid data */ + FCPIO_OUT_OF_RESOURCE, /* out of resources to complete request */ + FCPIO_INVALID_PARAM, /* some parameter in request is invalid */ + FCPIO_REQ_NOT_SUPPORTED, /* request type is not supported */ + FCPIO_IO_NOT_FOUND, /* requested I/O was not found */ + + /* + * Once a request is processed, the firmware will usually return + * a cmpl message type. 
In cases where errors occurred, + * the header status field would be filled in with one of the following: + */ + FCPIO_ABORTED = 0x41, /* request was aborted */ + FCPIO_TIMEOUT, /* request was timed out */ + FCPIO_SGL_INVALID, /* request was aborted due to sgl error */ + FCPIO_MSS_INVALID, /* request was aborted due to mss error */ + FCPIO_DATA_CNT_MISMATCH, /* recv/sent more/less data than exp. */ + FCPIO_FW_ERR, /* request was terminated due to fw error */ + FCPIO_ITMF_REJECTED, /* itmf req was rejected by remote node */ + FCPIO_ITMF_FAILED, /* itmf req was failed by remote node */ + FCPIO_ITMF_INCORRECT_LUN, /* itmf req targeted incorrect LUN */ + FCPIO_CMND_REJECTED, /* request was invalid and rejected */ + FCPIO_NO_PATH_AVAIL, /* no paths to the lun was available */ + FCPIO_PATH_FAILED, /* i/o sent to current path failed */ + FCPIO_LUNMAP_CHNG_PEND, /* i/o rejected due to lunmap change */ +}; + +/* + * The header command tag. All host requests will use the "tag" field + * to mark commands with a unique tag. When the firmware responds to + * a host request, it will copy the tag field into the response. + * + * The only firmware requests that will use the rx_id/ox_id fields instead + * of the tag field will be the target command and target task management + * requests. These two requests do not have corresponding host requests + * since they come directly from the FC initiator on the network. 
+ */ +struct fcpio_tag { + union { + u32 req_id; + struct { + u16 rx_id; + u16 ox_id; + } ex_id; + } u; +}; + +static inline void +fcpio_tag_id_enc(struct fcpio_tag *tag, u32 id) +{ + tag->u.req_id = id; +} + +static inline void +fcpio_tag_id_dec(struct fcpio_tag *tag, u32 *id) +{ + *id = tag->u.req_id; +} + +static inline void +fcpio_tag_exid_enc(struct fcpio_tag *tag, u16 ox_id, u16 rx_id) +{ + tag->u.ex_id.rx_id = rx_id; + tag->u.ex_id.ox_id = ox_id; +} + +static inline void +fcpio_tag_exid_dec(struct fcpio_tag *tag, u16 *ox_id, u16 *rx_id) +{ + *rx_id = tag->u.ex_id.rx_id; + *ox_id = tag->u.ex_id.ox_id; +} + +/* + * The header for an fcpio request, whether from the firmware or from the + * host driver + */ +struct fcpio_header { + u8 type; /* enum fcpio_type */ + u8 status; /* header status entry */ + u16 _resvd; /* reserved */ + struct fcpio_tag tag; /* header tag */ +}; + +static inline void +fcpio_header_enc(struct fcpio_header *hdr, + u8 type, u8 status, + struct fcpio_tag tag) +{ + hdr->type = type; + hdr->status = status; + hdr->_resvd = 0; + hdr->tag = tag; +} + +static inline void +fcpio_header_dec(struct fcpio_header *hdr, + u8 *type, u8 *status, + struct fcpio_tag *tag) +{ + *type = hdr->type; + *status = hdr->status; + *tag = hdr->tag; +} + +#define CDB_16 16 +#define CDB_32 32 +#define LUN_ADDRESS 8 + +/* + * fcpio_icmnd_16: host -> firmware request + * + * used for sending out an initiator SCSI 16-byte command + */ +struct fcpio_icmnd_16 { + u32 lunmap_id; /* index into lunmap table */ + u8 special_req_flags; /* special exchange request flags */ + u8 _resvd0[3]; /* reserved */ + u32 sgl_cnt; /* scatter-gather list count */ + u32 sense_len; /* sense buffer length */ + u64 sgl_addr; /* scatter-gather list addr */ + u64 sense_addr; /* sense buffer address */ + u8 crn; /* SCSI Command Reference No. 
*/ + u8 pri_ta; /* SCSI Priority and Task attribute */ + u8 _resvd1; /* reserved: should be 0 */ + u8 flags; /* command flags */ + u8 scsi_cdb[CDB_16]; /* SCSI Cmnd Descriptor Block */ + u32 data_len; /* length of data expected */ + u8 lun[LUN_ADDRESS]; /* FC vNIC only: LUN address */ + u8 _resvd2; /* reserved */ + u8 d_id[3]; /* FC vNIC only: Target D_ID */ + u16 mss; /* FC vNIC only: max burst */ + u16 _resvd3; /* reserved */ + u32 r_a_tov; /* FC vNIC only: Res. Alloc Timeout */ + u32 e_d_tov; /* FC vNIC only: Err Detect Timeout */ +}; + +/* + * Special request flags + */ +#define FCPIO_ICMND_SRFLAG_RETRY 0x01 /* Enable Retry handling on exchange */ + +/* + * Priority/Task Attribute settings + */ +#define FCPIO_ICMND_PTA_SIMPLE 0 /* simple task attribute */ +#define FCPIO_ICMND_PTA_HEADQ 1 /* head of queue task attribute */ +#define FCPIO_ICMND_PTA_ORDERED 2 /* ordered task attribute */ +#define FCPIO_ICMND_PTA_ACA 4 /* auto contingent allegiance */ +#define FCPIO_ICMND_PRI_SHIFT 3 /* priority field starts in bit 3 */ + +/* + * Command flags + */ +#define FCPIO_ICMND_RDDATA 0x02 /* read data */ +#define FCPIO_ICMND_WRDATA 0x01 /* write data */ + +/* + * fcpio_icmnd_32: host -> firmware request + * + * used for sending out an initiator SCSI 32-byte command + */ +struct fcpio_icmnd_32 { + u32 lunmap_id; /* index into lunmap table */ + u8 special_req_flags; /* special exchange request flags */ + u8 _resvd0[3]; /* reserved */ + u32 sgl_cnt; /* scatter-gather list count */ + u32 sense_len; /* sense buffer length */ + u64 sgl_addr; /* scatter-gather list addr */ + u64 sense_addr; /* sense buffer address */ + u8 crn; /* SCSI Command Reference No. 
*/ + u8 pri_ta; /* SCSI Priority and Task attribute */ + u8 _resvd1; /* reserved: should be 0 */ + u8 flags; /* command flags */ + u8 scsi_cdb[CDB_32]; /* SCSI Cmnd Descriptor Block */ + u32 data_len; /* length of data expected */ + u8 lun[LUN_ADDRESS]; /* FC vNIC only: LUN address */ + u8 _resvd2; /* reserved */ + u8 d_id[3]; /* FC vNIC only: Target D_ID */ + u16 mss; /* FC vNIC only: max burst */ + u16 _resvd3; /* reserved */ + u32 r_a_tov; /* FC vNIC only: Res. Alloc Timeout */ + u32 e_d_tov; /* FC vNIC only: Error Detect Timeout */ +}; + +/* + * fcpio_itmf: host -> firmware request + * + * used for requesting the firmware to abort a request and/or send out + * a task management function + * + * The t_tag field is only needed when the request type is ABT_TASK. + */ +struct fcpio_itmf { + u32 lunmap_id; /* index into lunmap table */ + u32 tm_req; /* SCSI Task Management request */ + u32 t_tag; /* header tag of fcpio to be aborted */ + u32 _resvd; /* _reserved */ + u8 lun[LUN_ADDRESS]; /* FC vNIC only: LUN address */ + u8 _resvd1; /* reserved */ + u8 d_id[3]; /* FC vNIC only: Target D_ID */ + u32 r_a_tov; /* FC vNIC only: R_A_TOV in msec */ + u32 e_d_tov; /* FC vNIC only: E_D_TOV in msec */ +}; + +/* + * Task Management request + */ +enum fcpio_itmf_tm_req_type { + FCPIO_ITMF_ABT_TASK_TERM = 0x01, /* abort task and terminate */ + FCPIO_ITMF_ABT_TASK, /* abort task and issue abts */ + FCPIO_ITMF_ABT_TASK_SET, /* abort task set */ + FCPIO_ITMF_CLR_TASK_SET, /* clear task set */ + FCPIO_ITMF_LUN_RESET, /* logical unit reset task mgmt */ + FCPIO_ITMF_CLR_ACA, /* Clear ACA condition */ +}; + +/* + * fcpio_tdata: host -> firmware request + * + * used for requesting the firmware to send out a read data transfer for a + * target command + */ +struct fcpio_tdata { + u16 rx_id; /* FC rx_id of target command */ + u16 flags; /* command flags */ + u32 rel_offset; /* data sequence relative offset */ + u32 sgl_cnt; /* scatter-gather list count */ + u32 data_len; /* length of 
data expected to send */ + u64 sgl_addr; /* scatter-gather list address */ +}; + +/* + * Command flags + */ +#define FCPIO_TDATA_SCSI_RSP 0x01 /* send a scsi resp. after last frame */ + +/* + * fcpio_txrdy: host -> firmware request + * + * used for requesting the firmware to send out a write data transfer for a + * target command + */ +struct fcpio_txrdy { + u16 rx_id; /* FC rx_id of target command */ + u16 _resvd0; /* reserved */ + u32 rel_offset; /* data sequence relative offset */ + u32 sgl_cnt; /* scatter-gather list count */ + u32 data_len; /* length of data expected to send */ + u64 sgl_addr; /* scatter-gather list address */ +}; + +/* + * fcpio_trsp: host -> firmware request + * + * used for requesting the firmware to send out a response for a target + * command + */ +struct fcpio_trsp { + u16 rx_id; /* FC rx_id of target command */ + u16 _resvd0; /* reserved */ + u32 sense_len; /* sense data buffer length */ + u64 sense_addr; /* sense data buffer address */ + u16 _resvd1; /* reserved */ + u8 flags; /* response request flags */ + u8 scsi_status; /* SCSI status */ + u32 residual; /* SCSI data residual value of I/O */ +}; + +/* + * response request flags + */ +#define FCPIO_TRSP_RESID_UNDER 0x08 /* residual is valid and is underflow */ +#define FCPIO_TRSP_RESID_OVER 0x04 /* residual is valid and is overflow */ + +/* + * fcpio_ttmf_ack: host -> firmware response + * + * used by the host to indicate to the firmware it has received and processed + * the target tmf request + */ +struct fcpio_ttmf_ack { + u16 rx_id; /* FC rx_id of target command */ + u16 _resvd0; /* reserved */ + u32 tmf_status; /* SCSI task management status */ +}; + +/* + * fcpio_tabort: host -> firmware request + * + * used by the host to request the firmware to abort a target request that was + * received by the firmware + */ +struct fcpio_tabort { + u16 rx_id; /* rx_id of the target request */ +}; + +/* + * fcpio_reset: host -> firmware request + * + * used by the host to signal a reset of 
the driver to the firmware + * and to request firmware to clean up all outstanding I/O + */ +struct fcpio_reset { + u32 _resvd; +}; + +enum fcpio_flogi_reg_format_type { + FCPIO_FLOGI_REG_DEF_DEST = 0, /* Use the oui | s_id mac format */ + FCPIO_FLOGI_REG_GW_DEST, /* Use the fixed gateway mac */ +}; + +/* + * fcpio_flogi_reg: host -> firmware request + * + * fc vnic only + * used by the host to notify the firmware of the lif's s_id + * and destination mac address format + */ +struct fcpio_flogi_reg { + u8 format; + u8 s_id[3]; /* FC vNIC only: Source S_ID */ + u8 gateway_mac[ETH_ALEN]; /* Destination gateway mac */ + u16 _resvd; + u32 r_a_tov; /* R_A_TOV in msec */ + u32 e_d_tov; /* E_D_TOV in msec */ +}; + +/* + * fcpio_echo: host -> firmware request + * + * sends a heartbeat echo request to the firmware + */ +struct fcpio_echo { + u32 _resvd; +}; + +/* + * fcpio_lunmap_req: host -> firmware request + * + * scsi vnic only + * sends a request to retrieve the lunmap table for scsi vnics + */ +struct fcpio_lunmap_req { + u64 addr; /* address of the buffer */ + u32 len; /* len of the buffer */ +}; + +/* + * fcpio_flogi_fip_reg: host -> firmware request + * + * fc vnic only + * used by the host to notify the firmware of the lif's s_id + * and destination mac address format + */ +struct fcpio_flogi_fip_reg { + u8 _resvd0; + u8 s_id[3]; /* FC vNIC only: Source S_ID */ + u8 fcf_mac[ETH_ALEN]; /* FCF Target destination mac */ + u16 _resvd1; + u32 r_a_tov; /* R_A_TOV in msec */ + u32 e_d_tov; /* E_D_TOV in msec */ + u8 ha_mac[ETH_ALEN]; /* Host adapter source mac */ + u16 _resvd2; +}; + +/* + * Basic structure for all fcpio structures that are sent from the host to the + * firmware. They are 128 bytes per structure. 
+ */ +#define FCPIO_HOST_REQ_LEN 128 /* expected length of host requests */ + +struct fcpio_host_req { + struct fcpio_header hdr; + + union { + /* + * Defines space needed for request + */ + u8 buf[FCPIO_HOST_REQ_LEN - sizeof(struct fcpio_header)]; + + /* + * Initiator host requests + */ + struct fcpio_icmnd_16 icmnd_16; + struct fcpio_icmnd_32 icmnd_32; + struct fcpio_itmf itmf; + + /* + * Target host requests + */ + struct fcpio_tdata tdata; + struct fcpio_txrdy txrdy; + struct fcpio_trsp trsp; + struct fcpio_ttmf_ack ttmf_ack; + struct fcpio_tabort tabort; + + /* + * Misc requests + */ + struct fcpio_reset reset; + struct fcpio_flogi_reg flogi_reg; + struct fcpio_echo echo; + struct fcpio_lunmap_req lunmap_req; + struct fcpio_flogi_fip_reg flogi_fip_reg; + } u; +}; + +/* + * fcpio_icmnd_cmpl: firmware -> host response + * + * used for sending the host a response to an initiator command + */ +struct fcpio_icmnd_cmpl { + u8 _resvd0[6]; /* reserved */ + u8 flags; /* response flags */ + u8 scsi_status; /* SCSI status */ + u32 residual; /* SCSI data residual length */ + u32 sense_len; /* SCSI sense length */ +}; + +/* + * response flags + */ +#define FCPIO_ICMND_CMPL_RESID_UNDER 0x08 /* resid under and valid */ +#define FCPIO_ICMND_CMPL_RESID_OVER 0x04 /* resid over and valid */ + +/* + * fcpio_itmf_cmpl: firmware -> host response + * + * used for sending the host a response for a itmf request + */ +struct fcpio_itmf_cmpl { + u32 _resvd; /* reserved */ +}; + +/* + * fcpio_tcmnd_16: firmware -> host request + * + * used by the firmware to notify the host of an incoming target SCSI 16-Byte + * request + */ +struct fcpio_tcmnd_16 { + u8 lun[LUN_ADDRESS]; /* FC vNIC only: LUN address */ + u8 crn; /* SCSI Command Reference No. 
*/ + u8 pri_ta; /* SCSI Priority and Task attribute */ + u8 _resvd2; /* reserved: should be 0 */ + u8 flags; /* command flags */ + u8 scsi_cdb[CDB_16]; /* SCSI Cmnd Descriptor Block */ + u32 data_len; /* length of data expected */ + u8 _resvd1; /* reserved */ + u8 s_id[3]; /* FC vNIC only: Source S_ID */ +}; + +/* + * Priority/Task Attribute settings + */ +#define FCPIO_TCMND_PTA_SIMPLE 0 /* simple task attribute */ +#define FCPIO_TCMND_PTA_HEADQ 1 /* head of queue task attribute */ +#define FCPIO_TCMND_PTA_ORDERED 2 /* ordered task attribute */ +#define FCPIO_TCMND_PTA_ACA 4 /* auto contingent allegiance */ +#define FCPIO_TCMND_PRI_SHIFT 3 /* priority field starts in bit 3 */ + +/* + * Command flags + */ +#define FCPIO_TCMND_RDDATA 0x02 /* read data */ +#define FCPIO_TCMND_WRDATA 0x01 /* write data */ + +/* + * fcpio_tcmnd_32: firmware -> host request + * + * used by the firmware to notify the host of an incoming target SCSI 32-Byte + * request + */ +struct fcpio_tcmnd_32 { + u8 lun[LUN_ADDRESS]; /* FC vNIC only: LUN address */ + u8 crn; /* SCSI Command Reference No. */ + u8 pri_ta; /* SCSI Priority and Task attribute */ + u8 _resvd2; /* reserved: should be 0 */ + u8 flags; /* command flags */ + u8 scsi_cdb[CDB_32]; /* SCSI Cmnd Descriptor Block */ + u32 data_len; /* length of data expected */ + u8 _resvd0; /* reserved */ + u8 s_id[3]; /* FC vNIC only: Source S_ID */ +}; + +/* + * fcpio_tdrsp_cmpl: firmware -> host response + * + * used by the firmware to notify the host of a response to a host target + * command + */ +struct fcpio_tdrsp_cmpl { + u16 rx_id; /* rx_id of the target request */ + u16 _resvd0; /* reserved */ +}; + +/* + * fcpio_ttmf: firmware -> host request + * + * used by the firmware to notify the host of an incoming task management + * function request + */ +struct fcpio_ttmf { + u8 _resvd0; /* reserved */ + u8 s_id[3]; /* FC vNIC only: Source S_ID */ + u8 lun[LUN_ADDRESS]; /* FC vNIC only: LUN address */ + u8 crn; /* SCSI Command Reference No. 
*/ + u8 _resvd2[3]; /* reserved */ + u32 tmf_type; /* task management request type */ +}; + +/* + * Task Management request + */ +#define FCPIO_TTMF_CLR_ACA 0x40 /* Clear ACA condition */ +#define FCPIO_TTMF_LUN_RESET 0x10 /* logical unit reset task mgmt */ +#define FCPIO_TTMF_CLR_TASK_SET 0x04 /* clear task set */ +#define FCPIO_TTMF_ABT_TASK_SET 0x02 /* abort task set */ +#define FCPIO_TTMF_ABT_TASK 0x01 /* abort task */ + +/* + * fcpio_tabort_cmpl: firmware -> host response + * + * used by the firmware to respond to a host's tabort request + */ +struct fcpio_tabort_cmpl { + u16 rx_id; /* rx_id of the target request */ + u16 _resvd0; /* reserved */ +}; + +/* + * fcpio_ack: firmware -> host response + * + * used by firmware to notify the host of the last work request received + */ +struct fcpio_ack { + u16 request_out; /* last host entry received */ + u16 _resvd; +}; + +/* + * fcpio_reset_cmpl: firmware -> host response + * + * used by firmware to respond to the host's reset request + */ +struct fcpio_reset_cmpl { + u16 vnic_id; +}; + +/* + * fcpio_flogi_reg_cmpl: firmware -> host response + * + * fc vnic only + * response to the fcpio_flogi_reg request + */ +struct fcpio_flogi_reg_cmpl { + u32 _resvd; +}; + +/* + * fcpio_echo_cmpl: firmware -> host response + * + * response to the fcpio_echo request + */ +struct fcpio_echo_cmpl { + u32 _resvd; +}; + +/* + * fcpio_lunmap_chng: firmware -> host notification + * + * scsi vnic only + * notifies the host that the lunmap tables have changed + */ +struct fcpio_lunmap_chng { + u32 _resvd; +}; + +/* + * fcpio_lunmap_req_cmpl: firmware -> host response + * + * scsi vnic only + * response for lunmap table request from the host + */ +struct fcpio_lunmap_req_cmpl { + u32 _resvd; +}; + +/* + * Basic structure for all fcpio structures that are sent from the firmware to + * the host. They are 64 bytes per structure. 
+ */ +#define FCPIO_FW_REQ_LEN 64 /* expected length of fw requests */ +struct fcpio_fw_req { + struct fcpio_header hdr; + + union { + /* + * Defines space needed for request + */ + u8 buf[FCPIO_FW_REQ_LEN - sizeof(struct fcpio_header)]; + + /* + * Initiator firmware responses + */ + struct fcpio_icmnd_cmpl icmnd_cmpl; + struct fcpio_itmf_cmpl itmf_cmpl; + + /* + * Target firmware new requests + */ + struct fcpio_tcmnd_16 tcmnd_16; + struct fcpio_tcmnd_32 tcmnd_32; + + /* + * Target firmware responses + */ + struct fcpio_tdrsp_cmpl tdrsp_cmpl; + struct fcpio_ttmf ttmf; + struct fcpio_tabort_cmpl tabort_cmpl; + + /* + * Firmware response to work received + */ + struct fcpio_ack ack; + + /* + * Misc requests + */ + struct fcpio_reset_cmpl reset_cmpl; + struct fcpio_flogi_reg_cmpl flogi_reg_cmpl; + struct fcpio_echo_cmpl echo_cmpl; + struct fcpio_lunmap_chng lunmap_chng; + struct fcpio_lunmap_req_cmpl lunmap_req_cmpl; + } u; +}; + +/* + * Access routines to encode and decode the color bit, which is the most + * significant bit of the MSB of the structure + */ +static inline void fcpio_color_enc(struct fcpio_fw_req *fw_req, u8 color) +{ + u8 *c = ((u8 *) fw_req) + sizeof(struct fcpio_fw_req) - 1; + + if (color) + *c |= 0x80; + else + *c &= ~0x80; +} + +static inline void fcpio_color_dec(struct fcpio_fw_req *fw_req, u8 *color) +{ + u8 *c = ((u8 *) fw_req) + sizeof(struct fcpio_fw_req) - 1; + + *color = *c >> 7; + + /* + * Make sure color bit is read from desc *before* other fields + * are read from desc. Hardware guarantees color bit is last + * bit (byte) written. Adding the rmb() prevents the compiler + * and/or CPU from reordering the reads which would potentially + * result in reading stale values. 
+ */ + + rmb(); + +} + +/* + * Lunmap table entry for scsi vnics + */ +#define FCPIO_LUNMAP_TABLE_SIZE 256 +#define FCPIO_FLAGS_LUNMAP_VALID 0x80 +#define FCPIO_FLAGS_BOOT 0x01 +struct fcpio_lunmap_entry { + u8 bus; + u8 target; + u8 lun; + u8 path_cnt; + u16 flags; + u16 update_cnt; +}; + +struct fcpio_lunmap_tbl { + u32 update_cnt; + struct fcpio_lunmap_entry lunmaps[FCPIO_LUNMAP_TABLE_SIZE]; +}; + +#endif /* _FCPIO_H_ */ diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h new file mode 100644 index 00000000000..e4c0a3d7d87 --- /dev/null +++ b/drivers/scsi/fnic/fnic.h @@ -0,0 +1,265 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _FNIC_H_ +#define _FNIC_H_ + +#include +#include +#include +#include +#include "fnic_io.h" +#include "fnic_res.h" +#include "vnic_dev.h" +#include "vnic_wq.h" +#include "vnic_rq.h" +#include "vnic_cq.h" +#include "vnic_wq_copy.h" +#include "vnic_intr.h" +#include "vnic_stats.h" +#include "vnic_scsi.h" + +#define DRV_NAME "fnic" +#define DRV_DESCRIPTION "Cisco FCoE HBA Driver" +#define DRV_VERSION "1.0.0.1121" +#define PFX DRV_NAME ": " +#define DFX DRV_NAME "%d: " + +#define DESC_CLEAN_LOW_WATERMARK 8 +#define FNIC_MAX_IO_REQ 2048 /* scsi_cmnd tag map entries */ +#define FNIC_IO_LOCKS 64 /* IO locks: power of 2 */ +#define FNIC_DFLT_QUEUE_DEPTH 32 +#define FNIC_STATS_RATE_LIMIT 4 /* limit rate at which stats are pulled up */ + +/* + * Tag bits used for special requests. + */ +#define BIT(nr) (1UL << (nr)) +#define FNIC_TAG_ABORT BIT(30) /* tag bit indicating abort */ +#define FNIC_TAG_DEV_RST BIT(29) /* indicates device reset */ +#define FNIC_TAG_MASK (BIT(24) - 1) /* mask for lookup */ +#define FNIC_NO_TAG -1 + +/* + * Usage of the scsi_cmnd scratchpad. + * These fields are locked by the hashed io_req_lock. 
+ */ +#define CMD_SP(Cmnd) ((Cmnd)->SCp.ptr) +#define CMD_STATE(Cmnd) ((Cmnd)->SCp.phase) +#define CMD_ABTS_STATUS(Cmnd) ((Cmnd)->SCp.Message) +#define CMD_LR_STATUS(Cmnd) ((Cmnd)->SCp.have_data_in) +#define CMD_TAG(Cmnd) ((Cmnd)->SCp.sent_command) + +#define FCPIO_INVALID_CODE 0x100 /* hdr_status value unused by firmware */ + +#define FNIC_LUN_RESET_TIMEOUT 10000 /* mSec */ +#define FNIC_HOST_RESET_TIMEOUT 10000 /* mSec */ +#define FNIC_RMDEVICE_TIMEOUT 1000 /* mSec */ +#define FNIC_HOST_RESET_SETTLE_TIME 30 /* Sec */ + +#define FNIC_MAX_FCP_TARGET 256 + +extern unsigned int fnic_log_level; /* bitmask of FNIC_*_LOGGING bits */ + +#define FNIC_MAIN_LOGGING 0x01 +#define FNIC_FCS_LOGGING 0x02 +#define FNIC_SCSI_LOGGING 0x04 +#define FNIC_ISR_LOGGING 0x08 + +#define FNIC_CHECK_LOGGING(LEVEL, CMD) \ +do { \ + if (unlikely(fnic_log_level & (LEVEL))) \ + do { \ + CMD; \ + } while (0); \ +} while (0) + +#define FNIC_MAIN_DBG(kern_level, host, fmt, args...) \ + FNIC_CHECK_LOGGING(FNIC_MAIN_LOGGING, \ + shost_printk(kern_level, host, fmt, ##args);) + +#define FNIC_FCS_DBG(kern_level, host, fmt, args...) \ + FNIC_CHECK_LOGGING(FNIC_FCS_LOGGING, \ + shost_printk(kern_level, host, fmt, ##args);) + +#define FNIC_SCSI_DBG(kern_level, host, fmt, args...) \ + FNIC_CHECK_LOGGING(FNIC_SCSI_LOGGING, \ + shost_printk(kern_level, host, fmt, ##args);) + +#define FNIC_ISR_DBG(kern_level, host, fmt, args...) 
\ + FNIC_CHECK_LOGGING(FNIC_ISR_LOGGING, \ + shost_printk(kern_level, host, fmt, ##args);) + +extern const char *fnic_state_str[]; + +enum fnic_intx_intr_index { + FNIC_INTX_WQ_RQ_COPYWQ, + FNIC_INTX_ERR, + FNIC_INTX_NOTIFY, + FNIC_INTX_INTR_MAX, +}; + +enum fnic_msix_intr_index { + FNIC_MSIX_RQ, + FNIC_MSIX_WQ, + FNIC_MSIX_WQ_COPY, + FNIC_MSIX_ERR_NOTIFY, + FNIC_MSIX_INTR_MAX, +}; + +struct fnic_msix_entry { + int requested; + char devname[IFNAMSIZ]; + irqreturn_t (*isr)(int, void *); + void *devid; +}; + +enum fnic_state { + FNIC_IN_FC_MODE = 0, + FNIC_IN_FC_TRANS_ETH_MODE, + FNIC_IN_ETH_MODE, + FNIC_IN_ETH_TRANS_FC_MODE, +}; + +#define FNIC_WQ_COPY_MAX 1 +#define FNIC_WQ_MAX 1 +#define FNIC_RQ_MAX 1 +#define FNIC_CQ_MAX (FNIC_WQ_COPY_MAX + FNIC_WQ_MAX + FNIC_RQ_MAX) + +struct mempool; + +/* Per-instance private data structure */ +struct fnic { + struct fc_lport *lport; + struct vnic_dev_bar bar0; + + struct msix_entry msix_entry[FNIC_MSIX_INTR_MAX]; + struct fnic_msix_entry msix[FNIC_MSIX_INTR_MAX]; + + struct vnic_stats *stats; + unsigned long stats_time; /* time of stats update */ + struct vnic_nic_cfg *nic_cfg; + char name[IFNAMSIZ]; + struct timer_list notify_timer; /* used for MSI interrupts */ + + unsigned int err_intr_offset; + unsigned int link_intr_offset; + + unsigned int wq_count; + unsigned int cq_count; + + u32 fcoui_mode:1; /* use fcoui address*/ + u32 vlan_hw_insert:1; /* let hw insert the tag */ + u32 in_remove:1; /* fnic device in removal */ + u32 stop_rx_link_events:1; /* stop proc. 
rx frames, link events */ + + struct completion *remove_wait; /* device remove thread blocks */ + + struct fc_frame *flogi; + struct fc_frame *flogi_resp; + u16 flogi_oxid; + unsigned long s_id; + enum fnic_state state; + spinlock_t fnic_lock; + + u16 vlan_id; /* VLAN tag including priority */ + u8 mac_addr[ETH_ALEN]; + u8 dest_addr[ETH_ALEN]; + u8 data_src_addr[ETH_ALEN]; + u64 fcp_input_bytes; /* internal statistic */ + u64 fcp_output_bytes; /* internal statistic */ + u32 link_down_cnt; + int link_status; + + struct list_head list; + struct pci_dev *pdev; + struct vnic_fc_config config; + struct vnic_dev *vdev; + unsigned int raw_wq_count; + unsigned int wq_copy_count; + unsigned int rq_count; + int fw_ack_index[FNIC_WQ_COPY_MAX]; + unsigned short fw_ack_recd[FNIC_WQ_COPY_MAX]; + unsigned short wq_copy_desc_low[FNIC_WQ_COPY_MAX]; + unsigned int intr_count; + u32 __iomem *legacy_pba; + struct fnic_host_tag *tags; + mempool_t *io_req_pool; + mempool_t *io_sgl_pool[FNIC_SGL_NUM_CACHES]; + spinlock_t io_req_lock[FNIC_IO_LOCKS]; /* locks for scsi cmnds */ + + struct work_struct link_work; + struct work_struct frame_work; + struct sk_buff_head frame_queue; + + /* copy work queue cache line section */ + ____cacheline_aligned struct vnic_wq_copy wq_copy[FNIC_WQ_COPY_MAX]; + /* completion queue cache line section */ + ____cacheline_aligned struct vnic_cq cq[FNIC_CQ_MAX]; + + spinlock_t wq_copy_lock[FNIC_WQ_COPY_MAX]; + + /* work queue cache line section */ + ____cacheline_aligned struct vnic_wq wq[FNIC_WQ_MAX]; + spinlock_t wq_lock[FNIC_WQ_MAX]; + + /* receive queue cache line section */ + ____cacheline_aligned struct vnic_rq rq[FNIC_RQ_MAX]; + + /* interrupt resource cache line section */ + ____cacheline_aligned struct vnic_intr intr[FNIC_MSIX_INTR_MAX]; +}; + +extern struct workqueue_struct *fnic_event_queue; +extern struct device_attribute *fnic_attrs[]; + +void fnic_clear_intr_mode(struct fnic *fnic); +int fnic_set_intr_mode(struct fnic *fnic); +void 
fnic_free_intr(struct fnic *fnic); +int fnic_request_intr(struct fnic *fnic); + +int fnic_send(struct fc_lport *, struct fc_frame *); +void fnic_free_wq_buf(struct vnic_wq *wq, struct vnic_wq_buf *buf); +void fnic_handle_frame(struct work_struct *work); +void fnic_handle_link(struct work_struct *work); +int fnic_rq_cmpl_handler(struct fnic *fnic, int); +int fnic_alloc_rq_frame(struct vnic_rq *rq); +void fnic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf); +int fnic_send_frame(struct fnic *fnic, struct fc_frame *fp); + +int fnic_queuecommand(struct scsi_cmnd *, void (*done)(struct scsi_cmnd *)); +int fnic_abort_cmd(struct scsi_cmnd *); +int fnic_device_reset(struct scsi_cmnd *); +int fnic_host_reset(struct scsi_cmnd *); +int fnic_reset(struct Scsi_Host *); +void fnic_scsi_cleanup(struct fc_lport *); +void fnic_scsi_abort_io(struct fc_lport *); +void fnic_empty_scsi_cleanup(struct fc_lport *); +void fnic_exch_mgr_reset(struct fc_lport *, u32, u32); +int fnic_wq_copy_cmpl_handler(struct fnic *fnic, int); +int fnic_wq_cmpl_handler(struct fnic *fnic, int); +int fnic_flogi_reg_handler(struct fnic *fnic); +void fnic_wq_copy_cleanup_handler(struct vnic_wq_copy *wq, + struct fcpio_host_req *desc); +int fnic_fw_reset_handler(struct fnic *fnic); +void fnic_terminate_rport_io(struct fc_rport *); +const char *fnic_state_to_str(unsigned int state); + +void fnic_log_q_error(struct fnic *fnic); +void fnic_handle_link_event(struct fnic *fnic); + +#endif /* _FNIC_H_ */ diff --git a/drivers/scsi/fnic/fnic_attrs.c b/drivers/scsi/fnic/fnic_attrs.c new file mode 100644 index 00000000000..aea0c3becfd --- /dev/null +++ b/drivers/scsi/fnic/fnic_attrs.c @@ -0,0 +1,56 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. 
+ * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include "fnic.h" + +static ssize_t fnic_show_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct fc_lport *lp = shost_priv(class_to_shost(dev)); + struct fnic *fnic = lport_priv(lp); + + return snprintf(buf, PAGE_SIZE, "%s\n", fnic_state_to_str(fnic->state)); +} + +static ssize_t fnic_show_drv_version(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION); +} + +static ssize_t fnic_show_link_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct fc_lport *lp = shost_priv(class_to_shost(dev)); + + return snprintf(buf, PAGE_SIZE, "%s\n", (lp->link_up) + ? 
"Link Up" : "Link Down"); +} + +static DEVICE_ATTR(fnic_state, S_IRUGO, fnic_show_state, NULL); +static DEVICE_ATTR(drv_version, S_IRUGO, fnic_show_drv_version, NULL); +static DEVICE_ATTR(link_state, S_IRUGO, fnic_show_link_state, NULL); + +struct device_attribute *fnic_attrs[] = { + &dev_attr_fnic_state, + &dev_attr_drv_version, + &dev_attr_link_state, + NULL, +}; diff --git a/drivers/scsi/fnic/fnic_fcs.c b/drivers/scsi/fnic/fnic_fcs.c new file mode 100644 index 00000000000..07e6eedb83c --- /dev/null +++ b/drivers/scsi/fnic/fnic_fcs.c @@ -0,0 +1,742 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fnic_io.h" +#include "fnic.h" +#include "cq_enet_desc.h" +#include "cq_exch_desc.h" + +struct workqueue_struct *fnic_event_queue; + +void fnic_handle_link(struct work_struct *work) +{ + struct fnic *fnic = container_of(work, struct fnic, link_work); + unsigned long flags; + int old_link_status; + u32 old_link_down_cnt; + + spin_lock_irqsave(&fnic->fnic_lock, flags); + + if (fnic->stop_rx_link_events) { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + return; + } + + old_link_down_cnt = fnic->link_down_cnt; + old_link_status = fnic->link_status; + fnic->link_status = vnic_dev_link_status(fnic->vdev); + fnic->link_down_cnt = vnic_dev_link_down_cnt(fnic->vdev); + + if (old_link_status == fnic->link_status) { + if (!fnic->link_status) + /* DOWN -> DOWN */ + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + else { + if (old_link_down_cnt != fnic->link_down_cnt) { + /* UP -> DOWN -> UP */ + fnic->lport->host_stats.link_failure_count++; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, + "link down\n"); + fc_linkdown(fnic->lport); + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, + "link up\n"); + fc_linkup(fnic->lport); + } else + /* UP -> UP */ + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + } + } else if (fnic->link_status) { + /* DOWN -> UP */ + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, "link up\n"); + fc_linkup(fnic->lport); + } else { + /* UP -> DOWN */ + fnic->lport->host_stats.link_failure_count++; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, "link down\n"); + fc_linkdown(fnic->lport); + } + +} + +/* + * This function passes incoming fabric frames to libFC + */ +void fnic_handle_frame(struct work_struct *work) +{ + struct fnic *fnic = container_of(work, struct fnic, 
frame_work); + struct fc_lport *lp = fnic->lport; + unsigned long flags; + struct sk_buff *skb; + struct fc_frame *fp; + + while ((skb = skb_dequeue(&fnic->frame_queue))) { + + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->stop_rx_link_events) { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + dev_kfree_skb(skb); + return; + } + fp = (struct fc_frame *)skb; + /* if Flogi resp frame, register the address */ + if (fr_flags(fp)) { + vnic_dev_add_addr(fnic->vdev, + fnic->data_src_addr); + fr_flags(fp) = 0; + } + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + fc_exch_recv(lp, lp->emp, fp); + } + +} + +static inline void fnic_import_rq_fc_frame(struct sk_buff *skb, + u32 len, u8 sof, u8 eof) +{ + struct fc_frame *fp = (struct fc_frame *)skb; + + skb_trim(skb, len); + fr_eof(fp) = eof; + fr_sof(fp) = sof; +} + + +static inline int fnic_import_rq_eth_pkt(struct sk_buff *skb, u32 len) +{ + struct fc_frame *fp; + struct ethhdr *eh; + struct vlan_ethhdr *vh; + struct fcoe_hdr *fcoe_hdr; + struct fcoe_crc_eof *ft; + u32 transport_len = 0; + + eh = (struct ethhdr *)skb->data; + vh = (struct vlan_ethhdr *)skb->data; + if (vh->h_vlan_proto == htons(ETH_P_8021Q) && + vh->h_vlan_encapsulated_proto == htons(ETH_P_FCOE)) { + skb_pull(skb, sizeof(struct vlan_ethhdr)); + transport_len += sizeof(struct vlan_ethhdr); + } else if (eh->h_proto == htons(ETH_P_FCOE)) { + transport_len += sizeof(struct ethhdr); + skb_pull(skb, sizeof(struct ethhdr)); + } else + return -1; + + fcoe_hdr = (struct fcoe_hdr *)skb->data; + if (FC_FCOE_DECAPS_VER(fcoe_hdr) != FC_FCOE_VER) + return -1; + + fp = (struct fc_frame *)skb; + fc_frame_init(fp); + fr_sof(fp) = fcoe_hdr->fcoe_sof; + skb_pull(skb, sizeof(struct fcoe_hdr)); + transport_len += sizeof(struct fcoe_hdr); + + ft = (struct fcoe_crc_eof *)(skb->data + len - + transport_len - sizeof(*ft)); + fr_eof(fp) = ft->fcoe_eof; + skb_trim(skb, len - transport_len - sizeof(*ft)); + return 0; +} + +static inline int 
fnic_handle_flogi_resp(struct fnic *fnic, + struct fc_frame *fp) +{ + u8 mac[ETH_ALEN] = FC_FCOE_FLOGI_MAC; + struct ethhdr *eth_hdr; + struct fc_frame_header *fh; + int ret = 0; + unsigned long flags; + struct fc_frame *old_flogi_resp = NULL; + + fh = (struct fc_frame_header *)fr_hdr(fp); + + spin_lock_irqsave(&fnic->fnic_lock, flags); + + if (fnic->state == FNIC_IN_ETH_MODE) { + + /* + * Check if oxid matches on taking the lock. A new Flogi + * issued by libFC might have changed the fnic cached oxid + */ + if (fnic->flogi_oxid != ntohs(fh->fh_ox_id)) { + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, + "Flogi response oxid not" + " matching cached oxid, dropping frame" + "\n"); + ret = -1; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + dev_kfree_skb_irq(fp_skb(fp)); + goto handle_flogi_resp_end; + } + + /* Drop older cached flogi response frame, cache this frame */ + old_flogi_resp = fnic->flogi_resp; + fnic->flogi_resp = fp; + fnic->flogi_oxid = FC_XID_UNKNOWN; + + /* + * this frame is part of flogi get the src mac addr from this + * frame if the src mac is fcoui based then we mark the + * address mode flag to use fcoui base for dst mac addr + * otherwise we have to store the fcoe gateway addr + */ + eth_hdr = (struct ethhdr *)skb_mac_header(fp_skb(fp)); + memcpy(mac, eth_hdr->h_source, ETH_ALEN); + + if (ntoh24(mac) == FC_FCOE_OUI) + fnic->fcoui_mode = 1; + else { + fnic->fcoui_mode = 0; + memcpy(fnic->dest_addr, mac, ETH_ALEN); + } + + /* + * Except for Flogi frame, all outbound frames from us have the + * Eth Src address as FC_FCOE_OUI"our_sid". 
Flogi frame uses + * the vnic MAC address as the Eth Src address + */ + fc_fcoe_set_mac(fnic->data_src_addr, fh->fh_d_id); + + /* We get our s_id from the d_id of the flogi resp frame */ + fnic->s_id = ntoh24(fh->fh_d_id); + + /* Change state to reflect transition from Eth to FC mode */ + fnic->state = FNIC_IN_ETH_TRANS_FC_MODE; + + } else { + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, + "Unexpected fnic state %s while" + " processing flogi resp\n", + fnic_state_to_str(fnic->state)); + ret = -1; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + dev_kfree_skb_irq(fp_skb(fp)); + goto handle_flogi_resp_end; + } + + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + /* Drop older cached frame */ + if (old_flogi_resp) + dev_kfree_skb_irq(fp_skb(old_flogi_resp)); + + /* + * send flogi reg request to firmware, this will put the fnic + * in FC mode + */ + ret = fnic_flogi_reg_handler(fnic); + + if (ret < 0) { + int free_fp = 1; + spin_lock_irqsave(&fnic->fnic_lock, flags); + /* + * free the frame if some other thread is not + * pointing to it + */ + if (fnic->flogi_resp != fp) + free_fp = 0; + else + fnic->flogi_resp = NULL; + + if (fnic->state == FNIC_IN_ETH_TRANS_FC_MODE) + fnic->state = FNIC_IN_ETH_MODE; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + if (free_fp) + dev_kfree_skb_irq(fp_skb(fp)); + } + + handle_flogi_resp_end: + return ret; +} + +/* Returns 1 for a response that matches cached flogi oxid */ +static inline int is_matching_flogi_resp_frame(struct fnic *fnic, + struct fc_frame *fp) +{ + struct fc_frame_header *fh; + int ret = 0; + u32 f_ctl; + + fh = fc_frame_header_get(fp); + f_ctl = ntoh24(fh->fh_f_ctl); + + if (fnic->flogi_oxid == ntohs(fh->fh_ox_id) && + fh->fh_r_ctl == FC_RCTL_ELS_REP && + (f_ctl & (FC_FC_EX_CTX | FC_FC_SEQ_CTX)) == FC_FC_EX_CTX && + fh->fh_type == FC_TYPE_ELS) + ret = 1; + + return ret; +} + +static void fnic_rq_cmpl_frame_recv(struct vnic_rq *rq, struct cq_desc + *cq_desc, struct vnic_rq_buf *buf, + int skipped 
__attribute__((unused)), + void *opaque) +{ + struct fnic *fnic = vnic_dev_priv(rq->vdev); + struct sk_buff *skb; + struct fc_frame *fp; + unsigned int eth_hdrs_stripped; + u8 type, color, eop, sop, ingress_port, vlan_stripped; + u8 fcoe = 0, fcoe_sof, fcoe_eof; + u8 fcoe_fc_crc_ok = 1, fcoe_enc_error = 0; + u8 tcp_udp_csum_ok, udp, tcp, ipv4_csum_ok; + u8 ipv6, ipv4, ipv4_fragment, rss_type, csum_not_calc; + u8 fcs_ok = 1, packet_error = 0; + u16 q_number, completed_index, bytes_written = 0, vlan, checksum; + u32 rss_hash; + u16 exchange_id, tmpl; + u8 sof = 0; + u8 eof = 0; + u32 fcp_bytes_written = 0; + unsigned long flags; + + pci_unmap_single(fnic->pdev, buf->dma_addr, buf->len, + PCI_DMA_FROMDEVICE); + skb = buf->os_buf; + buf->os_buf = NULL; + + cq_desc_dec(cq_desc, &type, &color, &q_number, &completed_index); + if (type == CQ_DESC_TYPE_RQ_FCP) { + cq_fcp_rq_desc_dec((struct cq_fcp_rq_desc *)cq_desc, + &type, &color, &q_number, &completed_index, + &eop, &sop, &fcoe_fc_crc_ok, &exchange_id, + &tmpl, &fcp_bytes_written, &sof, &eof, + &ingress_port, &packet_error, + &fcoe_enc_error, &fcs_ok, &vlan_stripped, + &vlan); + eth_hdrs_stripped = 1; + + } else if (type == CQ_DESC_TYPE_RQ_ENET) { + cq_enet_rq_desc_dec((struct cq_enet_rq_desc *)cq_desc, + &type, &color, &q_number, &completed_index, + &ingress_port, &fcoe, &eop, &sop, + &rss_type, &csum_not_calc, &rss_hash, + &bytes_written, &packet_error, + &vlan_stripped, &vlan, &checksum, + &fcoe_sof, &fcoe_fc_crc_ok, + &fcoe_enc_error, &fcoe_eof, + &tcp_udp_csum_ok, &udp, &tcp, + &ipv4_csum_ok, &ipv6, &ipv4, + &ipv4_fragment, &fcs_ok); + eth_hdrs_stripped = 0; + + } else { + /* wrong CQ type*/ + shost_printk(KERN_ERR, fnic->lport->host, + "fnic rq_cmpl wrong cq type x%x\n", type); + goto drop; + } + + if (!fcs_ok || packet_error || !fcoe_fc_crc_ok || fcoe_enc_error) { + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, + "fnic rq_cmpl fcoe x%x fcsok x%x" + " pkterr x%x fcoe_fc_crc_ok x%x, fcoe_enc_err" + " x%x\n", + fcoe, 
fcs_ok, packet_error, + fcoe_fc_crc_ok, fcoe_enc_error); + goto drop; + } + + if (eth_hdrs_stripped) + fnic_import_rq_fc_frame(skb, fcp_bytes_written, sof, eof); + else if (fnic_import_rq_eth_pkt(skb, bytes_written)) + goto drop; + + fp = (struct fc_frame *)skb; + + /* + * If frame is an ELS response that matches the cached FLOGI OX_ID, + * and is accept, issue flogi_reg_request copy wq request to firmware + * to register the S_ID and determine whether FC_OUI mode or GW mode. + */ + if (is_matching_flogi_resp_frame(fnic, fp)) { + if (!eth_hdrs_stripped) { + if (fc_frame_payload_op(fp) == ELS_LS_ACC) { + fnic_handle_flogi_resp(fnic, fp); + return; + } + /* + * Recd. Flogi reject. No point registering + * with fw, but forward to libFC + */ + goto forward; + } + goto drop; + } + if (!eth_hdrs_stripped) + goto drop; + +forward: + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->stop_rx_link_events) { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + goto drop; + } + /* Use fr_flags to indicate whether succ. 
flogi resp or not */ + fr_flags(fp) = 0; + fr_dev(fp) = fnic->lport; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + skb_queue_tail(&fnic->frame_queue, skb); + queue_work(fnic_event_queue, &fnic->frame_work); + + return; +drop: + dev_kfree_skb_irq(skb); +} + +static int fnic_rq_cmpl_handler_cont(struct vnic_dev *vdev, + struct cq_desc *cq_desc, u8 type, + u16 q_number, u16 completed_index, + void *opaque) +{ + struct fnic *fnic = vnic_dev_priv(vdev); + + vnic_rq_service(&fnic->rq[q_number], cq_desc, completed_index, + VNIC_RQ_RETURN_DESC, fnic_rq_cmpl_frame_recv, + NULL); + return 0; +} + +int fnic_rq_cmpl_handler(struct fnic *fnic, int rq_work_to_do) +{ + unsigned int tot_rq_work_done = 0, cur_work_done; + unsigned int i; + int err; + + for (i = 0; i < fnic->rq_count; i++) { + cur_work_done = vnic_cq_service(&fnic->cq[i], rq_work_to_do, + fnic_rq_cmpl_handler_cont, + NULL); + if (cur_work_done) { + err = vnic_rq_fill(&fnic->rq[i], fnic_alloc_rq_frame); + if (err) + shost_printk(KERN_ERR, fnic->lport->host, + "fnic_alloc_rq_frame cant alloc" + " frame\n"); + } + tot_rq_work_done += cur_work_done; + } + + return tot_rq_work_done; +} + +/* + * This function is called once at init time to allocate and fill RQ + * buffers. 
Subsequently, it is called in the interrupt context after RQ + * buffer processing to replenish the buffers in the RQ + */ +int fnic_alloc_rq_frame(struct vnic_rq *rq) +{ + struct fnic *fnic = vnic_dev_priv(rq->vdev); + struct sk_buff *skb; + u16 len; + dma_addr_t pa; + + len = FC_FRAME_HEADROOM + FC_MAX_FRAME + FC_FRAME_TAILROOM; + skb = dev_alloc_skb(len); + if (!skb) { + FNIC_FCS_DBG(KERN_DEBUG, fnic->lport->host, + "Unable to allocate RQ sk_buff\n"); + return -ENOMEM; + } + skb_reset_mac_header(skb); + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb_put(skb, len); + pa = pci_map_single(fnic->pdev, skb->data, len, PCI_DMA_FROMDEVICE); + fnic_queue_rq_desc(rq, skb, pa, len); + return 0; +} + +void fnic_free_rq_buf(struct vnic_rq *rq, struct vnic_rq_buf *buf) +{ + struct fc_frame *fp = buf->os_buf; + struct fnic *fnic = vnic_dev_priv(rq->vdev); + + pci_unmap_single(fnic->pdev, buf->dma_addr, buf->len, + PCI_DMA_FROMDEVICE); + + dev_kfree_skb(fp_skb(fp)); + buf->os_buf = NULL; +} + +static inline int is_flogi_frame(struct fc_frame_header *fh) +{ + return fh->fh_r_ctl == FC_RCTL_ELS_REQ && *(u8 *)(fh + 1) == ELS_FLOGI; +} + +int fnic_send_frame(struct fnic *fnic, struct fc_frame *fp) +{ + struct vnic_wq *wq = &fnic->wq[0]; + struct sk_buff *skb; + dma_addr_t pa; + struct ethhdr *eth_hdr; + struct vlan_ethhdr *vlan_hdr; + struct fcoe_hdr *fcoe_hdr; + struct fc_frame_header *fh; + u32 tot_len, eth_hdr_len; + int ret = 0; + unsigned long flags; + + fh = fc_frame_header_get(fp); + skb = fp_skb(fp); + + if (!fnic->vlan_hw_insert) { + eth_hdr_len = sizeof(*vlan_hdr) + sizeof(*fcoe_hdr); + vlan_hdr = (struct vlan_ethhdr *)skb_push(skb, eth_hdr_len); + eth_hdr = (struct ethhdr *)vlan_hdr; + vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q); + vlan_hdr->h_vlan_encapsulated_proto = htons(ETH_P_FCOE); + vlan_hdr->h_vlan_TCI = htons(fnic->vlan_id); + fcoe_hdr = (struct fcoe_hdr *)(vlan_hdr + 1); + } else { + eth_hdr_len = sizeof(*eth_hdr) + sizeof(*fcoe_hdr); 
+ eth_hdr = (struct ethhdr *)skb_push(skb, eth_hdr_len); + eth_hdr->h_proto = htons(ETH_P_FCOE); + fcoe_hdr = (struct fcoe_hdr *)(eth_hdr + 1); + } + + if (is_flogi_frame(fh)) { + fc_fcoe_set_mac(eth_hdr->h_dest, fh->fh_d_id); + memcpy(eth_hdr->h_source, fnic->mac_addr, ETH_ALEN); + } else { + if (fnic->fcoui_mode) + fc_fcoe_set_mac(eth_hdr->h_dest, fh->fh_d_id); + else + memcpy(eth_hdr->h_dest, fnic->dest_addr, ETH_ALEN); + memcpy(eth_hdr->h_source, fnic->data_src_addr, ETH_ALEN); + } + + tot_len = skb->len; + BUG_ON(tot_len % 4); + + memset(fcoe_hdr, 0, sizeof(*fcoe_hdr)); + fcoe_hdr->fcoe_sof = fr_sof(fp); + if (FC_FCOE_VER) + FC_FCOE_ENCAPS_VER(fcoe_hdr, FC_FCOE_VER); + + pa = pci_map_single(fnic->pdev, eth_hdr, tot_len, PCI_DMA_TODEVICE); + + spin_lock_irqsave(&fnic->wq_lock[0], flags); + + if (!vnic_wq_desc_avail(wq)) { + pci_unmap_single(fnic->pdev, pa, + tot_len, PCI_DMA_TODEVICE); + ret = -1; + goto fnic_send_frame_end; + } + + fnic_queue_wq_desc(wq, skb, pa, tot_len, fr_eof(fp), + fnic->vlan_hw_insert, fnic->vlan_id, 1, 1, 1); +fnic_send_frame_end: + spin_unlock_irqrestore(&fnic->wq_lock[0], flags); + + if (ret) + dev_kfree_skb_any(fp_skb(fp)); + + return ret; +} + +/* + * fnic_send + * Routine to send a raw frame + */ +int fnic_send(struct fc_lport *lp, struct fc_frame *fp) +{ + struct fnic *fnic = lport_priv(lp); + struct fc_frame_header *fh; + int ret = 0; + enum fnic_state old_state; + unsigned long flags; + struct fc_frame *old_flogi = NULL; + struct fc_frame *old_flogi_resp = NULL; + + if (fnic->in_remove) { + dev_kfree_skb(fp_skb(fp)); + ret = -1; + goto fnic_send_end; + } + + fh = fc_frame_header_get(fp); + /* if not an Flogi frame, send it out, this is the common case */ + if (!is_flogi_frame(fh)) + return fnic_send_frame(fnic, fp); + + /* Flogi frame, now enter the state machine */ + + spin_lock_irqsave(&fnic->fnic_lock, flags); +again: + /* Get any old cached frames, free them after dropping lock */ + old_flogi = fnic->flogi; + fnic->flogi = 
NULL; + old_flogi_resp = fnic->flogi_resp; + fnic->flogi_resp = NULL; + + fnic->flogi_oxid = FC_XID_UNKNOWN; + + old_state = fnic->state; + switch (old_state) { + case FNIC_IN_FC_MODE: + case FNIC_IN_ETH_TRANS_FC_MODE: + default: + fnic->state = FNIC_IN_FC_TRANS_ETH_MODE; + vnic_dev_del_addr(fnic->vdev, fnic->data_src_addr); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + if (old_flogi) { + dev_kfree_skb(fp_skb(old_flogi)); + old_flogi = NULL; + } + if (old_flogi_resp) { + dev_kfree_skb(fp_skb(old_flogi_resp)); + old_flogi_resp = NULL; + } + + ret = fnic_fw_reset_handler(fnic); + + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->state != FNIC_IN_FC_TRANS_ETH_MODE) + goto again; + if (ret) { + fnic->state = old_state; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + dev_kfree_skb(fp_skb(fp)); + goto fnic_send_end; + } + old_flogi = fnic->flogi; + fnic->flogi = fp; + fnic->flogi_oxid = ntohs(fh->fh_ox_id); + old_flogi_resp = fnic->flogi_resp; + fnic->flogi_resp = NULL; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + break; + + case FNIC_IN_FC_TRANS_ETH_MODE: + /* + * A reset is pending with the firmware. Store the flogi + * and its oxid. The transition out of this state happens + * only when Firmware completes the reset, either with + * success or failed. If success, transition to + * FNIC_IN_ETH_MODE, if fail, then transition to + * FNIC_IN_FC_MODE + */ + fnic->flogi = fp; + fnic->flogi_oxid = ntohs(fh->fh_ox_id); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + break; + + case FNIC_IN_ETH_MODE: + /* + * The fw/hw is already in eth mode. Store the oxid, + * and send the flogi frame out. The transition out of this + * state happens only we receive flogi response from the + * network, and the oxid matches the cached oxid when the + * flogi frame was sent out. 
If they match, then we issue + * a flogi_reg request and transition to state + * FNIC_IN_ETH_TRANS_FC_MODE + */ + fnic->flogi_oxid = ntohs(fh->fh_ox_id); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + ret = fnic_send_frame(fnic, fp); + break; + } + +fnic_send_end: + if (old_flogi) + dev_kfree_skb(fp_skb(old_flogi)); + if (old_flogi_resp) + dev_kfree_skb(fp_skb(old_flogi_resp)); + return ret; +} + +static void fnic_wq_complete_frame_send(struct vnic_wq *wq, + struct cq_desc *cq_desc, + struct vnic_wq_buf *buf, void *opaque) +{ + struct sk_buff *skb = buf->os_buf; + struct fc_frame *fp = (struct fc_frame *)skb; + struct fnic *fnic = vnic_dev_priv(wq->vdev); + + pci_unmap_single(fnic->pdev, buf->dma_addr, + buf->len, PCI_DMA_TODEVICE); + dev_kfree_skb_irq(fp_skb(fp)); + buf->os_buf = NULL; +} + +static int fnic_wq_cmpl_handler_cont(struct vnic_dev *vdev, + struct cq_desc *cq_desc, u8 type, + u16 q_number, u16 completed_index, + void *opaque) +{ + struct fnic *fnic = vnic_dev_priv(vdev); + unsigned long flags; + + spin_lock_irqsave(&fnic->wq_lock[q_number], flags); + vnic_wq_service(&fnic->wq[q_number], cq_desc, completed_index, + fnic_wq_complete_frame_send, NULL); + spin_unlock_irqrestore(&fnic->wq_lock[q_number], flags); + + return 0; +} + +int fnic_wq_cmpl_handler(struct fnic *fnic, int work_to_do) +{ + unsigned int wq_work_done = 0; + unsigned int i; + + for (i = 0; i < fnic->raw_wq_count; i++) { + wq_work_done += vnic_cq_service(&fnic->cq[fnic->rq_count+i], + work_to_do, + fnic_wq_cmpl_handler_cont, + NULL); + } + + return wq_work_done; +} + + +void fnic_free_wq_buf(struct vnic_wq *wq, struct vnic_wq_buf *buf) +{ + struct fc_frame *fp = buf->os_buf; + struct fnic *fnic = vnic_dev_priv(wq->vdev); + + pci_unmap_single(fnic->pdev, buf->dma_addr, + buf->len, PCI_DMA_TODEVICE); + + dev_kfree_skb(fp_skb(fp)); + buf->os_buf = NULL; +} diff --git a/drivers/scsi/fnic/fnic_io.h b/drivers/scsi/fnic/fnic_io.h new file mode 100644 index 00000000000..f0b896988cd --- 
/*
 * Per-request bookkeeping: scatter-gather list pointers and DMA addresses,
 * completion state, and the completion objects used by abort and device
 * reset handling.
 */
struct fnic_io_req {
	struct host_sg_desc *sgl_list; /* sgl list */
	void *sgl_list_alloc; /* sgl list address used for free */
	dma_addr_t sense_buf_pa; /* dma address for sense buffer */
	dma_addr_t sgl_list_pa;	/* dma address for sgl list */
	u16 sgl_cnt; /* presumably the number of entries in sgl_list - TODO confirm */
	u8 sgl_type; /* device DMA descriptor list type */
	/* NOTE(review): sgl_type looks like an enum fnic_sgl_list_type value - confirm */
	u8 io_completed:1; /* set to 1 when fw completes IO */
	u32 port_id; /* remote port DID */
	struct completion *abts_done; /* completion for abts */
	struct completion *dr_done; /* completion for device reset */
};
+ */ +#include +#include +#include +#include +#include +#include +#include "vnic_dev.h" +#include "vnic_intr.h" +#include "vnic_stats.h" +#include "fnic_io.h" +#include "fnic.h" + +static irqreturn_t fnic_isr_legacy(int irq, void *data) +{ + struct fnic *fnic = data; + u32 pba; + unsigned long work_done = 0; + + pba = vnic_intr_legacy_pba(fnic->legacy_pba); + if (!pba) + return IRQ_NONE; + + if (pba & (1 << FNIC_INTX_NOTIFY)) { + vnic_intr_return_all_credits(&fnic->intr[FNIC_INTX_NOTIFY]); + fnic_handle_link_event(fnic); + } + + if (pba & (1 << FNIC_INTX_ERR)) { + vnic_intr_return_all_credits(&fnic->intr[FNIC_INTX_ERR]); + fnic_log_q_error(fnic); + } + + if (pba & (1 << FNIC_INTX_WQ_RQ_COPYWQ)) { + work_done += fnic_wq_copy_cmpl_handler(fnic, 8); + work_done += fnic_wq_cmpl_handler(fnic, 4); + work_done += fnic_rq_cmpl_handler(fnic, 4); + + vnic_intr_return_credits(&fnic->intr[FNIC_INTX_WQ_RQ_COPYWQ], + work_done, + 1 /* unmask intr */, + 1 /* reset intr timer */); + } + + return IRQ_HANDLED; +} + +static irqreturn_t fnic_isr_msi(int irq, void *data) +{ + struct fnic *fnic = data; + unsigned long work_done = 0; + + work_done += fnic_wq_copy_cmpl_handler(fnic, 8); + work_done += fnic_wq_cmpl_handler(fnic, 4); + work_done += fnic_rq_cmpl_handler(fnic, 4); + + vnic_intr_return_credits(&fnic->intr[0], + work_done, + 1 /* unmask intr */, + 1 /* reset intr timer */); + + return IRQ_HANDLED; +} + +static irqreturn_t fnic_isr_msix_rq(int irq, void *data) +{ + struct fnic *fnic = data; + unsigned long rq_work_done = 0; + + rq_work_done = fnic_rq_cmpl_handler(fnic, 4); + vnic_intr_return_credits(&fnic->intr[FNIC_MSIX_RQ], + rq_work_done, + 1 /* unmask intr */, + 1 /* reset intr timer */); + + return IRQ_HANDLED; +} + +static irqreturn_t fnic_isr_msix_wq(int irq, void *data) +{ + struct fnic *fnic = data; + unsigned long wq_work_done = 0; + + wq_work_done = fnic_wq_cmpl_handler(fnic, 4); + vnic_intr_return_credits(&fnic->intr[FNIC_MSIX_WQ], + wq_work_done, + 1 /* unmask intr 
*/, + 1 /* reset intr timer */); + return IRQ_HANDLED; +} + +static irqreturn_t fnic_isr_msix_wq_copy(int irq, void *data) +{ + struct fnic *fnic = data; + unsigned long wq_copy_work_done = 0; + + wq_copy_work_done = fnic_wq_copy_cmpl_handler(fnic, 8); + vnic_intr_return_credits(&fnic->intr[FNIC_MSIX_WQ_COPY], + wq_copy_work_done, + 1 /* unmask intr */, + 1 /* reset intr timer */); + return IRQ_HANDLED; +} + +static irqreturn_t fnic_isr_msix_err_notify(int irq, void *data) +{ + struct fnic *fnic = data; + + vnic_intr_return_all_credits(&fnic->intr[FNIC_MSIX_ERR_NOTIFY]); + fnic_log_q_error(fnic); + fnic_handle_link_event(fnic); + + return IRQ_HANDLED; +} + +void fnic_free_intr(struct fnic *fnic) +{ + int i; + + switch (vnic_dev_get_intr_mode(fnic->vdev)) { + case VNIC_DEV_INTR_MODE_INTX: + case VNIC_DEV_INTR_MODE_MSI: + free_irq(fnic->pdev->irq, fnic); + break; + + case VNIC_DEV_INTR_MODE_MSIX: + for (i = 0; i < ARRAY_SIZE(fnic->msix); i++) + if (fnic->msix[i].requested) + free_irq(fnic->msix_entry[i].vector, + fnic->msix[i].devid); + break; + + default: + break; + } +} + +int fnic_request_intr(struct fnic *fnic) +{ + int err = 0; + int i; + + switch (vnic_dev_get_intr_mode(fnic->vdev)) { + + case VNIC_DEV_INTR_MODE_INTX: + err = request_irq(fnic->pdev->irq, &fnic_isr_legacy, + IRQF_SHARED, DRV_NAME, fnic); + break; + + case VNIC_DEV_INTR_MODE_MSI: + err = request_irq(fnic->pdev->irq, &fnic_isr_msi, + 0, fnic->name, fnic); + break; + + case VNIC_DEV_INTR_MODE_MSIX: + + sprintf(fnic->msix[FNIC_MSIX_RQ].devname, + "%.11s-fcs-rq", fnic->name); + fnic->msix[FNIC_MSIX_RQ].isr = fnic_isr_msix_rq; + fnic->msix[FNIC_MSIX_RQ].devid = fnic; + + sprintf(fnic->msix[FNIC_MSIX_WQ].devname, + "%.11s-fcs-wq", fnic->name); + fnic->msix[FNIC_MSIX_WQ].isr = fnic_isr_msix_wq; + fnic->msix[FNIC_MSIX_WQ].devid = fnic; + + sprintf(fnic->msix[FNIC_MSIX_WQ_COPY].devname, + "%.11s-scsi-wq", fnic->name); + fnic->msix[FNIC_MSIX_WQ_COPY].isr = fnic_isr_msix_wq_copy; + 
fnic->msix[FNIC_MSIX_WQ_COPY].devid = fnic; + + sprintf(fnic->msix[FNIC_MSIX_ERR_NOTIFY].devname, + "%.11s-err-notify", fnic->name); + fnic->msix[FNIC_MSIX_ERR_NOTIFY].isr = + fnic_isr_msix_err_notify; + fnic->msix[FNIC_MSIX_ERR_NOTIFY].devid = fnic; + + for (i = 0; i < ARRAY_SIZE(fnic->msix); i++) { + err = request_irq(fnic->msix_entry[i].vector, + fnic->msix[i].isr, 0, + fnic->msix[i].devname, + fnic->msix[i].devid); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "MSIX: request_irq" + " failed %d\n", err); + fnic_free_intr(fnic); + break; + } + fnic->msix[i].requested = 1; + } + break; + + default: + break; + } + + return err; +} + +int fnic_set_intr_mode(struct fnic *fnic) +{ + unsigned int n = ARRAY_SIZE(fnic->rq); + unsigned int m = ARRAY_SIZE(fnic->wq); + unsigned int o = ARRAY_SIZE(fnic->wq_copy); + unsigned int i; + + /* + * Set interrupt mode (INTx, MSI, MSI-X) depending + * system capabilities. + * + * Try MSI-X first + * + * We need n RQs, m WQs, o Copy WQs, n+m+o CQs, and n+m+o+1 INTRs + * (last INTR is used for WQ/RQ errors and notification area) + */ + + BUG_ON(ARRAY_SIZE(fnic->msix_entry) < n + m + o + 1); + for (i = 0; i < n + m + o + 1; i++) + fnic->msix_entry[i].entry = i; + + if (fnic->rq_count >= n && + fnic->raw_wq_count >= m && + fnic->wq_copy_count >= o && + fnic->cq_count >= n + m + o) { + if (!pci_enable_msix(fnic->pdev, fnic->msix_entry, + n + m + o + 1)) { + fnic->rq_count = n; + fnic->raw_wq_count = m; + fnic->wq_copy_count = o; + fnic->wq_count = m + o; + fnic->cq_count = n + m + o; + fnic->intr_count = n + m + o + 1; + fnic->err_intr_offset = FNIC_MSIX_ERR_NOTIFY; + + FNIC_ISR_DBG(KERN_DEBUG, fnic->lport->host, + "Using MSI-X Interrupts\n"); + vnic_dev_set_intr_mode(fnic->vdev, + VNIC_DEV_INTR_MODE_MSIX); + return 0; + } + } + + /* + * Next try MSI + * We need 1 RQ, 1 WQ, 1 WQ_COPY, 3 CQs, and 1 INTR + */ + if (fnic->rq_count >= 1 && + fnic->raw_wq_count >= 1 && + fnic->wq_copy_count >= 1 && + fnic->cq_count >= 3 && + 
fnic->intr_count >= 1 && + !pci_enable_msi(fnic->pdev)) { + + fnic->rq_count = 1; + fnic->raw_wq_count = 1; + fnic->wq_copy_count = 1; + fnic->wq_count = 2; + fnic->cq_count = 3; + fnic->intr_count = 1; + fnic->err_intr_offset = 0; + + FNIC_ISR_DBG(KERN_DEBUG, fnic->lport->host, + "Using MSI Interrupts\n"); + vnic_dev_set_intr_mode(fnic->vdev, VNIC_DEV_INTR_MODE_MSI); + + return 0; + } + + /* + * Next try INTx + * We need 1 RQ, 1 WQ, 1 WQ_COPY, 3 CQs, and 3 INTRs + * 1 INTR is used for all 3 queues, 1 INTR for queue errors + * 1 INTR for notification area + */ + + if (fnic->rq_count >= 1 && + fnic->raw_wq_count >= 1 && + fnic->wq_copy_count >= 1 && + fnic->cq_count >= 3 && + fnic->intr_count >= 3) { + + fnic->rq_count = 1; + fnic->raw_wq_count = 1; + fnic->wq_copy_count = 1; + fnic->cq_count = 3; + fnic->intr_count = 3; + + FNIC_ISR_DBG(KERN_DEBUG, fnic->lport->host, + "Using Legacy Interrupts\n"); + vnic_dev_set_intr_mode(fnic->vdev, VNIC_DEV_INTR_MODE_INTX); + + return 0; + } + + vnic_dev_set_intr_mode(fnic->vdev, VNIC_DEV_INTR_MODE_UNKNOWN); + + return -EINVAL; +} + +void fnic_clear_intr_mode(struct fnic *fnic) +{ + switch (vnic_dev_get_intr_mode(fnic->vdev)) { + case VNIC_DEV_INTR_MODE_MSIX: + pci_disable_msix(fnic->pdev); + break; + case VNIC_DEV_INTR_MODE_MSI: + pci_disable_msi(fnic->pdev); + break; + default: + break; + } + + vnic_dev_set_intr_mode(fnic->vdev, VNIC_DEV_INTR_MODE_INTX); +} + diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c new file mode 100644 index 00000000000..32ef6b87d89 --- /dev/null +++ b/drivers/scsi/fnic/fnic_main.c @@ -0,0 +1,942 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vnic_dev.h" +#include "vnic_intr.h" +#include "vnic_stats.h" +#include "fnic_io.h" +#include "fnic.h" + +#define PCI_DEVICE_ID_CISCO_FNIC 0x0045 + +/* Timer to poll notification area for events. Used for MSI interrupts */ +#define FNIC_NOTIFY_TIMER_PERIOD (2 * HZ) + +static struct kmem_cache *fnic_sgl_cache[FNIC_SGL_NUM_CACHES]; +static struct kmem_cache *fnic_io_req_cache; +LIST_HEAD(fnic_list); +DEFINE_SPINLOCK(fnic_list_lock); + +/* Supported devices by fnic module */ +static struct pci_device_id fnic_id_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_FNIC) }, + { 0, } +}; + +MODULE_DESCRIPTION(DRV_DESCRIPTION); +MODULE_AUTHOR("Abhijeet Joglekar , " + "Joseph R. 
Eykholt "); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION(DRV_VERSION); +MODULE_DEVICE_TABLE(pci, fnic_id_table); + +unsigned int fnic_log_level; +module_param(fnic_log_level, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(fnic_log_level, "bit mask of fnic logging levels"); + + +static struct libfc_function_template fnic_transport_template = { + .frame_send = fnic_send, + .fcp_abort_io = fnic_empty_scsi_cleanup, + .fcp_cleanup = fnic_empty_scsi_cleanup, + .exch_mgr_reset = fnic_exch_mgr_reset +}; + +static int fnic_slave_alloc(struct scsi_device *sdev) +{ + struct fc_rport *rport = starget_to_rport(scsi_target(sdev)); + struct fc_lport *lp = shost_priv(sdev->host); + struct fnic *fnic = lport_priv(lp); + + sdev->tagged_supported = 1; + + if (!rport || fc_remote_port_chkready(rport)) + return -ENXIO; + + scsi_activate_tcq(sdev, FNIC_DFLT_QUEUE_DEPTH); + rport->dev_loss_tmo = fnic->config.port_down_timeout / 1000; + + return 0; +} + +static struct scsi_host_template fnic_host_template = { + .module = THIS_MODULE, + .name = DRV_NAME, + .queuecommand = fnic_queuecommand, + .eh_abort_handler = fnic_abort_cmd, + .eh_device_reset_handler = fnic_device_reset, + .eh_host_reset_handler = fnic_host_reset, + .slave_alloc = fnic_slave_alloc, + .change_queue_depth = fc_change_queue_depth, + .change_queue_type = fc_change_queue_type, + .this_id = -1, + .cmd_per_lun = 3, + .can_queue = FNIC_MAX_IO_REQ, + .use_clustering = ENABLE_CLUSTERING, + .sg_tablesize = FNIC_MAX_SG_DESC_CNT, + .max_sectors = 0xffff, + .shost_attrs = fnic_attrs, +}; + +static void fnic_get_host_speed(struct Scsi_Host *shost); +static struct scsi_transport_template *fnic_fc_transport; +static struct fc_host_statistics *fnic_get_stats(struct Scsi_Host *); + +static struct fc_function_template fnic_fc_functions = { + + .show_host_node_name = 1, + .show_host_port_name = 1, + .show_host_supported_classes = 1, + .show_host_supported_fc4s = 1, + .show_host_active_fc4s = 1, + .show_host_maxframe_size = 1, + .show_host_port_id 
= 1, + .show_host_supported_speeds = 1, + .get_host_speed = fnic_get_host_speed, + .show_host_speed = 1, + .show_host_port_type = 1, + .get_host_port_state = fc_get_host_port_state, + .show_host_port_state = 1, + .show_host_symbolic_name = 1, + .show_rport_maxframe_size = 1, + .show_rport_supported_classes = 1, + .show_host_fabric_name = 1, + .show_starget_node_name = 1, + .show_starget_port_name = 1, + .show_starget_port_id = 1, + .show_rport_dev_loss_tmo = 1, + .issue_fc_host_lip = fnic_reset, + .get_fc_host_stats = fnic_get_stats, + .dd_fcrport_size = sizeof(struct fc_rport_libfc_priv), + .terminate_rport_io = fnic_terminate_rport_io, +}; + +static void fnic_get_host_speed(struct Scsi_Host *shost) +{ + struct fc_lport *lp = shost_priv(shost); + struct fnic *fnic = lport_priv(lp); + u32 port_speed = vnic_dev_port_speed(fnic->vdev); + + /* Add in other values as they get defined in fw */ + switch (port_speed) { + case 10000: + fc_host_speed(shost) = FC_PORTSPEED_10GBIT; + break; + default: + fc_host_speed(shost) = FC_PORTSPEED_10GBIT; + break; + } +} + +static struct fc_host_statistics *fnic_get_stats(struct Scsi_Host *host) +{ + int ret; + struct fc_lport *lp = shost_priv(host); + struct fnic *fnic = lport_priv(lp); + struct fc_host_statistics *stats = &lp->host_stats; + struct vnic_stats *vs; + unsigned long flags; + + if (time_before(jiffies, fnic->stats_time + HZ / FNIC_STATS_RATE_LIMIT)) + return stats; + fnic->stats_time = jiffies; + + spin_lock_irqsave(&fnic->fnic_lock, flags); + ret = vnic_dev_stats_dump(fnic->vdev, &fnic->stats); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + if (ret) { + FNIC_MAIN_DBG(KERN_DEBUG, fnic->lport->host, + "fnic: Get vnic stats failed" + " 0x%x", ret); + return stats; + } + vs = fnic->stats; + stats->tx_frames = vs->tx.tx_unicast_frames_ok; + stats->tx_words = vs->tx.tx_unicast_bytes_ok / 4; + stats->rx_frames = vs->rx.rx_unicast_frames_ok; + stats->rx_words = vs->rx.rx_unicast_bytes_ok / 4; + stats->error_frames = 
vs->tx.tx_errors + vs->rx.rx_errors; + stats->dumped_frames = vs->tx.tx_drops + vs->rx.rx_drop; + stats->invalid_crc_count = vs->rx.rx_crc_errors; + stats->seconds_since_last_reset = (jiffies - lp->boot_time) / HZ; + stats->fcp_input_megabytes = div_u64(fnic->fcp_input_bytes, 1000000); + stats->fcp_output_megabytes = div_u64(fnic->fcp_output_bytes, 1000000); + + return stats; +} + +void fnic_log_q_error(struct fnic *fnic) +{ + unsigned int i; + u32 error_status; + + for (i = 0; i < fnic->raw_wq_count; i++) { + error_status = ioread32(&fnic->wq[i].ctrl->error_status); + if (error_status) + shost_printk(KERN_ERR, fnic->lport->host, + "WQ[%d] error_status" + " %d\n", i, error_status); + } + + for (i = 0; i < fnic->rq_count; i++) { + error_status = ioread32(&fnic->rq[i].ctrl->error_status); + if (error_status) + shost_printk(KERN_ERR, fnic->lport->host, + "RQ[%d] error_status" + " %d\n", i, error_status); + } + + for (i = 0; i < fnic->wq_copy_count; i++) { + error_status = ioread32(&fnic->wq_copy[i].ctrl->error_status); + if (error_status) + shost_printk(KERN_ERR, fnic->lport->host, + "CWQ[%d] error_status" + " %d\n", i, error_status); + } +} + +void fnic_handle_link_event(struct fnic *fnic) +{ + unsigned long flags; + + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->stop_rx_link_events) { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + return; + } + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + queue_work(fnic_event_queue, &fnic->link_work); + +} + +static int fnic_notify_set(struct fnic *fnic) +{ + int err; + + switch (vnic_dev_get_intr_mode(fnic->vdev)) { + case VNIC_DEV_INTR_MODE_INTX: + err = vnic_dev_notify_set(fnic->vdev, FNIC_INTX_NOTIFY); + break; + case VNIC_DEV_INTR_MODE_MSI: + err = vnic_dev_notify_set(fnic->vdev, -1); + break; + case VNIC_DEV_INTR_MODE_MSIX: + err = vnic_dev_notify_set(fnic->vdev, FNIC_MSIX_ERR_NOTIFY); + break; + default: + shost_printk(KERN_ERR, fnic->lport->host, + "Interrupt mode should be set up" + " before devcmd 
notify set %d\n", + vnic_dev_get_intr_mode(fnic->vdev)); + err = -1; + break; + } + + return err; +} + +static void fnic_notify_timer(unsigned long data) +{ + struct fnic *fnic = (struct fnic *)data; + + fnic_handle_link_event(fnic); + mod_timer(&fnic->notify_timer, + round_jiffies(jiffies + FNIC_NOTIFY_TIMER_PERIOD)); +} + +static void fnic_notify_timer_start(struct fnic *fnic) +{ + switch (vnic_dev_get_intr_mode(fnic->vdev)) { + case VNIC_DEV_INTR_MODE_MSI: + /* + * Schedule first timeout immediately. The driver is + * initiatialized and ready to look for link up notification + */ + mod_timer(&fnic->notify_timer, jiffies); + break; + default: + /* Using intr for notification for INTx/MSI-X */ + break; + }; +} + +static int fnic_dev_wait(struct vnic_dev *vdev, + int (*start)(struct vnic_dev *, int), + int (*finished)(struct vnic_dev *, int *), + int arg) +{ + unsigned long time; + int done; + int err; + + err = start(vdev, arg); + if (err) + return err; + + /* Wait for func to complete...2 seconds max */ + time = jiffies + (HZ * 2); + do { + err = finished(vdev, &done); + if (err) + return err; + if (done) + return 0; + schedule_timeout_uninterruptible(HZ / 10); + } while (time_after(time, jiffies)); + + return -ETIMEDOUT; +} + +static int fnic_cleanup(struct fnic *fnic) +{ + unsigned int i; + int err; + unsigned long flags; + struct fc_frame *flogi = NULL; + struct fc_frame *flogi_resp = NULL; + + vnic_dev_disable(fnic->vdev); + for (i = 0; i < fnic->intr_count; i++) + vnic_intr_mask(&fnic->intr[i]); + + for (i = 0; i < fnic->rq_count; i++) { + err = vnic_rq_disable(&fnic->rq[i]); + if (err) + return err; + } + for (i = 0; i < fnic->raw_wq_count; i++) { + err = vnic_wq_disable(&fnic->wq[i]); + if (err) + return err; + } + for (i = 0; i < fnic->wq_copy_count; i++) { + err = vnic_wq_copy_disable(&fnic->wq_copy[i]); + if (err) + return err; + } + + /* Clean up completed IOs and FCS frames */ + fnic_wq_copy_cmpl_handler(fnic, -1); + fnic_wq_cmpl_handler(fnic, -1); + 
fnic_rq_cmpl_handler(fnic, -1); + + /* Clean up the IOs and FCS frames that have not completed */ + for (i = 0; i < fnic->raw_wq_count; i++) + vnic_wq_clean(&fnic->wq[i], fnic_free_wq_buf); + for (i = 0; i < fnic->rq_count; i++) + vnic_rq_clean(&fnic->rq[i], fnic_free_rq_buf); + for (i = 0; i < fnic->wq_copy_count; i++) + vnic_wq_copy_clean(&fnic->wq_copy[i], + fnic_wq_copy_cleanup_handler); + + for (i = 0; i < fnic->cq_count; i++) + vnic_cq_clean(&fnic->cq[i]); + for (i = 0; i < fnic->intr_count; i++) + vnic_intr_clean(&fnic->intr[i]); + + /* + * Remove cached flogi and flogi resp frames if any + * These frames are not in any queue, and therefore queue + * cleanup does not clean them. So clean them explicitly + */ + spin_lock_irqsave(&fnic->fnic_lock, flags); + flogi = fnic->flogi; + fnic->flogi = NULL; + flogi_resp = fnic->flogi_resp; + fnic->flogi_resp = NULL; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + if (flogi) + dev_kfree_skb(fp_skb(flogi)); + + if (flogi_resp) + dev_kfree_skb(fp_skb(flogi_resp)); + + mempool_destroy(fnic->io_req_pool); + for (i = 0; i < FNIC_SGL_NUM_CACHES; i++) + mempool_destroy(fnic->io_sgl_pool[i]); + + return 0; +} + +static void fnic_iounmap(struct fnic *fnic) +{ + if (fnic->bar0.vaddr) + iounmap(fnic->bar0.vaddr); +} + +/* + * Allocate element for mempools requiring GFP_DMA flag. + * Otherwise, checks in kmem_flagcheck() hit BUG_ON(). 
+ */ +static void *fnic_alloc_slab_dma(gfp_t gfp_mask, void *pool_data) +{ + struct kmem_cache *mem = pool_data; + + return kmem_cache_alloc(mem, gfp_mask | GFP_ATOMIC | GFP_DMA); +} + +static int __devinit fnic_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct Scsi_Host *host; + struct fc_lport *lp; + struct fnic *fnic; + mempool_t *pool; + int err; + int i; + unsigned long flags; + + /* + * Allocate SCSI Host and set up association between host, + * local port, and fnic + */ + host = scsi_host_alloc(&fnic_host_template, + sizeof(struct fc_lport) + sizeof(struct fnic)); + if (!host) { + printk(KERN_ERR PFX "Unable to alloc SCSI host\n"); + err = -ENOMEM; + goto err_out; + } + lp = shost_priv(host); + lp->host = host; + fnic = lport_priv(lp); + fnic->lport = lp; + + snprintf(fnic->name, sizeof(fnic->name) - 1, "%s%d", DRV_NAME, + host->host_no); + + host->transportt = fnic_fc_transport; + + err = scsi_init_shared_tag_map(host, FNIC_MAX_IO_REQ); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Unable to alloc shared tag map\n"); + goto err_out_free_hba; + } + + /* Setup PCI resources */ + pci_set_drvdata(pdev, fnic); + + fnic->pdev = pdev; + + err = pci_enable_device(pdev); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Cannot enable PCI device, aborting.\n"); + goto err_out_free_hba; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Cannot enable PCI resources, aborting\n"); + goto err_out_disable_device; + } + + pci_set_master(pdev); + + /* Query PCI controller on system for DMA addressing + * limitation for the device. Try 40-bit first, and + * fail to 32-bit. 
+ */ + err = pci_set_dma_mask(pdev, DMA_40BIT_MASK); + if (err) { + err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "No usable DMA configuration " + "aborting\n"); + goto err_out_release_regions; + } + err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Unable to obtain 32-bit DMA " + "for consistent allocations, aborting.\n"); + goto err_out_release_regions; + } + } else { + err = pci_set_consistent_dma_mask(pdev, DMA_40BIT_MASK); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Unable to obtain 40-bit DMA " + "for consistent allocations, aborting.\n"); + goto err_out_release_regions; + } + } + + /* Map vNIC resources from BAR0 */ + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + shost_printk(KERN_ERR, fnic->lport->host, + "BAR0 not memory-map'able, aborting.\n"); + err = -ENODEV; + goto err_out_release_regions; + } + + fnic->bar0.vaddr = pci_iomap(pdev, 0, 0); + fnic->bar0.bus_addr = pci_resource_start(pdev, 0); + fnic->bar0.len = pci_resource_len(pdev, 0); + + if (!fnic->bar0.vaddr) { + shost_printk(KERN_ERR, fnic->lport->host, + "Cannot memory-map BAR0 res hdr, " + "aborting.\n"); + err = -ENODEV; + goto err_out_release_regions; + } + + fnic->vdev = vnic_dev_register(NULL, fnic, pdev, &fnic->bar0); + if (!fnic->vdev) { + shost_printk(KERN_ERR, fnic->lport->host, + "vNIC registration failed, " + "aborting.\n"); + err = -ENODEV; + goto err_out_iounmap; + } + + err = fnic_dev_wait(fnic->vdev, vnic_dev_open, + vnic_dev_open_done, 0); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "vNIC dev open failed, aborting.\n"); + goto err_out_vnic_unregister; + } + + err = vnic_dev_init(fnic->vdev, 0); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "vNIC dev init failed, aborting.\n"); + goto err_out_dev_close; + } + + err = vnic_dev_mac_addr(fnic->vdev, fnic->mac_addr); + if (err) { + shost_printk(KERN_ERR, 
fnic->lport->host, + "vNIC get MAC addr failed \n"); + goto err_out_dev_close; + } + + /* Get vNIC configuration */ + err = fnic_get_vnic_config(fnic); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Get vNIC configuration failed, " + "aborting.\n"); + goto err_out_dev_close; + } + host->max_lun = fnic->config.luns_per_tgt; + host->max_id = FNIC_MAX_FCP_TARGET; + + fnic_get_res_counts(fnic); + + err = fnic_set_intr_mode(fnic); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Failed to set intr mode, " + "aborting.\n"); + goto err_out_dev_close; + } + + err = fnic_request_intr(fnic); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Unable to request irq.\n"); + goto err_out_clear_intr; + } + + err = fnic_alloc_vnic_resources(fnic); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Failed to alloc vNIC resources, " + "aborting.\n"); + goto err_out_free_intr; + } + + + /* initialize all fnic locks */ + spin_lock_init(&fnic->fnic_lock); + + for (i = 0; i < FNIC_WQ_MAX; i++) + spin_lock_init(&fnic->wq_lock[i]); + + for (i = 0; i < FNIC_WQ_COPY_MAX; i++) { + spin_lock_init(&fnic->wq_copy_lock[i]); + fnic->wq_copy_desc_low[i] = DESC_CLEAN_LOW_WATERMARK; + fnic->fw_ack_recd[i] = 0; + fnic->fw_ack_index[i] = -1; + } + + for (i = 0; i < FNIC_IO_LOCKS; i++) + spin_lock_init(&fnic->io_req_lock[i]); + + /* + * The pool constructors below signal failure only by returning NULL, + * and err is 0 here after the successful resource allocation. Preset + * it so the unwind path reports -ENOMEM instead of success. + */ + err = -ENOMEM; + fnic->io_req_pool = mempool_create_slab_pool(2, fnic_io_req_cache); + if (!fnic->io_req_pool) + goto err_out_free_resources; + + pool = mempool_create(2, fnic_alloc_slab_dma, mempool_free_slab, + fnic_sgl_cache[FNIC_SGL_CACHE_DFLT]); + if (!pool) + goto err_out_free_ioreq_pool; + fnic->io_sgl_pool[FNIC_SGL_CACHE_DFLT] = pool; + + pool = mempool_create(2, fnic_alloc_slab_dma, mempool_free_slab, + fnic_sgl_cache[FNIC_SGL_CACHE_MAX]); + if (!pool) + goto err_out_free_dflt_pool; + fnic->io_sgl_pool[FNIC_SGL_CACHE_MAX] = pool; + + /* setup vlan config, hw inserts vlan header */ + fnic->vlan_hw_insert = 1; + fnic->vlan_id = 0; + +
fnic->flogi_oxid = FC_XID_UNKNOWN; + fnic->flogi = NULL; + fnic->flogi_resp = NULL; + fnic->state = FNIC_IN_FC_MODE; + + /* Enable hardware stripping of vlan header on ingress */ + fnic_set_nic_config(fnic, 0, 0, 0, 0, 0, 0, 1); + + /* Setup notification buffer area */ + err = fnic_notify_set(fnic); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "Failed to alloc notify buffer, aborting.\n"); + goto err_out_free_max_pool; + } + + /* Setup notify timer when using MSI interrupts */ + if (vnic_dev_get_intr_mode(fnic->vdev) == VNIC_DEV_INTR_MODE_MSI) + setup_timer(&fnic->notify_timer, + fnic_notify_timer, (unsigned long)fnic); + + /* allocate RQ buffers and post them to RQ*/ + for (i = 0; i < fnic->rq_count; i++) { + err = vnic_rq_fill(&fnic->rq[i], fnic_alloc_rq_frame); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "fnic_alloc_rq_frame can't alloc " + "frame\n"); + goto err_out_free_rq_buf; + } + } + + /* + * Initialization done with PCI system, hardware, firmware. + * Add host to SCSI + */ + err = scsi_add_host(lp->host, &pdev->dev); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "fnic: scsi_add_host failed...exiting\n"); + goto err_out_free_rq_buf; + } + + /* Start local port initiatialization */ + + lp->link_up = 0; + lp->tt = fnic_transport_template; + + lp->emp = fc_exch_mgr_alloc(lp, FC_CLASS_3, + FCPIO_HOST_EXCH_RANGE_START, + FCPIO_HOST_EXCH_RANGE_END); + if (!lp->emp) { + err = -ENOMEM; + goto err_out_remove_scsi_host; + } + + lp->max_retry_count = fnic->config.flogi_retries; + lp->service_params = (FCP_SPPF_INIT_FCN | FCP_SPPF_RD_XRDY_DIS | + FCP_SPPF_CONF_COMPL); + if (fnic->config.flags & VFCF_FCP_SEQ_LVL_ERR) + lp->service_params |= FCP_SPPF_RETRY; + + lp->boot_time = jiffies; + lp->e_d_tov = fnic->config.ed_tov; + lp->r_a_tov = fnic->config.ra_tov; + lp->link_supported_speeds = FC_PORTSPEED_10GBIT; + fc_set_wwnn(lp, fnic->config.node_wwn); + fc_set_wwpn(lp, fnic->config.port_wwn); + + fc_exch_init(lp); + 
fc_lport_init(lp); + fc_elsct_init(lp); + fc_rport_init(lp); + fc_disc_init(lp); + + fc_lport_config(lp); + + if (fc_set_mfs(lp, fnic->config.maxdatafieldsize + + sizeof(struct fc_frame_header))) { + err = -EINVAL; + goto err_out_free_exch_mgr; + } + fc_host_maxframe_size(lp->host) = lp->mfs; + + sprintf(fc_host_symbolic_name(lp->host), + DRV_NAME " v" DRV_VERSION " over %s", fnic->name); + + spin_lock_irqsave(&fnic_list_lock, flags); + list_add_tail(&fnic->list, &fnic_list); + spin_unlock_irqrestore(&fnic_list_lock, flags); + + INIT_WORK(&fnic->link_work, fnic_handle_link); + INIT_WORK(&fnic->frame_work, fnic_handle_frame); + skb_queue_head_init(&fnic->frame_queue); + + /* Enable all queues */ + for (i = 0; i < fnic->raw_wq_count; i++) + vnic_wq_enable(&fnic->wq[i]); + for (i = 0; i < fnic->rq_count; i++) + vnic_rq_enable(&fnic->rq[i]); + for (i = 0; i < fnic->wq_copy_count; i++) + vnic_wq_copy_enable(&fnic->wq_copy[i]); + + fc_fabric_login(lp); + + vnic_dev_enable(fnic->vdev); + for (i = 0; i < fnic->intr_count; i++) + vnic_intr_unmask(&fnic->intr[i]); + + fnic_notify_timer_start(fnic); + + return 0; + +err_out_free_exch_mgr: + fc_exch_mgr_free(lp->emp); +err_out_remove_scsi_host: + fc_remove_host(fnic->lport->host); + scsi_remove_host(fnic->lport->host); +err_out_free_rq_buf: + for (i = 0; i < fnic->rq_count; i++) + vnic_rq_clean(&fnic->rq[i], fnic_free_rq_buf); + vnic_dev_notify_unset(fnic->vdev); +err_out_free_max_pool: + mempool_destroy(fnic->io_sgl_pool[FNIC_SGL_CACHE_MAX]); +err_out_free_dflt_pool: + mempool_destroy(fnic->io_sgl_pool[FNIC_SGL_CACHE_DFLT]); +err_out_free_ioreq_pool: + mempool_destroy(fnic->io_req_pool); +err_out_free_resources: + fnic_free_vnic_resources(fnic); +err_out_free_intr: + fnic_free_intr(fnic); +err_out_clear_intr: + fnic_clear_intr_mode(fnic); +err_out_dev_close: + vnic_dev_close(fnic->vdev); +err_out_vnic_unregister: + vnic_dev_unregister(fnic->vdev); +err_out_iounmap: + fnic_iounmap(fnic); +err_out_release_regions: + 
pci_release_regions(pdev); +err_out_disable_device: + pci_disable_device(pdev); +err_out_free_hba: + scsi_host_put(lp->host); +err_out: + return err; +} + +static void __devexit fnic_remove(struct pci_dev *pdev) +{ + struct fnic *fnic = pci_get_drvdata(pdev); + unsigned long flags; + + /* + * Mark state so that the workqueue thread stops forwarding + * received frames and link events to the local port. ISR and + * other threads that can queue work items will also stop + * creating work items on the fnic workqueue + */ + spin_lock_irqsave(&fnic->fnic_lock, flags); + fnic->stop_rx_link_events = 1; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + if (vnic_dev_get_intr_mode(fnic->vdev) == VNIC_DEV_INTR_MODE_MSI) + del_timer_sync(&fnic->notify_timer); + + /* + * Flush the fnic event queue. After this call, there should + * be no event queued for this fnic device in the workqueue + */ + flush_workqueue(fnic_event_queue); + skb_queue_purge(&fnic->frame_queue); + + /* + * Log off the fabric. This stops all remote ports, dns port, + * logs off the fabric. This flushes all rport, disc, lport work + * before returning + */ + fc_fabric_logoff(fnic->lport); + + spin_lock_irqsave(&fnic->fnic_lock, flags); + fnic->in_remove = 1; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + fc_lport_destroy(fnic->lport); + + /* + * This stops the fnic device, masks all interrupts. Completed + * CQ entries are drained. 
Posted WQ/RQ/Copy-WQ entries are + * cleaned up + */ + fnic_cleanup(fnic); + + BUG_ON(!skb_queue_empty(&fnic->frame_queue)); + + spin_lock_irqsave(&fnic_list_lock, flags); + list_del(&fnic->list); + spin_unlock_irqrestore(&fnic_list_lock, flags); + + fc_remove_host(fnic->lport->host); + scsi_remove_host(fnic->lport->host); + fc_exch_mgr_free(fnic->lport->emp); + vnic_dev_notify_unset(fnic->vdev); + fnic_free_vnic_resources(fnic); + fnic_free_intr(fnic); + fnic_clear_intr_mode(fnic); + vnic_dev_close(fnic->vdev); + vnic_dev_unregister(fnic->vdev); + fnic_iounmap(fnic); + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + scsi_host_put(fnic->lport->host); +} + +static struct pci_driver fnic_driver = { + .name = DRV_NAME, + .id_table = fnic_id_table, + .probe = fnic_probe, + .remove = __devexit_p(fnic_remove), +}; + +static int __init fnic_init_module(void) +{ + size_t len; + int err = 0; + + printk(KERN_INFO PFX "%s, ver %s\n", DRV_DESCRIPTION, DRV_VERSION); + + /* Create a cache for allocation of default size sgls */ + len = sizeof(struct fnic_dflt_sgl_list); + fnic_sgl_cache[FNIC_SGL_CACHE_DFLT] = kmem_cache_create + ("fnic_sgl_dflt", len + FNIC_SG_DESC_ALIGN, FNIC_SG_DESC_ALIGN, + SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA, + NULL); + if (!fnic_sgl_cache[FNIC_SGL_CACHE_DFLT]) { + printk(KERN_ERR PFX "failed to create fnic dflt sgl slab\n"); + err = -ENOMEM; + goto err_create_fnic_sgl_slab_dflt; + } + + /* Create a cache for allocation of max size sgls*/ + len = sizeof(struct fnic_sgl_list); + fnic_sgl_cache[FNIC_SGL_CACHE_MAX] = kmem_cache_create + ("fnic_sgl_max", len + FNIC_SG_DESC_ALIGN, FNIC_SG_DESC_ALIGN, + SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA, + NULL); + if (!fnic_sgl_cache[FNIC_SGL_CACHE_MAX]) { + printk(KERN_ERR PFX "failed to create fnic max sgl slab\n"); + err = -ENOMEM; + goto err_create_fnic_sgl_slab_max; + } + + /* Create a cache of io_req structs for use via mempool */ + fnic_io_req_cache = 
kmem_cache_create("fnic_io_req", + sizeof(struct fnic_io_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!fnic_io_req_cache) { + printk(KERN_ERR PFX "failed to create fnic io_req slab\n"); + err = -ENOMEM; + goto err_create_fnic_ioreq_slab; + } + + fnic_event_queue = create_singlethread_workqueue("fnic_event_wq"); + if (!fnic_event_queue) { + printk(KERN_ERR PFX "fnic work queue create failed\n"); + err = -ENOMEM; + goto err_create_fnic_workq; + } + + spin_lock_init(&fnic_list_lock); + INIT_LIST_HEAD(&fnic_list); + + fnic_fc_transport = fc_attach_transport(&fnic_fc_functions); + if (!fnic_fc_transport) { + printk(KERN_ERR PFX "fc_attach_transport error\n"); + err = -ENOMEM; + goto err_fc_transport; + } + + /* register the driver with PCI system */ + err = pci_register_driver(&fnic_driver); + if (err < 0) { + printk(KERN_ERR PFX "pci register error\n"); + goto err_pci_register; + } + return err; + +err_pci_register: + fc_release_transport(fnic_fc_transport); +err_fc_transport: + destroy_workqueue(fnic_event_queue); +err_create_fnic_workq: + kmem_cache_destroy(fnic_io_req_cache); +err_create_fnic_ioreq_slab: + kmem_cache_destroy(fnic_sgl_cache[FNIC_SGL_CACHE_MAX]); +err_create_fnic_sgl_slab_max: + kmem_cache_destroy(fnic_sgl_cache[FNIC_SGL_CACHE_DFLT]); +err_create_fnic_sgl_slab_dflt: + return err; +} + +static void __exit fnic_cleanup_module(void) +{ + pci_unregister_driver(&fnic_driver); + destroy_workqueue(fnic_event_queue); + kmem_cache_destroy(fnic_sgl_cache[FNIC_SGL_CACHE_MAX]); + kmem_cache_destroy(fnic_sgl_cache[FNIC_SGL_CACHE_DFLT]); + kmem_cache_destroy(fnic_io_req_cache); + fc_release_transport(fnic_fc_transport); +} + +module_init(fnic_init_module); +module_exit(fnic_cleanup_module); + diff --git a/drivers/scsi/fnic/fnic_res.c b/drivers/scsi/fnic/fnic_res.c new file mode 100644 index 00000000000..7ba61ec715d --- /dev/null +++ b/drivers/scsi/fnic/fnic_res.c @@ -0,0 +1,444 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. 
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include "wq_enet_desc.h" +#include "rq_enet_desc.h" +#include "cq_enet_desc.h" +#include "vnic_resource.h" +#include "vnic_dev.h" +#include "vnic_wq.h" +#include "vnic_rq.h" +#include "vnic_cq.h" +#include "vnic_intr.h" +#include "vnic_stats.h" +#include "vnic_nic.h" +#include "fnic.h" + +int fnic_get_vnic_config(struct fnic *fnic) +{ + struct vnic_fc_config *c = &fnic->config; + int err; + +#define GET_CONFIG(m) \ + do { \ + err = vnic_dev_spec(fnic->vdev, \ + offsetof(struct vnic_fc_config, m), \ + sizeof(c->m), &c->m); \ + if (err) { \ + shost_printk(KERN_ERR, fnic->lport->host, \ + "Error getting %s, %d\n", #m, \ + err); \ + return err; \ + } \ + } while (0); + + GET_CONFIG(node_wwn); + GET_CONFIG(port_wwn); + GET_CONFIG(wq_enet_desc_count); + GET_CONFIG(wq_copy_desc_count); + GET_CONFIG(rq_desc_count); + GET_CONFIG(maxdatafieldsize); + GET_CONFIG(ed_tov); + GET_CONFIG(ra_tov); + GET_CONFIG(intr_timer); + GET_CONFIG(intr_timer_type); + GET_CONFIG(flags); + GET_CONFIG(flogi_retries); + GET_CONFIG(flogi_timeout); + GET_CONFIG(plogi_retries); + GET_CONFIG(plogi_timeout); + GET_CONFIG(io_throttle_count); + GET_CONFIG(link_down_timeout); + GET_CONFIG(port_down_timeout); + 
GET_CONFIG(port_down_io_retries); + GET_CONFIG(luns_per_tgt); + + c->wq_enet_desc_count = + min_t(u32, VNIC_FNIC_WQ_DESCS_MAX, + max_t(u32, VNIC_FNIC_WQ_DESCS_MIN, + c->wq_enet_desc_count)); + c->wq_enet_desc_count = ALIGN(c->wq_enet_desc_count, 16); + + c->wq_copy_desc_count = + min_t(u32, VNIC_FNIC_WQ_COPY_DESCS_MAX, + max_t(u32, VNIC_FNIC_WQ_COPY_DESCS_MIN, + c->wq_copy_desc_count)); + c->wq_copy_desc_count = ALIGN(c->wq_copy_desc_count, 16); + + c->rq_desc_count = + min_t(u32, VNIC_FNIC_RQ_DESCS_MAX, + max_t(u32, VNIC_FNIC_RQ_DESCS_MIN, + c->rq_desc_count)); + c->rq_desc_count = ALIGN(c->rq_desc_count, 16); + + c->maxdatafieldsize = + min_t(u16, VNIC_FNIC_MAXDATAFIELDSIZE_MAX, + max_t(u16, VNIC_FNIC_MAXDATAFIELDSIZE_MIN, + c->maxdatafieldsize)); + c->ed_tov = + min_t(u32, VNIC_FNIC_EDTOV_MAX, + max_t(u32, VNIC_FNIC_EDTOV_MIN, + c->ed_tov)); + + c->ra_tov = + min_t(u32, VNIC_FNIC_RATOV_MAX, + max_t(u32, VNIC_FNIC_RATOV_MIN, + c->ra_tov)); + + c->flogi_retries = + min_t(u32, VNIC_FNIC_FLOGI_RETRIES_MAX, c->flogi_retries); + + c->flogi_timeout = + min_t(u32, VNIC_FNIC_FLOGI_TIMEOUT_MAX, + max_t(u32, VNIC_FNIC_FLOGI_TIMEOUT_MIN, + c->flogi_timeout)); + + c->plogi_retries = + min_t(u32, VNIC_FNIC_PLOGI_RETRIES_MAX, c->plogi_retries); + + c->plogi_timeout = + min_t(u32, VNIC_FNIC_PLOGI_TIMEOUT_MAX, + max_t(u32, VNIC_FNIC_PLOGI_TIMEOUT_MIN, + c->plogi_timeout)); + + c->io_throttle_count = + min_t(u32, VNIC_FNIC_IO_THROTTLE_COUNT_MAX, + max_t(u32, VNIC_FNIC_IO_THROTTLE_COUNT_MIN, + c->io_throttle_count)); + + c->link_down_timeout = + min_t(u32, VNIC_FNIC_LINK_DOWN_TIMEOUT_MAX, + c->link_down_timeout); + + c->port_down_timeout = + min_t(u32, VNIC_FNIC_PORT_DOWN_TIMEOUT_MAX, + c->port_down_timeout); + + c->port_down_io_retries = + min_t(u32, VNIC_FNIC_PORT_DOWN_IO_RETRIES_MAX, + c->port_down_io_retries); + + c->luns_per_tgt = + min_t(u32, VNIC_FNIC_LUNS_PER_TARGET_MAX, + max_t(u32, VNIC_FNIC_LUNS_PER_TARGET_MIN, + c->luns_per_tgt)); + + c->intr_timer = min_t(u16, 
VNIC_INTR_TIMER_MAX, c->intr_timer); + /* intr_timer_type is used exactly as read; no clamp is applied */ + + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC MAC addr %02x:%02x:%02x:%02x:%02x:%02x " + "wq/wq_copy/rq %d/%d/%d\n", + fnic->mac_addr[0], fnic->mac_addr[1], fnic->mac_addr[2], + fnic->mac_addr[3], fnic->mac_addr[4], fnic->mac_addr[5], + c->wq_enet_desc_count, c->wq_copy_desc_count, + c->rq_desc_count); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC node wwn %llx port wwn %llx\n", + c->node_wwn, c->port_wwn); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC ed_tov %d ra_tov %d\n", + c->ed_tov, c->ra_tov); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC mtu %d intr timer %d\n", + c->maxdatafieldsize, c->intr_timer); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC flags 0x%x luns per tgt %d\n", + c->flags, c->luns_per_tgt); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC flogi_retries %d flogi timeout %d\n", + c->flogi_retries, c->flogi_timeout); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC plogi retries %d plogi timeout %d\n", + c->plogi_retries, c->plogi_timeout); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC io throttle count %d link dn timeout %d\n", + c->io_throttle_count, c->link_down_timeout); + shost_printk(KERN_INFO, fnic->lport->host, + "vNIC port dn io retries %d port dn timeout %d\n", + c->port_down_io_retries, c->port_down_timeout); + + return 0; +} + +int fnic_set_nic_config(struct fnic *fnic, u8 rss_default_cpu, + u8 rss_hash_type, + u8 rss_hash_bits, u8 rss_base_cpu, u8 rss_enable, + u8 tso_ipid_split_en, u8 ig_vlan_strip_en) +{ + u64 a0, a1; + u32 nic_cfg; + int wait = 1000; + + vnic_set_nic_cfg(&nic_cfg, rss_default_cpu, + rss_hash_type, rss_hash_bits, rss_base_cpu, + rss_enable, tso_ipid_split_en, ig_vlan_strip_en); + + a0 = nic_cfg; + a1 = 0; + + return vnic_dev_cmd(fnic->vdev, CMD_NIC_CFG, &a0, &a1, wait); +} + +void fnic_get_res_counts(struct fnic *fnic) +{ + fnic->wq_count = vnic_dev_get_res_count(fnic->vdev,
RES_TYPE_WQ); + fnic->raw_wq_count = fnic->wq_count - 1; + fnic->wq_copy_count = fnic->wq_count - fnic->raw_wq_count; + fnic->rq_count = vnic_dev_get_res_count(fnic->vdev, RES_TYPE_RQ); + fnic->cq_count = vnic_dev_get_res_count(fnic->vdev, RES_TYPE_CQ); + fnic->intr_count = vnic_dev_get_res_count(fnic->vdev, + RES_TYPE_INTR_CTRL); +} + +void fnic_free_vnic_resources(struct fnic *fnic) +{ + unsigned int i; + + for (i = 0; i < fnic->raw_wq_count; i++) + vnic_wq_free(&fnic->wq[i]); + + for (i = 0; i < fnic->wq_copy_count; i++) + vnic_wq_copy_free(&fnic->wq_copy[i]); + + for (i = 0; i < fnic->rq_count; i++) + vnic_rq_free(&fnic->rq[i]); + + for (i = 0; i < fnic->cq_count; i++) + vnic_cq_free(&fnic->cq[i]); + + for (i = 0; i < fnic->intr_count; i++) + vnic_intr_free(&fnic->intr[i]); +} + +int fnic_alloc_vnic_resources(struct fnic *fnic) +{ + enum vnic_dev_intr_mode intr_mode; + unsigned int mask_on_assertion; + unsigned int interrupt_offset; + unsigned int error_interrupt_enable; + unsigned int error_interrupt_offset; + unsigned int i, cq_index; + unsigned int wq_copy_cq_desc_count; + int err; + + intr_mode = vnic_dev_get_intr_mode(fnic->vdev); + + shost_printk(KERN_INFO, fnic->lport->host, "vNIC interrupt mode: %s\n", + intr_mode == VNIC_DEV_INTR_MODE_INTX ? "legacy PCI INTx" : + intr_mode == VNIC_DEV_INTR_MODE_MSI ? "MSI" : + intr_mode == VNIC_DEV_INTR_MODE_MSIX ? 
+ "MSI-X" : "unknown"); + + shost_printk(KERN_INFO, fnic->lport->host, "vNIC resources avail: " + "wq %d cp_wq %d raw_wq %d rq %d cq %d intr %d\n", + fnic->wq_count, fnic->wq_copy_count, fnic->raw_wq_count, + fnic->rq_count, fnic->cq_count, fnic->intr_count); + + /* Allocate Raw WQ used for FCS frames */ + for (i = 0; i < fnic->raw_wq_count; i++) { + err = vnic_wq_alloc(fnic->vdev, &fnic->wq[i], i, + fnic->config.wq_enet_desc_count, + sizeof(struct wq_enet_desc)); + if (err) + goto err_out_cleanup; + } + + /* Allocate Copy WQs used for SCSI IOs */ + for (i = 0; i < fnic->wq_copy_count; i++) { + err = vnic_wq_copy_alloc(fnic->vdev, &fnic->wq_copy[i], + (fnic->raw_wq_count + i), + fnic->config.wq_copy_desc_count, + sizeof(struct fcpio_host_req)); + if (err) + goto err_out_cleanup; + } + + /* RQ for receiving FCS frames */ + for (i = 0; i < fnic->rq_count; i++) { + err = vnic_rq_alloc(fnic->vdev, &fnic->rq[i], i, + fnic->config.rq_desc_count, + sizeof(struct rq_enet_desc)); + if (err) + goto err_out_cleanup; + } + + /* CQ for each RQ */ + for (i = 0; i < fnic->rq_count; i++) { + cq_index = i; + err = vnic_cq_alloc(fnic->vdev, + &fnic->cq[cq_index], cq_index, + fnic->config.rq_desc_count, + sizeof(struct cq_enet_rq_desc)); + if (err) + goto err_out_cleanup; + } + + /* CQ for each WQ */ + for (i = 0; i < fnic->raw_wq_count; i++) { + cq_index = fnic->rq_count + i; + err = vnic_cq_alloc(fnic->vdev, &fnic->cq[cq_index], cq_index, + fnic->config.wq_enet_desc_count, + sizeof(struct cq_enet_wq_desc)); + if (err) + goto err_out_cleanup; + } + + /* CQ for each COPY WQ */ + wq_copy_cq_desc_count = (fnic->config.wq_copy_desc_count * 3); + for (i = 0; i < fnic->wq_copy_count; i++) { + cq_index = fnic->raw_wq_count + fnic->rq_count + i; + err = vnic_cq_alloc(fnic->vdev, &fnic->cq[cq_index], + cq_index, + wq_copy_cq_desc_count, + sizeof(struct fcpio_fw_req)); + if (err) + goto err_out_cleanup; + } + + for (i = 0; i < fnic->intr_count; i++) { + err = vnic_intr_alloc(fnic->vdev, 
&fnic->intr[i], i); + if (err) + goto err_out_cleanup; + } + + fnic->legacy_pba = vnic_dev_get_res(fnic->vdev, + RES_TYPE_INTR_PBA_LEGACY, 0); + + if (!fnic->legacy_pba && intr_mode == VNIC_DEV_INTR_MODE_INTX) { + shost_printk(KERN_ERR, fnic->lport->host, + "Failed to hook legacy pba resource\n"); + err = -ENODEV; + goto err_out_cleanup; + } + + /* + * Init RQ/WQ resources. + * + * RQ[0 to n-1] point to CQ[0 to n-1] + * WQ[0 to m-1] point to CQ[n to n+m-1] + * WQ_COPY[0 to k-1] points to CQ[n+m to n+m+k-1] + * + * Note for copy wq we always initialize with cq_index = 0 + * + * Error interrupt is not enabled for MSI. + */ + + switch (intr_mode) { + case VNIC_DEV_INTR_MODE_INTX: + case VNIC_DEV_INTR_MODE_MSIX: + error_interrupt_enable = 1; + error_interrupt_offset = fnic->err_intr_offset; + break; + default: + error_interrupt_enable = 0; + error_interrupt_offset = 0; + break; + } + + for (i = 0; i < fnic->rq_count; i++) { + cq_index = i; + vnic_rq_init(&fnic->rq[i], + cq_index, + error_interrupt_enable, + error_interrupt_offset); + } + + for (i = 0; i < fnic->raw_wq_count; i++) { + cq_index = i + fnic->rq_count; + vnic_wq_init(&fnic->wq[i], + cq_index, + error_interrupt_enable, + error_interrupt_offset); + } + + for (i = 0; i < fnic->wq_copy_count; i++) { + vnic_wq_copy_init(&fnic->wq_copy[i], + 0 /* cq_index 0 - always */, + error_interrupt_enable, + error_interrupt_offset); + } + + for (i = 0; i < fnic->cq_count; i++) { + + switch (intr_mode) { + case VNIC_DEV_INTR_MODE_MSIX: + interrupt_offset = i; + break; + default: + interrupt_offset = 0; + break; + } + + vnic_cq_init(&fnic->cq[i], + 0 /* flow_control_enable */, + 1 /* color_enable */, + 0 /* cq_head */, + 0 /* cq_tail */, + 1 /* cq_tail_color */, + 1 /* interrupt_enable */, + 1 /* cq_entry_enable */, + 0 /* cq_message_enable */, + interrupt_offset, + 0 /* cq_message_addr */); + } + + /* + * Init INTR resources + * + * mask_on_assertion is not used for INTx due to the level- + * triggered nature of INTx + */ + 
+ switch (intr_mode) { + case VNIC_DEV_INTR_MODE_MSI: + case VNIC_DEV_INTR_MODE_MSIX: + mask_on_assertion = 1; + break; + default: + mask_on_assertion = 0; + break; + } + + for (i = 0; i < fnic->intr_count; i++) { + vnic_intr_init(&fnic->intr[i], + fnic->config.intr_timer, + fnic->config.intr_timer_type, + mask_on_assertion); + } + + /* init the stats memory by making the first call here */ + err = vnic_dev_stats_dump(fnic->vdev, &fnic->stats); + if (err) { + shost_printk(KERN_ERR, fnic->lport->host, + "vnic_dev_stats_dump failed - x%x\n", err); + goto err_out_cleanup; + } + + /* Clear LIF stats */ + vnic_dev_stats_clear(fnic->vdev); + + return 0; + +err_out_cleanup: + fnic_free_vnic_resources(fnic); + + return err; +} diff --git a/drivers/scsi/fnic/fnic_res.h b/drivers/scsi/fnic/fnic_res.h new file mode 100644 index 00000000000..b6f31026253 --- /dev/null +++ b/drivers/scsi/fnic/fnic_res.h @@ -0,0 +1,197 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef _FNIC_RES_H_
+#define _FNIC_RES_H_
+
+#include "wq_enet_desc.h"
+#include "rq_enet_desc.h"
+#include "vnic_wq.h"
+#include "vnic_rq.h"
+#include "fnic_io.h"
+#include "fcpio.h"
+#include "vnic_wq_copy.h"
+#include "vnic_cq_copy.h"
+
+/*
+ * Encode the next descriptor of a raw work queue for the buffer at
+ * dma_addr and post it. The descriptor is always built with
+ * fcoe_encap = 1 and with mss/csum offset, offload mode and loopback
+ * all set to 0.
+ */
+static inline void fnic_queue_wq_desc(struct vnic_wq *wq,
+				      void *os_buf, dma_addr_t dma_addr,
+				      unsigned int len, unsigned int fc_eof,
+				      int vlan_tag_insert,
+				      unsigned int vlan_tag,
+				      int cq_entry, int sop, int eop)
+{
+	struct wq_enet_desc *wqd = vnic_wq_next_desc(wq);
+
+	wq_enet_desc_enc(wqd,
+			 (u64)dma_addr | VNIC_PADDR_TARGET,
+			 (u16)len,
+			 0, /* mss_or_csum_offset: unused */
+			 (u16)fc_eof,
+			 0, /* offload_mode: none */
+			 (u8)eop, (u8)cq_entry,
+			 1, /* fcoe_encap: always on */
+			 (u8)vlan_tag_insert,
+			 (u16)vlan_tag,
+			 0 /* loopback: off */);
+
+	vnic_wq_post(wq, os_buf, dma_addr, len, sop, eop);
+}
+
+/*
+ * Fill the next copy work queue descriptor with an FCPIO_ICMND_16
+ * request built from the arguments and post it. All reserved fields
+ * are written as 0.
+ */
+static inline void fnic_queue_wq_copy_desc_icmnd_16(struct vnic_wq_copy *wq,
+						    u32 req_id,
+						    u32 lunmap_id, u8 spl_flags,
+						    u32 sgl_cnt, u32 sense_len,
+						    u64 sgl_addr, u64 sns_addr,
+						    u8 crn, u8 pri_ta,
+						    u8 flags, u8 *scsi_cdb,
+						    u32 data_len, u8 *lun,
+						    u32 d_id, u16 mss,
+						    u32 ratov, u32 edtov)
+{
+	struct fcpio_host_req *req = vnic_wq_copy_next_desc(wq);
+
+	/* common request header */
+	req->hdr.type = FCPIO_ICMND_16;		/* request type */
+	req->hdr.status = 0;			/* status starts out clear */
+	req->hdr._resvd = 0;
+	req->hdr.tag.u.req_id = req_id;		/* caller's request id */
+
+	/* ICMND_16 payload */
+	req->u.icmnd_16.lunmap_id = lunmap_id;	/* lunmap table index */
+	req->u.icmnd_16.special_req_flags = spl_flags;	/* exchange flags */
+	req->u.icmnd_16._resvd0[0] = 0;
+	req->u.icmnd_16._resvd0[1] = 0;
+	req->u.icmnd_16._resvd0[2] = 0;
+	req->u.icmnd_16.sgl_cnt = sgl_cnt;	/* number of SG entries */
+	req->u.icmnd_16.sense_len = sense_len;	/* sense buffer size */
+	req->u.icmnd_16.sgl_addr = sgl_addr;	/* SG list address */
+	req->u.icmnd_16.sense_addr = sns_addr;	/* sense buffer address */
+	req->u.icmnd_16.crn = crn;		/* command reference number */
+	req->u.icmnd_16.pri_ta = pri_ta;	/* priority, task attribute */
+	req->u.icmnd_16._resvd1 = 0;		/* must be 0 */
+	req->u.icmnd_16.flags = flags;		/* command flags */
+	memcpy(req->u.icmnd_16.scsi_cdb, scsi_cdb, CDB_16);	/* CDB */
+	req->u.icmnd_16.data_len = data_len;	/* expected data length */
+	memcpy(req->u.icmnd_16.lun, lun, LUN_ADDRESS);	/* LUN */
+	req->u.icmnd_16._resvd2 = 0;
+	hton24(req->u.icmnd_16.d_id, d_id);	/* FC vNIC only: target D_ID */
+	req->u.icmnd_16.mss = mss;		/* FC vNIC only: max burst */
+	req->u.icmnd_16.r_a_tov = ratov;	/* FC vNIC only: R_A_TOV */
+	req->u.icmnd_16.e_d_tov = edtov;	/* FC vNIC only: E_D_TOV */
+
+	vnic_wq_copy_post(wq);
+}
+
+/*
+ * Fill the next copy work queue descriptor with an FCPIO_ITMF (task
+ * management) request and post it.
+ */
+static inline void fnic_queue_wq_copy_desc_itmf(struct vnic_wq_copy *wq,
+						u32 req_id, u32 lunmap_id,
+						u32 tm_req, u32 tm_id, u8 *lun,
+						u32 d_id, u32 r_a_tov,
+						u32 e_d_tov)
+{
+	struct fcpio_host_req *req = vnic_wq_copy_next_desc(wq);
+
+	/* common request header */
+	req->hdr.type = FCPIO_ITMF;		/* request type */
+	req->hdr.status = 0;			/* status starts out clear */
+	req->hdr._resvd = 0;
+	req->hdr.tag.u.req_id = req_id;		/* caller's request id */
+
+	/* ITMF payload */
+	req->u.itmf.lunmap_id = lunmap_id;	/* lunmap table index */
+	req->u.itmf.tm_req = tm_req;		/* task management request */
+	req->u.itmf.t_tag = tm_id;		/* tag of the fcpio to abort */
+	req->u.itmf._resvd = 0;
+	memcpy(req->u.itmf.lun, lun, LUN_ADDRESS);	/* LUN */
+	req->u.itmf._resvd1 = 0;
+	hton24(req->u.itmf.d_id, d_id);		/* FC vNIC only: target D_ID */
+	req->u.itmf.r_a_tov = r_a_tov;		/* FC vNIC only: msec */
+	req->u.itmf.e_d_tov = e_d_tov;		/* FC vNIC only: msec */
+
+	vnic_wq_copy_post(wq);
+}
+
+/*
+ * Fill the next copy work queue descriptor with an FCPIO_FLOGI_REG
+ * request (format, source id and gateway MAC) and post it.
+ */
+static inline void fnic_queue_wq_copy_desc_flogi_reg(struct vnic_wq_copy *wq,
+						     u32 req_id, u8 format,
+						     u32 s_id, u8 *gw_mac)
+{
+	struct fcpio_host_req *req = vnic_wq_copy_next_desc(wq);
+
+	/* common request header */
+	req->hdr.type = FCPIO_FLOGI_REG;	/* request type */
+	req->hdr.status = 0;			/* status starts out clear */
+	req->hdr._resvd = 0;
+	req->hdr.tag.u.req_id = req_id;		/* caller's request id */
+
+	req->u.flogi_reg.format = format;
+	hton24(req->u.flogi_reg.s_id, s_id);
+	memcpy(req->u.flogi_reg.gateway_mac, gw_mac, ETH_ALEN);
+
+	vnic_wq_copy_post(wq);
+}
+
+/*
+ * Post an FCPIO_RESET request; it carries a header only.
+ */
+static inline void fnic_queue_wq_copy_desc_fw_reset(struct vnic_wq_copy *wq,
+						    u32 req_id)
+{
+	struct fcpio_host_req *req = vnic_wq_copy_next_desc(wq);
+
+	req->hdr.type = FCPIO_RESET;		/* request type */
+	req->hdr.status = 0;			/* status starts out clear */
+	req->hdr._resvd = 0;
+	req->hdr.tag.u.req_id = req_id;		/* caller's request id */
+
+	vnic_wq_copy_post(wq);
+}
+
+/*
+ * Post an FCPIO_LUNMAP_REQ request describing a buffer by address
+ * and length.
+ */
+static inline void fnic_queue_wq_copy_desc_lunmap(struct vnic_wq_copy *wq,
+						  u32 req_id, u64 lunmap_addr,
+						  u32 lunmap_len)
+{
+	struct fcpio_host_req *req = vnic_wq_copy_next_desc(wq);
+
+	req->hdr.type = FCPIO_LUNMAP_REQ;	/* request type */
+	req->hdr.status = 0;			/* status starts out clear */
+	req->hdr._resvd = 0;
+	req->hdr.tag.u.req_id = req_id;		/* caller's request id */
+
+	req->u.lunmap_req.addr = lunmap_addr;	/* buffer address */
+	req->u.lunmap_req.len = lunmap_len;	/* buffer length */
+
+	vnic_wq_copy_post(wq);
+}
+
+/*
+ * Encode the next receive queue descriptor for the buffer at dma_addr
+ * (type RQ_ENET_TYPE_ONLY_SOP) and post it.
+ */
+static inline void fnic_queue_rq_desc(struct vnic_rq *rq,
+				      void *os_buf, dma_addr_t dma_addr,
+				      u16 len)
+{
+	struct rq_enet_desc *rqd = vnic_rq_next_desc(rq);
+
+	rq_enet_desc_enc(rqd,
+			 (u64)dma_addr | VNIC_PADDR_TARGET,
+			 RQ_ENET_TYPE_ONLY_SOP,
+			 (u16)len);
+
+	vnic_rq_post(rq, os_buf, 0, dma_addr, len);
+}
+
+
+struct fnic;
+
+int fnic_get_vnic_config(struct fnic *);
+int fnic_alloc_vnic_resources(struct fnic *);
+void fnic_free_vnic_resources(struct fnic *);
+void fnic_get_res_counts(struct fnic *);
+int fnic_set_nic_config(struct fnic *fnic, u8 rss_default_cpu,
+			u8 rss_hash_type, u8 rss_hash_bits, u8 rss_base_cpu,
+ u8 rss_enable, u8 tso_ipid_split_en, + u8 ig_vlan_strip_en); + +#endif /* _FNIC_RES_H_ */ diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c new file mode 100644 index 00000000000..eabf3650285 --- /dev/null +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -0,0 +1,1850 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fnic_io.h" +#include "fnic.h" + +const char *fnic_state_str[] = { + [FNIC_IN_FC_MODE] = "FNIC_IN_FC_MODE", + [FNIC_IN_FC_TRANS_ETH_MODE] = "FNIC_IN_FC_TRANS_ETH_MODE", + [FNIC_IN_ETH_MODE] = "FNIC_IN_ETH_MODE", + [FNIC_IN_ETH_TRANS_FC_MODE] = "FNIC_IN_ETH_TRANS_FC_MODE", +}; + +static const char *fnic_ioreq_state_str[] = { + [FNIC_IOREQ_CMD_PENDING] = "FNIC_IOREQ_CMD_PENDING", + [FNIC_IOREQ_ABTS_PENDING] = "FNIC_IOREQ_ABTS_PENDING", + [FNIC_IOREQ_ABTS_COMPLETE] = "FNIC_IOREQ_ABTS_COMPLETE", + [FNIC_IOREQ_CMD_COMPLETE] = "FNIC_IOREQ_CMD_COMPLETE", +}; + +static const char *fcpio_status_str[] = { + [FCPIO_SUCCESS] = "FCPIO_SUCCESS", /*0x0*/ + [FCPIO_INVALID_HEADER] = "FCPIO_INVALID_HEADER", + [FCPIO_OUT_OF_RESOURCE] = "FCPIO_OUT_OF_RESOURCE", + [FCPIO_INVALID_PARAM] = "FCPIO_INVALID_PARAM]", + [FCPIO_REQ_NOT_SUPPORTED] = "FCPIO_REQ_NOT_SUPPORTED", + [FCPIO_IO_NOT_FOUND] = "FCPIO_IO_NOT_FOUND", + [FCPIO_ABORTED] = "FCPIO_ABORTED", /*0x41*/ + [FCPIO_TIMEOUT] = "FCPIO_TIMEOUT", + [FCPIO_SGL_INVALID] = "FCPIO_SGL_INVALID", + [FCPIO_MSS_INVALID] = "FCPIO_MSS_INVALID", + [FCPIO_DATA_CNT_MISMATCH] = "FCPIO_DATA_CNT_MISMATCH", + [FCPIO_FW_ERR] = "FCPIO_FW_ERR", + [FCPIO_ITMF_REJECTED] = "FCPIO_ITMF_REJECTED", + [FCPIO_ITMF_FAILED] = "FCPIO_ITMF_FAILED", + [FCPIO_ITMF_INCORRECT_LUN] = "FCPIO_ITMF_INCORRECT_LUN", + [FCPIO_CMND_REJECTED] = "FCPIO_CMND_REJECTED", + [FCPIO_NO_PATH_AVAIL] = "FCPIO_NO_PATH_AVAIL", + [FCPIO_PATH_FAILED] = "FCPIO_PATH_FAILED", + [FCPIO_LUNMAP_CHNG_PEND] = "FCPIO_LUNHMAP_CHNG_PEND", +}; + +const char *fnic_state_to_str(unsigned int state) +{ + if (state >= ARRAY_SIZE(fnic_state_str) || !fnic_state_str[state]) + return "unknown"; + + return fnic_state_str[state]; +} + +static const char *fnic_ioreq_state_to_str(unsigned int state) +{ + if 
(state >= ARRAY_SIZE(fnic_ioreq_state_str) || + !fnic_ioreq_state_str[state]) + return "unknown"; + + return fnic_ioreq_state_str[state]; +} + +static const char *fnic_fcpio_status_to_str(unsigned int status) +{ + if (status >= ARRAY_SIZE(fcpio_status_str) || !fcpio_status_str[status]) + return "unknown"; + + return fcpio_status_str[status]; +} + +static void fnic_cleanup_io(struct fnic *fnic, int exclude_id); + +static inline spinlock_t *fnic_io_lock_hash(struct fnic *fnic, + struct scsi_cmnd *sc) +{ + u32 hash = sc->request->tag & (FNIC_IO_LOCKS - 1); + + return &fnic->io_req_lock[hash]; +} + +/* + * Unmap the data buffer and sense buffer for an io_req, + * also unmap and free the device-private scatter/gather list. + */ +static void fnic_release_ioreq_buf(struct fnic *fnic, + struct fnic_io_req *io_req, + struct scsi_cmnd *sc) +{ + if (io_req->sgl_list_pa) + pci_unmap_single(fnic->pdev, io_req->sgl_list_pa, + sizeof(io_req->sgl_list[0]) * io_req->sgl_cnt, + PCI_DMA_TODEVICE); + scsi_dma_unmap(sc); + + if (io_req->sgl_cnt) + mempool_free(io_req->sgl_list_alloc, + fnic->io_sgl_pool[io_req->sgl_type]); + if (io_req->sense_buf_pa) + pci_unmap_single(fnic->pdev, io_req->sense_buf_pa, + SCSI_SENSE_BUFFERSIZE, PCI_DMA_FROMDEVICE); +} + +/* Free up Copy Wq descriptors. 
Called with copy_wq lock held */ +static int free_wq_copy_descs(struct fnic *fnic, struct vnic_wq_copy *wq) +{ + /* if no Ack received from firmware, then nothing to clean */ + if (!fnic->fw_ack_recd[0]) + return 1; + + /* + * Update desc_available count based on number of freed descriptors + * Account for wraparound + */ + if (wq->to_clean_index <= fnic->fw_ack_index[0]) + wq->ring.desc_avail += (fnic->fw_ack_index[0] + - wq->to_clean_index + 1); + else + wq->ring.desc_avail += (wq->ring.desc_count + - wq->to_clean_index + + fnic->fw_ack_index[0] + 1); + + /* + * just bump clean index to ack_index+1 accounting for wraparound + * this will essentially free up all descriptors between + * to_clean_index and fw_ack_index, both inclusive + */ + wq->to_clean_index = + (fnic->fw_ack_index[0] + 1) % wq->ring.desc_count; + + /* we have processed the acks received so far */ + fnic->fw_ack_recd[0] = 0; + return 0; +} + + +/* + * fnic_fw_reset_handler + * Routine to send reset msg to fw + */ +int fnic_fw_reset_handler(struct fnic *fnic) +{ + struct vnic_wq_copy *wq = &fnic->wq_copy[0]; + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + + if (vnic_wq_copy_desc_avail(wq) <= fnic->wq_copy_desc_low[0]) + free_wq_copy_descs(fnic, wq); + + if (!vnic_wq_copy_desc_avail(wq)) + ret = -EAGAIN; + else + fnic_queue_wq_copy_desc_fw_reset(wq, SCSI_NO_TAG); + + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], flags); + + if (!ret) + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Issued fw reset\n"); + else + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Failed to issue fw reset\n"); + return ret; +} + + +/* + * fnic_flogi_reg_handler + * Routine to send flogi register msg to fw + */ +int fnic_flogi_reg_handler(struct fnic *fnic) +{ + struct vnic_wq_copy *wq = &fnic->wq_copy[0]; + u8 gw_mac[ETH_ALEN]; + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + + if (vnic_wq_copy_desc_avail(wq) <= 
fnic->wq_copy_desc_low[0]) + free_wq_copy_descs(fnic, wq); + + if (!vnic_wq_copy_desc_avail(wq)) { + ret = -EAGAIN; + goto flogi_reg_ioreq_end; + } + + if (fnic->fcoui_mode) + memset(gw_mac, 0xff, ETH_ALEN); + else + memcpy(gw_mac, fnic->dest_addr, ETH_ALEN); + + fnic_queue_wq_copy_desc_flogi_reg(wq, SCSI_NO_TAG, + FCPIO_FLOGI_REG_GW_DEST, + fnic->s_id, + gw_mac); + +flogi_reg_ioreq_end: + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], flags); + + if (!ret) + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "flog reg issued\n"); + + return ret; +} + +/* + * fnic_queue_wq_copy_desc + * Routine to enqueue a wq copy desc + */ +static inline int fnic_queue_wq_copy_desc(struct fnic *fnic, + struct vnic_wq_copy *wq, + struct fnic_io_req *io_req, + struct scsi_cmnd *sc, + u32 sg_count) +{ + struct scatterlist *sg; + struct fc_rport *rport = starget_to_rport(scsi_target(sc->device)); + struct fc_rport_libfc_priv *rp = rport->dd_data; + struct host_sg_desc *desc; + u8 pri_tag = 0; + unsigned int i; + unsigned long intr_flags; + int flags; + u8 exch_flags; + struct scsi_lun fc_lun; + char msg[2]; + + if (sg_count) { + BUG_ON(sg_count < 0); + BUG_ON(sg_count > FNIC_MAX_SG_DESC_CNT); + + /* For each SGE, create a device desc entry */ + desc = io_req->sgl_list; + for_each_sg(scsi_sglist(sc), sg, sg_count, i) { + desc->addr = cpu_to_le64(sg_dma_address(sg)); + desc->len = cpu_to_le32(sg_dma_len(sg)); + desc->_resvd = 0; + desc++; + } + + io_req->sgl_list_pa = pci_map_single + (fnic->pdev, + io_req->sgl_list, + sizeof(io_req->sgl_list[0]) * sg_count, + PCI_DMA_TODEVICE); + } + + io_req->sense_buf_pa = pci_map_single(fnic->pdev, + sc->sense_buffer, + SCSI_SENSE_BUFFERSIZE, + PCI_DMA_FROMDEVICE); + + int_to_scsilun(sc->device->lun, &fc_lun); + + pri_tag = FCPIO_ICMND_PTA_SIMPLE; + msg[0] = MSG_SIMPLE_TAG; + scsi_populate_tag_msg(sc, msg); + if (msg[0] == MSG_ORDERED_TAG) + pri_tag = FCPIO_ICMND_PTA_ORDERED; + + /* Enqueue the descriptor in the Copy WQ */ + 
spin_lock_irqsave(&fnic->wq_copy_lock[0], intr_flags); + + if (vnic_wq_copy_desc_avail(wq) <= fnic->wq_copy_desc_low[0]) + free_wq_copy_descs(fnic, wq); + + if (unlikely(!vnic_wq_copy_desc_avail(wq))) { + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], intr_flags); + return SCSI_MLQUEUE_HOST_BUSY; + } + + flags = 0; + if (sc->sc_data_direction == DMA_FROM_DEVICE) + flags = FCPIO_ICMND_RDDATA; + else if (sc->sc_data_direction == DMA_TO_DEVICE) + flags = FCPIO_ICMND_WRDATA; + + exch_flags = 0; + if ((fnic->config.flags & VFCF_FCP_SEQ_LVL_ERR) && + (rp->flags & FC_RP_FLAGS_RETRY)) + exch_flags |= FCPIO_ICMND_SRFLAG_RETRY; + + fnic_queue_wq_copy_desc_icmnd_16(wq, sc->request->tag, + 0, exch_flags, io_req->sgl_cnt, + SCSI_SENSE_BUFFERSIZE, + io_req->sgl_list_pa, + io_req->sense_buf_pa, + 0, /* scsi cmd ref, always 0 */ + pri_tag, /* scsi pri and tag */ + flags, /* command flags */ + sc->cmnd, scsi_bufflen(sc), + fc_lun.scsi_lun, io_req->port_id, + rport->maxframe_size, rp->r_a_tov, + rp->e_d_tov); + + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], intr_flags); + return 0; +} + +/* + * fnic_queuecommand + * Routine to send a scsi cdb + * Called with host_lock held and interrupts disabled. + */ +int fnic_queuecommand(struct scsi_cmnd *sc, void (*done)(struct scsi_cmnd *)) +{ + struct fc_lport *lp; + struct fc_rport *rport; + struct fnic_io_req *io_req; + struct fnic *fnic; + struct vnic_wq_copy *wq; + int ret; + u32 sg_count; + unsigned long flags; + unsigned long ptr; + + rport = starget_to_rport(scsi_target(sc->device)); + ret = fc_remote_port_chkready(rport); + if (ret) { + sc->result = ret; + done(sc); + return 0; + } + + lp = shost_priv(sc->device->host); + if (lp->state != LPORT_ST_READY || !(lp->link_up)) + return SCSI_MLQUEUE_HOST_BUSY; + + /* + * Release host lock, use driver resource specific locks from here. + * Don't re-enable interrupts in case they were disabled prior to the + * caller disabling them. 
+ */ + spin_unlock(lp->host->host_lock); + + /* Get a new io_req for this SCSI IO */ + fnic = lport_priv(lp); + + io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC); + if (!io_req) { + ret = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + memset(io_req, 0, sizeof(*io_req)); + + /* Map the data buffer */ + sg_count = scsi_dma_map(sc); + if (sg_count < 0) { + mempool_free(io_req, fnic->io_req_pool); + goto out; + } + + /* Determine the type of scatter/gather list we need */ + io_req->sgl_cnt = sg_count; + io_req->sgl_type = FNIC_SGL_CACHE_DFLT; + if (sg_count > FNIC_DFLT_SG_DESC_CNT) + io_req->sgl_type = FNIC_SGL_CACHE_MAX; + + if (sg_count) { + io_req->sgl_list = + mempool_alloc(fnic->io_sgl_pool[io_req->sgl_type], + GFP_ATOMIC | GFP_DMA); + if (!io_req->sgl_list) { + ret = SCSI_MLQUEUE_HOST_BUSY; + scsi_dma_unmap(sc); + mempool_free(io_req, fnic->io_req_pool); + goto out; + } + + /* Cache sgl list allocated address before alignment */ + io_req->sgl_list_alloc = io_req->sgl_list; + ptr = (unsigned long) io_req->sgl_list; + if (ptr % FNIC_SG_DESC_ALIGN) { + io_req->sgl_list = (struct host_sg_desc *) + (((unsigned long) ptr + + FNIC_SG_DESC_ALIGN - 1) + & ~(FNIC_SG_DESC_ALIGN - 1)); + } + } + + /* initialize rest of io_req */ + io_req->port_id = rport->port_id; + CMD_STATE(sc) = FNIC_IOREQ_CMD_PENDING; + CMD_SP(sc) = (char *)io_req; + sc->scsi_done = done; + + /* create copy wq desc and enqueue it */ + wq = &fnic->wq_copy[0]; + ret = fnic_queue_wq_copy_desc(fnic, wq, io_req, sc, sg_count); + if (ret) { + /* + * In case another thread cancelled the request, + * refetch the pointer under the lock. 
+ */ + spinlock_t *io_lock = fnic_io_lock_hash(fnic, sc); + + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + CMD_SP(sc) = NULL; + CMD_STATE(sc) = FNIC_IOREQ_CMD_COMPLETE; + spin_unlock_irqrestore(io_lock, flags); + if (io_req) { + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + } + } +out: + /* acquire host lock before returning to SCSI */ + spin_lock(lp->host->host_lock); + return ret; +} + +/* + * fnic_fcpio_fw_reset_cmpl_handler + * Routine to handle fw reset completion + */ +static int fnic_fcpio_fw_reset_cmpl_handler(struct fnic *fnic, + struct fcpio_fw_req *desc) +{ + u8 type; + u8 hdr_status; + struct fcpio_tag tag; + int ret = 0; + struct fc_frame *flogi; + unsigned long flags; + + fcpio_header_dec(&desc->hdr, &type, &hdr_status, &tag); + + /* Clean up all outstanding io requests */ + fnic_cleanup_io(fnic, SCSI_NO_TAG); + + spin_lock_irqsave(&fnic->fnic_lock, flags); + + flogi = fnic->flogi; + fnic->flogi = NULL; + + /* fnic should be in FC_TRANS_ETH_MODE */ + if (fnic->state == FNIC_IN_FC_TRANS_ETH_MODE) { + /* Check status of reset completion */ + if (!hdr_status) { + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "reset cmpl success\n"); + /* Ready to send flogi out */ + fnic->state = FNIC_IN_ETH_MODE; + } else { + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "fnic fw_reset : failed %s\n", + fnic_fcpio_status_to_str(hdr_status)); + + /* + * Unable to change to eth mode, cannot send out flogi + * Change state to fc mode, so that subsequent Flogi + * requests from libFC will cause more attempts to + * reset the firmware. 
Free the cached flogi + */ + fnic->state = FNIC_IN_FC_MODE; + ret = -1; + } + } else { + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "Unexpected state %s while processing" + " reset cmpl\n", fnic_state_to_str(fnic->state)); + ret = -1; + } + + /* Thread removing device blocks till firmware reset is complete */ + if (fnic->remove_wait) + complete(fnic->remove_wait); + + /* + * If fnic is being removed, or fw reset failed + * free the flogi frame. Else, send it out + */ + if (fnic->remove_wait || ret) { + fnic->flogi_oxid = FC_XID_UNKNOWN; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + if (flogi) + dev_kfree_skb_irq(fp_skb(flogi)); + goto reset_cmpl_handler_end; + } + + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + if (flogi) + ret = fnic_send_frame(fnic, flogi); + + reset_cmpl_handler_end: + return ret; +} + +/* + * fnic_fcpio_flogi_reg_cmpl_handler + * Routine to handle flogi register completion + */ +static int fnic_fcpio_flogi_reg_cmpl_handler(struct fnic *fnic, + struct fcpio_fw_req *desc) +{ + u8 type; + u8 hdr_status; + struct fcpio_tag tag; + int ret = 0; + struct fc_frame *flogi_resp = NULL; + unsigned long flags; + struct sk_buff *skb; + + fcpio_header_dec(&desc->hdr, &type, &hdr_status, &tag); + + /* Update fnic state based on status of flogi reg completion */ + spin_lock_irqsave(&fnic->fnic_lock, flags); + + flogi_resp = fnic->flogi_resp; + fnic->flogi_resp = NULL; + + if (fnic->state == FNIC_IN_ETH_TRANS_FC_MODE) { + + /* Check flogi registration completion status */ + if (!hdr_status) { + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "flog reg succeeded\n"); + fnic->state = FNIC_IN_FC_MODE; + } else { + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "fnic flogi reg :failed %s\n", + fnic_fcpio_status_to_str(hdr_status)); + fnic->state = FNIC_IN_ETH_MODE; + ret = -1; + } + } else { + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Unexpected fnic state %s while" + " processing flogi reg completion\n", + 
fnic_state_to_str(fnic->state)); + ret = -1; + } + + /* Successful flogi reg cmpl, pass frame to LibFC */ + if (!ret && flogi_resp) { + if (fnic->stop_rx_link_events) { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + goto reg_cmpl_handler_end; + } + skb = (struct sk_buff *)flogi_resp; + /* Use fr_flags to indicate whether flogi resp or not */ + fr_flags(flogi_resp) = 1; + fr_dev(flogi_resp) = fnic->lport; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + skb_queue_tail(&fnic->frame_queue, skb); + queue_work(fnic_event_queue, &fnic->frame_work); + + } else { + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + if (flogi_resp) + dev_kfree_skb_irq(fp_skb(flogi_resp)); + } + +reg_cmpl_handler_end: + return ret; +} + +static inline int is_ack_index_in_range(struct vnic_wq_copy *wq, + u16 request_out) +{ + if (wq->to_clean_index <= wq->to_use_index) { + /* out of range, stale request_out index */ + if (request_out < wq->to_clean_index || + request_out >= wq->to_use_index) + return 0; + } else { + /* out of range, stale request_out index */ + if (request_out < wq->to_clean_index && + request_out >= wq->to_use_index) + return 0; + } + /* request_out index is in range */ + return 1; +} + + +/* + * Mark that ack received and store the Ack index. If there are multiple + * acks received before Tx thread cleans it up, the latest value will be + * used which is correct behavior. 
This state should be in the copy Wq + * instead of in the fnic + */ +static inline void fnic_fcpio_ack_handler(struct fnic *fnic, + unsigned int cq_index, + struct fcpio_fw_req *desc) +{ + struct vnic_wq_copy *wq; + u16 request_out = desc->u.ack.request_out; + unsigned long flags; + + /* mark the ack state */ + wq = &fnic->wq_copy[cq_index - fnic->raw_wq_count - fnic->rq_count]; + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + + if (is_ack_index_in_range(wq, request_out)) { + fnic->fw_ack_index[0] = request_out; + fnic->fw_ack_recd[0] = 1; + } + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], flags); +} + +/* + * fnic_fcpio_icmnd_cmpl_handler + * Routine to handle icmnd completions + */ +static void fnic_fcpio_icmnd_cmpl_handler(struct fnic *fnic, + struct fcpio_fw_req *desc) +{ + u8 type; + u8 hdr_status; + struct fcpio_tag tag; + u32 id; + u64 xfer_len = 0; + struct fcpio_icmnd_cmpl *icmnd_cmpl; + struct fnic_io_req *io_req; + struct scsi_cmnd *sc; + unsigned long flags; + spinlock_t *io_lock; + + /* Decode the cmpl description to get the io_req id */ + fcpio_header_dec(&desc->hdr, &type, &hdr_status, &tag); + fcpio_tag_id_dec(&tag, &id); + + if (id >= FNIC_MAX_IO_REQ) + return; + + sc = scsi_host_find_tag(fnic->lport->host, id); + WARN_ON_ONCE(!sc); + if (!sc) + return; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + WARN_ON_ONCE(!io_req); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + return; + } + + /* firmware completed the io */ + io_req->io_completed = 1; + + /* + * if SCSI-ML has already issued abort on this command, + * ignore completion of the IO. 
The abts path will clean it up + */ + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + return; + } + + /* Mark the IO as complete */ + CMD_STATE(sc) = FNIC_IOREQ_CMD_COMPLETE; + + icmnd_cmpl = &desc->u.icmnd_cmpl; + + switch (hdr_status) { + case FCPIO_SUCCESS: + sc->result = (DID_OK << 16) | icmnd_cmpl->scsi_status; + xfer_len = scsi_bufflen(sc); + scsi_set_resid(sc, icmnd_cmpl->residual); + + if (icmnd_cmpl->flags & FCPIO_ICMND_CMPL_RESID_UNDER) + xfer_len -= icmnd_cmpl->residual; + + /* + * If queue_full, then try to reduce queue depth for all + * LUNS on the target. Todo: this should be accompanied + * by a periodic queue_depth rampup based on successful + * IO completion. + */ + if (icmnd_cmpl->scsi_status == QUEUE_FULL) { + struct scsi_device *t_sdev; + int qd = 0; + + shost_for_each_device(t_sdev, sc->device->host) { + if (t_sdev->id != sc->device->id) + continue; + + if (t_sdev->queue_depth > 1) { + qd = scsi_track_queue_full + (t_sdev, + t_sdev->queue_depth - 1); + if (qd == -1) + qd = t_sdev->host->cmd_per_lun; + shost_printk(KERN_INFO, + fnic->lport->host, + "scsi[%d:%d:%d:%d" + "] queue full detected," + "new depth = %d\n", + t_sdev->host->host_no, + t_sdev->channel, + t_sdev->id, t_sdev->lun, + t_sdev->queue_depth); + } + } + } + break; + + case FCPIO_TIMEOUT: /* request was timed out */ + sc->result = (DID_TIME_OUT << 16) | icmnd_cmpl->scsi_status; + break; + + case FCPIO_ABORTED: /* request was aborted */ + sc->result = (DID_ERROR << 16) | icmnd_cmpl->scsi_status; + break; + + case FCPIO_DATA_CNT_MISMATCH: /* recv/sent more/less data than exp. 
*/ + scsi_set_resid(sc, icmnd_cmpl->residual); + sc->result = (DID_ERROR << 16) | icmnd_cmpl->scsi_status; + break; + + case FCPIO_OUT_OF_RESOURCE: /* out of resources to complete request */ + sc->result = (DID_REQUEUE << 16) | icmnd_cmpl->scsi_status; + break; + case FCPIO_INVALID_HEADER: /* header contains invalid data */ + case FCPIO_INVALID_PARAM: /* some parameter in request invalid */ + case FCPIO_REQ_NOT_SUPPORTED:/* request type is not supported */ + case FCPIO_IO_NOT_FOUND: /* requested I/O was not found */ + case FCPIO_SGL_INVALID: /* request was aborted due to sgl error */ + case FCPIO_MSS_INVALID: /* request was aborted due to mss error */ + case FCPIO_FW_ERR: /* request was terminated due fw error */ + default: + shost_printk(KERN_ERR, fnic->lport->host, "hdr status = %s\n", + fnic_fcpio_status_to_str(hdr_status)); + sc->result = (DID_ERROR << 16) | icmnd_cmpl->scsi_status; + break; + } + + /* Break link with the SCSI command */ + CMD_SP(sc) = NULL; + + spin_unlock_irqrestore(io_lock, flags); + + fnic_release_ioreq_buf(fnic, io_req, sc); + + mempool_free(io_req, fnic->io_req_pool); + + if (sc->sc_data_direction == DMA_FROM_DEVICE) { + fnic->lport->host_stats.fcp_input_requests++; + fnic->fcp_input_bytes += xfer_len; + } else if (sc->sc_data_direction == DMA_TO_DEVICE) { + fnic->lport->host_stats.fcp_output_requests++; + fnic->fcp_output_bytes += xfer_len; + } else + fnic->lport->host_stats.fcp_control_requests++; + + /* Call SCSI completion function to complete the IO */ + if (sc->scsi_done) + sc->scsi_done(sc); + +} + +/* fnic_fcpio_itmf_cmpl_handler + * Routine to handle itmf completions + */ +static void fnic_fcpio_itmf_cmpl_handler(struct fnic *fnic, + struct fcpio_fw_req *desc) +{ + u8 type; + u8 hdr_status; + struct fcpio_tag tag; + u32 id; + struct scsi_cmnd *sc; + struct fnic_io_req *io_req; + unsigned long flags; + spinlock_t *io_lock; + + fcpio_header_dec(&desc->hdr, &type, &hdr_status, &tag); + fcpio_tag_id_dec(&tag, &id); + + if ((id & 
FNIC_TAG_MASK) >= FNIC_MAX_IO_REQ) + return; + + sc = scsi_host_find_tag(fnic->lport->host, id & FNIC_TAG_MASK); + WARN_ON_ONCE(!sc); + if (!sc) + return; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + WARN_ON_ONCE(!io_req); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + return; + } + + if (id & FNIC_TAG_ABORT) { + /* Completion of abort cmd */ + if (CMD_STATE(sc) != FNIC_IOREQ_ABTS_PENDING) { + /* This is a late completion. Ignore it */ + spin_unlock_irqrestore(io_lock, flags); + return; + } + CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE; + CMD_ABTS_STATUS(sc) = hdr_status; + + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "abts cmpl recd. id %d status %s\n", + (int)(id & FNIC_TAG_MASK), + fnic_fcpio_status_to_str(hdr_status)); + + /* + * If scsi_eh thread is blocked waiting for abts to complete, + * signal completion to it. IO will be cleaned in the thread + * else clean it in this context + */ + if (io_req->abts_done) { + complete(io_req->abts_done); + spin_unlock_irqrestore(io_lock, flags); + } else { + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "abts cmpl, completing IO\n"); + CMD_SP(sc) = NULL; + sc->result = (DID_ERROR << 16); + + spin_unlock_irqrestore(io_lock, flags); + + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + if (sc->scsi_done) + sc->scsi_done(sc); + } + + } else if (id & FNIC_TAG_DEV_RST) { + /* Completion of device reset */ + CMD_LR_STATUS(sc) = hdr_status; + CMD_STATE(sc) = FNIC_IOREQ_CMD_COMPLETE; + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "dev reset cmpl recd. 
id %d status %s\n", + (int)(id & FNIC_TAG_MASK), + fnic_fcpio_status_to_str(hdr_status)); + if (io_req->dr_done) + complete(io_req->dr_done); + spin_unlock_irqrestore(io_lock, flags); + + } else { + shost_printk(KERN_ERR, fnic->lport->host, + "Unexpected itmf io state %s tag %x\n", + fnic_ioreq_state_to_str(CMD_STATE(sc)), id); + spin_unlock_irqrestore(io_lock, flags); + } + +} + +/* + * fnic_fcpio_cmpl_handler + * Routine to service the cq for wq_copy + */ +static int fnic_fcpio_cmpl_handler(struct vnic_dev *vdev, + unsigned int cq_index, + struct fcpio_fw_req *desc) +{ + struct fnic *fnic = vnic_dev_priv(vdev); + int ret = 0; + + switch (desc->hdr.type) { + case FCPIO_ACK: /* fw copied copy wq desc to its queue */ + fnic_fcpio_ack_handler(fnic, cq_index, desc); + break; + + case FCPIO_ICMND_CMPL: /* fw completed a command */ + fnic_fcpio_icmnd_cmpl_handler(fnic, desc); + break; + + case FCPIO_ITMF_CMPL: /* fw completed itmf (abort cmd, lun reset)*/ + fnic_fcpio_itmf_cmpl_handler(fnic, desc); + break; + + case FCPIO_FLOGI_REG_CMPL: /* fw completed flogi_reg */ + ret = fnic_fcpio_flogi_reg_cmpl_handler(fnic, desc); + break; + + case FCPIO_RESET_CMPL: /* fw completed reset */ + ret = fnic_fcpio_fw_reset_cmpl_handler(fnic, desc); + break; + + default: + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "firmware completion type %d\n", + desc->hdr.type); + break; + } + + return ret; +} + +/* + * fnic_wq_copy_cmpl_handler + * Routine to process wq copy + */ +int fnic_wq_copy_cmpl_handler(struct fnic *fnic, int copy_work_to_do) +{ + unsigned int wq_work_done = 0; + unsigned int i, cq_index; + unsigned int cur_work_done; + + for (i = 0; i < fnic->wq_copy_count; i++) { + cq_index = i + fnic->raw_wq_count + fnic->rq_count; + cur_work_done = vnic_cq_copy_service(&fnic->cq[cq_index], + fnic_fcpio_cmpl_handler, + copy_work_to_do); + wq_work_done += cur_work_done; + } + return wq_work_done; +} + +static void fnic_cleanup_io(struct fnic *fnic, int exclude_id) +{ + unsigned int i; 
+ struct fnic_io_req *io_req; + unsigned long flags = 0; + struct scsi_cmnd *sc; + spinlock_t *io_lock; + + for (i = 0; i < FNIC_MAX_IO_REQ; i++) { + if (i == exclude_id) + continue; + + sc = scsi_host_find_tag(fnic->lport->host, i); + if (!sc) + continue; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + goto cleanup_scsi_cmd; + } + + CMD_SP(sc) = NULL; + + spin_unlock_irqrestore(io_lock, flags); + + /* + * If there is a scsi_cmnd associated with this io_req, then + * free the corresponding state + */ + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + +cleanup_scsi_cmd: + sc->result = DID_TRANSPORT_DISRUPTED << 16; + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, "fnic_cleanup_io:" + " DID_TRANSPORT_DISRUPTED\n"); + + /* Complete the command to SCSI */ + if (sc->scsi_done) + sc->scsi_done(sc); + } +} + +void fnic_wq_copy_cleanup_handler(struct vnic_wq_copy *wq, + struct fcpio_host_req *desc) +{ + u32 id; + struct fnic *fnic = vnic_dev_priv(wq->vdev); + struct fnic_io_req *io_req; + struct scsi_cmnd *sc; + unsigned long flags; + spinlock_t *io_lock; + + /* get the tag reference */ + fcpio_tag_id_dec(&desc->hdr.tag, &id); + id &= FNIC_TAG_MASK; + + if (id >= FNIC_MAX_IO_REQ) + return; + + sc = scsi_host_find_tag(fnic->lport->host, id); + if (!sc) + return; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + + /* Get the IO context which this desc refers to */ + io_req = (struct fnic_io_req *)CMD_SP(sc); + + /* fnic interrupts are turned off by now */ + + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + goto wq_copy_cleanup_scsi_cmd; + } + + CMD_SP(sc) = NULL; + + spin_unlock_irqrestore(io_lock, flags); + + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + +wq_copy_cleanup_scsi_cmd: + sc->result = DID_NO_CONNECT << 16; + 
FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, "wq_copy_cleanup_handler:" + " DID_NO_CONNECT\n"); + + if (sc->scsi_done) + sc->scsi_done(sc); +} + +static inline int fnic_queue_abort_io_req(struct fnic *fnic, int tag, + u32 task_req, u8 *fc_lun, + struct fnic_io_req *io_req) +{ + struct vnic_wq_copy *wq = &fnic->wq_copy[0]; + unsigned long flags; + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + + if (vnic_wq_copy_desc_avail(wq) <= fnic->wq_copy_desc_low[0]) + free_wq_copy_descs(fnic, wq); + + if (!vnic_wq_copy_desc_avail(wq)) { + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], flags); + return 1; + } + fnic_queue_wq_copy_desc_itmf(wq, tag | FNIC_TAG_ABORT, + 0, task_req, tag, fc_lun, io_req->port_id, + fnic->config.ra_tov, fnic->config.ed_tov); + + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], flags); + return 0; +} + +void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id) +{ + int tag; + struct fnic_io_req *io_req; + spinlock_t *io_lock; + unsigned long flags; + struct scsi_cmnd *sc; + struct scsi_lun fc_lun; + enum fnic_ioreq_state old_ioreq_state; + + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "fnic_rport_reset_exch called portid 0x%06x\n", + port_id); + + if (fnic->in_remove) + return; + + for (tag = 0; tag < FNIC_MAX_IO_REQ; tag++) { + sc = scsi_host_find_tag(fnic->lport->host, tag); + if (!sc) + continue; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + + io_req = (struct fnic_io_req *)CMD_SP(sc); + + if (!io_req || io_req->port_id != port_id) { + spin_unlock_irqrestore(io_lock, flags); + continue; + } + + /* + * Found IO that is still pending with firmware and + * belongs to rport that went away + */ + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + continue; + } + old_ioreq_state = CMD_STATE(sc); + CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; + CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; + + BUG_ON(io_req->abts_done); + + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + 
"fnic_rport_reset_exch: Issuing abts\n"); + + spin_unlock_irqrestore(io_lock, flags); + + /* Now queue the abort command to firmware */ + int_to_scsilun(sc->device->lun, &fc_lun); + + if (fnic_queue_abort_io_req(fnic, tag, + FCPIO_ITMF_ABT_TASK_TERM, + fc_lun.scsi_lun, io_req)) { + /* + * Revert the cmd state back to old state, if + * it hasnt changed in between. This cmd will get + * aborted later by scsi_eh, or cleaned up during + * lun reset + */ + io_lock = fnic_io_lock_hash(fnic, sc); + + spin_lock_irqsave(io_lock, flags); + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) + CMD_STATE(sc) = old_ioreq_state; + spin_unlock_irqrestore(io_lock, flags); + } + } + +} + +void fnic_terminate_rport_io(struct fc_rport *rport) +{ + int tag; + struct fnic_io_req *io_req; + spinlock_t *io_lock; + unsigned long flags; + struct scsi_cmnd *sc; + struct scsi_lun fc_lun; + struct fc_rport_libfc_priv *rdata = rport->dd_data; + struct fc_lport *lport = rdata->local_port; + struct fnic *fnic = lport_priv(lport); + struct fc_rport *cmd_rport; + enum fnic_ioreq_state old_ioreq_state; + + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, "fnic_terminate_rport_io called" + " wwpn 0x%llx, wwnn0x%llx, portid 0x%06x\n", + rport->port_name, rport->node_name, + rport->port_id); + + if (fnic->in_remove) + return; + + for (tag = 0; tag < FNIC_MAX_IO_REQ; tag++) { + sc = scsi_host_find_tag(fnic->lport->host, tag); + if (!sc) + continue; + + cmd_rport = starget_to_rport(scsi_target(sc->device)); + if (rport != cmd_rport) + continue; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + + io_req = (struct fnic_io_req *)CMD_SP(sc); + + if (!io_req || rport != cmd_rport) { + spin_unlock_irqrestore(io_lock, flags); + continue; + } + + /* + * Found IO that is still pending with firmware and + * belongs to rport that went away + */ + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + continue; + } + old_ioreq_state = CMD_STATE(sc); + 
CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; + CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; + + BUG_ON(io_req->abts_done); + + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "fnic_terminate_rport_io: Issuing abts\n"); + + spin_unlock_irqrestore(io_lock, flags); + + /* Now queue the abort command to firmware */ + int_to_scsilun(sc->device->lun, &fc_lun); + + if (fnic_queue_abort_io_req(fnic, tag, + FCPIO_ITMF_ABT_TASK_TERM, + fc_lun.scsi_lun, io_req)) { + /* + * Revert the cmd state back to old state, if + * it hasnt changed in between. This cmd will get + * aborted later by scsi_eh, or cleaned up during + * lun reset + */ + io_lock = fnic_io_lock_hash(fnic, sc); + + spin_lock_irqsave(io_lock, flags); + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) + CMD_STATE(sc) = old_ioreq_state; + spin_unlock_irqrestore(io_lock, flags); + } + } + +} + +static void fnic_block_error_handler(struct scsi_cmnd *sc) +{ + struct Scsi_Host *shost = sc->device->host; + struct fc_rport *rport = starget_to_rport(scsi_target(sc->device)); + unsigned long flags; + + spin_lock_irqsave(shost->host_lock, flags); + while (rport->port_state == FC_PORTSTATE_BLOCKED) { + spin_unlock_irqrestore(shost->host_lock, flags); + msleep(1000); + spin_lock_irqsave(shost->host_lock, flags); + } + spin_unlock_irqrestore(shost->host_lock, flags); + +} + +/* + * This function is exported to SCSI for sending abort cmnds. + * A SCSI IO is represented by a io_req in the driver. + * The ioreq is linked to the SCSI Cmd, thus a link with the ULP's IO. 
+ */ +int fnic_abort_cmd(struct scsi_cmnd *sc) +{ + struct fc_lport *lp; + struct fnic *fnic; + struct fnic_io_req *io_req; + struct fc_rport *rport; + spinlock_t *io_lock; + unsigned long flags; + int ret = SUCCESS; + u32 task_req; + struct scsi_lun fc_lun; + DECLARE_COMPLETION_ONSTACK(tm_done); + + /* Wait for rport to unblock */ + fnic_block_error_handler(sc); + + /* Get local-port, check ready and link up */ + lp = shost_priv(sc->device->host); + + fnic = lport_priv(lp); + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "Abort Cmd called FCID 0x%x, LUN 0x%x TAG %d\n", + (starget_to_rport(scsi_target(sc->device)))->port_id, + sc->device->lun, sc->request->tag); + + if (lp->state != LPORT_ST_READY || !(lp->link_up)) { + ret = FAILED; + goto fnic_abort_cmd_end; + } + + /* + * Avoid a race between SCSI issuing the abort and the device + * completing the command. + * + * If the command is already completed by the fw cmpl code, + * we just return SUCCESS from here. This means that the abort + * succeeded. In the SCSI ML, since the timeout for command has + * happened, the completion wont actually complete the command + * and it will be considered as an aborted command + * + * The CMD_SP will not be cleared except while holding io_req_lock. + */ + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + goto fnic_abort_cmd_end; + } + + io_req->abts_done = &tm_done; + + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + goto wait_pending; + } + /* + * Command is still pending, need to abort it + * If the firmware completes the command after this point, + * the completion wont be done till mid-layer, since abort + * has already started. 
+ */ + CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; + CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; + + spin_unlock_irqrestore(io_lock, flags); + + /* + * Check readiness of the remote port. If the path to remote + * port is up, then send abts to the remote port to terminate + * the IO. Else, just locally terminate the IO in the firmware + */ + rport = starget_to_rport(scsi_target(sc->device)); + if (fc_remote_port_chkready(rport) == 0) + task_req = FCPIO_ITMF_ABT_TASK; + else + task_req = FCPIO_ITMF_ABT_TASK_TERM; + + /* Now queue the abort command to firmware */ + int_to_scsilun(sc->device->lun, &fc_lun); + + if (fnic_queue_abort_io_req(fnic, sc->request->tag, task_req, + fc_lun.scsi_lun, io_req)) { + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (io_req) + io_req->abts_done = NULL; + spin_unlock_irqrestore(io_lock, flags); + ret = FAILED; + goto fnic_abort_cmd_end; + } + + /* + * We queued an abort IO, wait for its completion. + * Once the firmware completes the abort command, it will + * wake up this thread. 
+ */ + wait_pending: + wait_for_completion_timeout(&tm_done, + msecs_to_jiffies + (2 * fnic->config.ra_tov + + fnic->config.ed_tov)); + + /* Check the abort status */ + spin_lock_irqsave(io_lock, flags); + + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + ret = FAILED; + goto fnic_abort_cmd_end; + } + io_req->abts_done = NULL; + + /* fw did not complete abort, timed out */ + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + ret = FAILED; + goto fnic_abort_cmd_end; + } + + /* + * firmware completed the abort, check the status, + * free the io_req irrespective of failure or success + */ + if (CMD_ABTS_STATUS(sc) != FCPIO_SUCCESS) + ret = FAILED; + + CMD_SP(sc) = NULL; + + spin_unlock_irqrestore(io_lock, flags); + + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + +fnic_abort_cmd_end: + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Returning from abort cmd %s\n", + (ret == SUCCESS) ? 
+ "SUCCESS" : "FAILED"); + return ret; +} + +static inline int fnic_queue_dr_io_req(struct fnic *fnic, + struct scsi_cmnd *sc, + struct fnic_io_req *io_req) +{ + struct vnic_wq_copy *wq = &fnic->wq_copy[0]; + struct scsi_lun fc_lun; + int ret = 0; + unsigned long intr_flags; + + spin_lock_irqsave(&fnic->wq_copy_lock[0], intr_flags); + + if (vnic_wq_copy_desc_avail(wq) <= fnic->wq_copy_desc_low[0]) + free_wq_copy_descs(fnic, wq); + + if (!vnic_wq_copy_desc_avail(wq)) { + ret = -EAGAIN; + goto lr_io_req_end; + } + + /* fill in the lun info */ + int_to_scsilun(sc->device->lun, &fc_lun); + + fnic_queue_wq_copy_desc_itmf(wq, sc->request->tag | FNIC_TAG_DEV_RST, + 0, FCPIO_ITMF_LUN_RESET, SCSI_NO_TAG, + fc_lun.scsi_lun, io_req->port_id, + fnic->config.ra_tov, fnic->config.ed_tov); + +lr_io_req_end: + spin_unlock_irqrestore(&fnic->wq_copy_lock[0], intr_flags); + + return ret; +} + +/* + * Clean up any pending aborts on the lun + * For each outstanding IO on this lun, whose abort is not completed by fw, + * issue a local abort. Wait for abort to complete. 
Return 0 if all commands + * successfully aborted, 1 otherwise + */ +static int fnic_clean_pending_aborts(struct fnic *fnic, + struct scsi_cmnd *lr_sc) +{ + int tag; + struct fnic_io_req *io_req; + spinlock_t *io_lock; + unsigned long flags; + int ret = 0; + struct scsi_cmnd *sc; + struct fc_rport *rport; + struct scsi_lun fc_lun; + struct scsi_device *lun_dev = lr_sc->device; + DECLARE_COMPLETION_ONSTACK(tm_done); + + for (tag = 0; tag < FNIC_MAX_IO_REQ; tag++) { + sc = scsi_host_find_tag(fnic->lport->host, tag); + /* + * ignore this lun reset cmd or cmds that do not belong to + * this lun + */ + if (!sc || sc == lr_sc || sc->device != lun_dev) + continue; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + + io_req = (struct fnic_io_req *)CMD_SP(sc); + + if (!io_req || sc->device != lun_dev) { + spin_unlock_irqrestore(io_lock, flags); + continue; + } + + /* + * Found IO that is still pending with firmware and + * belongs to the LUN that we are resetting + */ + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Found IO in %s on lun\n", + fnic_ioreq_state_to_str(CMD_STATE(sc))); + + BUG_ON(CMD_STATE(sc) != FNIC_IOREQ_ABTS_PENDING); + + CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; + io_req->abts_done = &tm_done; + spin_unlock_irqrestore(io_lock, flags); + + /* Now queue the abort command to firmware */ + int_to_scsilun(sc->device->lun, &fc_lun); + rport = starget_to_rport(scsi_target(sc->device)); + + if (fnic_queue_abort_io_req(fnic, tag, + FCPIO_ITMF_ABT_TASK_TERM, + fc_lun.scsi_lun, io_req)) { + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (io_req) + io_req->abts_done = NULL; + spin_unlock_irqrestore(io_lock, flags); + ret = 1; + goto clean_pending_aborts_end; + } + + wait_for_completion_timeout(&tm_done, + msecs_to_jiffies + (fnic->config.ed_tov)); + + /* Recheck cmd state to check if it is now aborted */ + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if 
(!io_req) { + spin_unlock_irqrestore(io_lock, flags); + ret = 1; + goto clean_pending_aborts_end; + } + + io_req->abts_done = NULL; + + /* if abort is still pending with fw, fail */ + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + ret = 1; + goto clean_pending_aborts_end; + } + CMD_SP(sc) = NULL; + spin_unlock_irqrestore(io_lock, flags); + + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + } + +clean_pending_aborts_end: + return ret; +} + +/* + * SCSI Eh thread issues a Lun Reset when one or more commands on a LUN + * fail to get aborted. It calls driver's eh_device_reset with a SCSI command + * on the LUN. + */ +int fnic_device_reset(struct scsi_cmnd *sc) +{ + struct fc_lport *lp; + struct fnic *fnic; + struct fnic_io_req *io_req; + struct fc_rport *rport; + int status; + int ret = FAILED; + spinlock_t *io_lock; + unsigned long flags; + DECLARE_COMPLETION_ONSTACK(tm_done); + + /* Wait for rport to unblock */ + fnic_block_error_handler(sc); + + /* Get local-port, check ready and link up */ + lp = shost_priv(sc->device->host); + + fnic = lport_priv(lp); + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "Device reset called FCID 0x%x, LUN 0x%x\n", + (starget_to_rport(scsi_target(sc->device)))->port_id, + sc->device->lun); + + + if (lp->state != LPORT_ST_READY || !(lp->link_up)) + goto fnic_device_reset_end; + + /* Check if remote port up */ + rport = starget_to_rport(scsi_target(sc->device)); + if (fc_remote_port_chkready(rport)) + goto fnic_device_reset_end; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + + /* + * If there is a io_req attached to this command, then use it, + * else allocate a new one. 
+ */ + if (!io_req) { + io_req = mempool_alloc(fnic->io_req_pool, GFP_ATOMIC); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + goto fnic_device_reset_end; + } + memset(io_req, 0, sizeof(*io_req)); + io_req->port_id = rport->port_id; + CMD_SP(sc) = (char *)io_req; + } + io_req->dr_done = &tm_done; + CMD_STATE(sc) = FNIC_IOREQ_CMD_PENDING; + CMD_LR_STATUS(sc) = FCPIO_INVALID_CODE; + spin_unlock_irqrestore(io_lock, flags); + + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, "TAG %d\n", + sc->request->tag); + + /* + * issue the device reset, if enqueue failed, clean up the ioreq + * and break assoc with scsi cmd + */ + if (fnic_queue_dr_io_req(fnic, sc, io_req)) { + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (io_req) + io_req->dr_done = NULL; + goto fnic_device_reset_clean; + } + + /* + * Wait on the local completion for LUN reset. The io_req may be + * freed while we wait since we hold no lock. + */ + wait_for_completion_timeout(&tm_done, + msecs_to_jiffies(FNIC_LUN_RESET_TIMEOUT)); + + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + goto fnic_device_reset_end; + } + io_req->dr_done = NULL; + + status = CMD_LR_STATUS(sc); + spin_unlock_irqrestore(io_lock, flags); + + /* + * If lun reset not completed, bail out with failed. io_req + * gets cleaned up during higher levels of EH + */ + if (status == FCPIO_INVALID_CODE) { + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Device reset timed out\n"); + goto fnic_device_reset_end; + } + + /* Completed, but not successful, clean up the io_req, return fail */ + if (status != FCPIO_SUCCESS) { + spin_lock_irqsave(io_lock, flags); + FNIC_SCSI_DBG(KERN_DEBUG, + fnic->lport->host, + "Device reset completed - failed\n"); + io_req = (struct fnic_io_req *)CMD_SP(sc); + goto fnic_device_reset_clean; + } + + /* + * Clean up any aborts on this lun that have still not + * completed. 
If any of these fail, then LUN reset fails. + * clean_pending_aborts cleans all cmds on this lun except + * the lun reset cmd. If all cmds get cleaned, the lun reset + * succeeds + */ + if (fnic_clean_pending_aborts(fnic, sc)) { + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Device reset failed" + " since could not abort all IOs\n"); + goto fnic_device_reset_clean; + } + + /* Clean lun reset command */ + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (io_req) + /* Completed, and successful */ + ret = SUCCESS; + +fnic_device_reset_clean: + if (io_req) + CMD_SP(sc) = NULL; + + spin_unlock_irqrestore(io_lock, flags); + + if (io_req) { + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + } + +fnic_device_reset_end: + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Returning from device reset %s\n", + (ret == SUCCESS) ? + "SUCCESS" : "FAILED"); + return ret; +} + +/* Clean up all IOs, clean up libFC local port */ +int fnic_reset(struct Scsi_Host *shost) +{ + struct fc_lport *lp; + struct fnic *fnic; + int ret = SUCCESS; + + lp = shost_priv(shost); + fnic = lport_priv(lp); + + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "fnic_reset called\n"); + + /* + * Reset local port, this will clean up libFC exchanges, + * reset remote port sessions, and if link is up, begin flogi + */ + if (lp->tt.lport_reset(lp)) + ret = FAILED; + + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Returning from fnic reset %s\n", + (ret == SUCCESS) ? + "SUCCESS" : "FAILED"); + + return ret; +} + +/* + * SCSI Error handling calls driver's eh_host_reset if all prior + * error handling levels return FAILED. If host reset completes + * successfully, and if link is up, then Fabric login begins. + * + * Host Reset is the highest level of error recovery. If this fails, then + * host is offlined by SCSI. 
+ * + */ +int fnic_host_reset(struct scsi_cmnd *sc) +{ + int ret; + unsigned long wait_host_tmo; + struct Scsi_Host *shost = sc->device->host; + struct fc_lport *lp = shost_priv(shost); + + /* + * If fnic_reset is successful, wait for fabric login to complete + * scsi-ml tries to send a TUR to every device if host reset is + * successful, so before returning to scsi, fabric should be up + */ + ret = fnic_reset(shost); + if (ret == SUCCESS) { + wait_host_tmo = jiffies + FNIC_HOST_RESET_SETTLE_TIME * HZ; + ret = FAILED; + while (time_before(jiffies, wait_host_tmo)) { + if ((lp->state == LPORT_ST_READY) && + (lp->link_up)) { + ret = SUCCESS; + break; + } + ssleep(1); + } + } + + return ret; +} + +/* + * This fxn is called from libFC when host is removed + */ +void fnic_scsi_abort_io(struct fc_lport *lp) +{ + int err = 0; + unsigned long flags; + enum fnic_state old_state; + struct fnic *fnic = lport_priv(lp); + DECLARE_COMPLETION_ONSTACK(remove_wait); + + /* Issue firmware reset for fnic, wait for reset to complete */ + spin_lock_irqsave(&fnic->fnic_lock, flags); + fnic->remove_wait = &remove_wait; + old_state = fnic->state; + fnic->state = FNIC_IN_FC_TRANS_ETH_MODE; + vnic_dev_del_addr(fnic->vdev, fnic->data_src_addr); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + err = fnic_fw_reset_handler(fnic); + if (err) { + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->state == FNIC_IN_FC_TRANS_ETH_MODE) + fnic->state = old_state; + fnic->remove_wait = NULL; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + return; + } + + /* Wait for firmware reset to complete */ + wait_for_completion_timeout(&remove_wait, + msecs_to_jiffies(FNIC_RMDEVICE_TIMEOUT)); + + spin_lock_irqsave(&fnic->fnic_lock, flags); + fnic->remove_wait = NULL; + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "fnic_scsi_abort_io %s\n", + (fnic->state == FNIC_IN_ETH_MODE) ? 
+ "SUCCESS" : "FAILED"); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + +} + +/* + * This fxn called from libFC to clean up driver IO state on link down + */ +void fnic_scsi_cleanup(struct fc_lport *lp) +{ + unsigned long flags; + enum fnic_state old_state; + struct fnic *fnic = lport_priv(lp); + + /* issue fw reset */ + spin_lock_irqsave(&fnic->fnic_lock, flags); + old_state = fnic->state; + fnic->state = FNIC_IN_FC_TRANS_ETH_MODE; + vnic_dev_del_addr(fnic->vdev, fnic->data_src_addr); + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + + if (fnic_fw_reset_handler(fnic)) { + spin_lock_irqsave(&fnic->fnic_lock, flags); + if (fnic->state == FNIC_IN_FC_TRANS_ETH_MODE) + fnic->state = old_state; + spin_unlock_irqrestore(&fnic->fnic_lock, flags); + } + +} + +void fnic_empty_scsi_cleanup(struct fc_lport *lp) +{ +} + +void fnic_exch_mgr_reset(struct fc_lport *lp, u32 sid, u32 did) +{ + struct fnic *fnic = lport_priv(lp); + + /* Non-zero sid, nothing to do */ + if (sid) + goto call_fc_exch_mgr_reset; + + if (did) { + fnic_rport_exch_reset(fnic, did); + goto call_fc_exch_mgr_reset; + } + + /* + * sid = 0, did = 0 + * link down or device being removed + */ + if (!fnic->in_remove) + fnic_scsi_cleanup(lp); + else + fnic_scsi_abort_io(lp); + + /* call libFC exch mgr reset to reset its exchanges */ +call_fc_exch_mgr_reset: + fc_exch_mgr_reset(lp, sid, did); + +} diff --git a/drivers/scsi/fnic/rq_enet_desc.h b/drivers/scsi/fnic/rq_enet_desc.h new file mode 100644 index 00000000000..92e80ae6b72 --- /dev/null +++ b/drivers/scsi/fnic/rq_enet_desc.h @@ -0,0 +1,58 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _RQ_ENET_DESC_H_ +#define _RQ_ENET_DESC_H_ + +/* Ethernet receive queue descriptor: 16B */ +struct rq_enet_desc { + __le64 address; + __le16 length_type; + u8 reserved[6]; +}; + +enum rq_enet_type_types { + RQ_ENET_TYPE_ONLY_SOP = 0, + RQ_ENET_TYPE_NOT_SOP = 1, + RQ_ENET_TYPE_RESV2 = 2, + RQ_ENET_TYPE_RESV3 = 3, +}; + +#define RQ_ENET_ADDR_BITS 64 +#define RQ_ENET_LEN_BITS 14 +#define RQ_ENET_LEN_MASK ((1 << RQ_ENET_LEN_BITS) - 1) +#define RQ_ENET_TYPE_BITS 2 +#define RQ_ENET_TYPE_MASK ((1 << RQ_ENET_TYPE_BITS) - 1) + +static inline void rq_enet_desc_enc(struct rq_enet_desc *desc, + u64 address, u8 type, u16 length) +{ + desc->address = cpu_to_le64(address); + desc->length_type = cpu_to_le16((length & RQ_ENET_LEN_MASK) | + ((type & RQ_ENET_TYPE_MASK) << RQ_ENET_LEN_BITS)); +} + +static inline void rq_enet_desc_dec(struct rq_enet_desc *desc, + u64 *address, u8 *type, u16 *length) +{ + *address = le64_to_cpu(desc->address); + *length = le16_to_cpu(desc->length_type) & RQ_ENET_LEN_MASK; + *type = (u8)((le16_to_cpu(desc->length_type) >> RQ_ENET_LEN_BITS) & + RQ_ENET_TYPE_MASK); +} + +#endif /* _RQ_ENET_DESC_H_ */ diff --git a/drivers/scsi/fnic/vnic_cq.c b/drivers/scsi/fnic/vnic_cq.c new file mode 100644 index 00000000000..c5db32eda5e --- /dev/null +++ b/drivers/scsi/fnic/vnic_cq.c @@ -0,0 +1,85 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. 
+ * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include "vnic_dev.h" +#include "vnic_cq.h" + +void vnic_cq_free(struct vnic_cq *cq) +{ + vnic_dev_free_desc_ring(cq->vdev, &cq->ring); + + cq->ctrl = NULL; +} + +int vnic_cq_alloc(struct vnic_dev *vdev, struct vnic_cq *cq, unsigned int index, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + cq->index = index; + cq->vdev = vdev; + + cq->ctrl = vnic_dev_get_res(vdev, RES_TYPE_CQ, index); + if (!cq->ctrl) { + printk(KERN_ERR "Failed to hook CQ[%d] resource\n", index); + return -EINVAL; + } + + err = vnic_dev_alloc_desc_ring(vdev, &cq->ring, desc_count, desc_size); + if (err) + return err; + + return 0; +} + +void vnic_cq_init(struct vnic_cq *cq, unsigned int flow_control_enable, + unsigned int color_enable, unsigned int cq_head, unsigned int cq_tail, + unsigned int cq_tail_color, unsigned int interrupt_enable, + unsigned int cq_entry_enable, unsigned int cq_message_enable, + unsigned int interrupt_offset, u64 cq_message_addr) +{ + u64 paddr; + + paddr = (u64)cq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &cq->ctrl->ring_base); + iowrite32(cq->ring.desc_count, &cq->ctrl->ring_size); + iowrite32(flow_control_enable, &cq->ctrl->flow_control_enable); + iowrite32(color_enable, &cq->ctrl->color_enable); + 
iowrite32(cq_head, &cq->ctrl->cq_head); + iowrite32(cq_tail, &cq->ctrl->cq_tail); + iowrite32(cq_tail_color, &cq->ctrl->cq_tail_color); + iowrite32(interrupt_enable, &cq->ctrl->interrupt_enable); + iowrite32(cq_entry_enable, &cq->ctrl->cq_entry_enable); + iowrite32(cq_message_enable, &cq->ctrl->cq_message_enable); + iowrite32(interrupt_offset, &cq->ctrl->interrupt_offset); + writeq(cq_message_addr, &cq->ctrl->cq_message_addr); +} + +void vnic_cq_clean(struct vnic_cq *cq) +{ + cq->to_clean = 0; + cq->last_color = 0; + + iowrite32(0, &cq->ctrl->cq_head); + iowrite32(0, &cq->ctrl->cq_tail); + iowrite32(1, &cq->ctrl->cq_tail_color); + + vnic_dev_clear_desc_ring(&cq->ring); +} diff --git a/drivers/scsi/fnic/vnic_cq.h b/drivers/scsi/fnic/vnic_cq.h new file mode 100644 index 00000000000..4ede6809fb1 --- /dev/null +++ b/drivers/scsi/fnic/vnic_cq.h @@ -0,0 +1,121 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _VNIC_CQ_H_ +#define _VNIC_CQ_H_ + +#include "cq_desc.h" +#include "vnic_dev.h" + +/* + * These defines avoid symbol clash between fnic and enic (Cisco 10G Eth + * Driver) when both are built with CONFIG options =y + */ +#define vnic_cq_service fnic_cq_service +#define vnic_cq_free fnic_cq_free +#define vnic_cq_alloc fnic_cq_alloc +#define vnic_cq_init fnic_cq_init +#define vnic_cq_clean fnic_cq_clean + +/* Completion queue control */ +struct vnic_cq_ctrl { + u64 ring_base; /* 0x00 */ + u32 ring_size; /* 0x08 */ + u32 pad0; + u32 flow_control_enable; /* 0x10 */ + u32 pad1; + u32 color_enable; /* 0x18 */ + u32 pad2; + u32 cq_head; /* 0x20 */ + u32 pad3; + u32 cq_tail; /* 0x28 */ + u32 pad4; + u32 cq_tail_color; /* 0x30 */ + u32 pad5; + u32 interrupt_enable; /* 0x38 */ + u32 pad6; + u32 cq_entry_enable; /* 0x40 */ + u32 pad7; + u32 cq_message_enable; /* 0x48 */ + u32 pad8; + u32 interrupt_offset; /* 0x50 */ + u32 pad9; + u64 cq_message_addr; /* 0x58 */ + u32 pad10; +}; + +struct vnic_cq { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_cq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + unsigned int to_clean; + unsigned int last_color; +}; + +static inline unsigned int vnic_cq_service(struct vnic_cq *cq, + unsigned int work_to_do, + int (*q_service)(struct vnic_dev *vdev, struct cq_desc *cq_desc, + u8 type, u16 q_number, u16 completed_index, void *opaque), + void *opaque) +{ + struct cq_desc *cq_desc; + unsigned int work_done = 0; + u16 q_number, completed_index; + u8 type, color; + + cq_desc = (struct cq_desc *)((u8 *)cq->ring.descs + + cq->ring.desc_size * cq->to_clean); + cq_desc_dec(cq_desc, &type, &color, + &q_number, &completed_index); + + while (color != cq->last_color) { + + if ((*q_service)(cq->vdev, cq_desc, type, + q_number, completed_index, opaque)) + break; + + cq->to_clean++; + if (cq->to_clean == cq->ring.desc_count) { + cq->to_clean = 0; + cq->last_color = cq->last_color ? 
0 : 1; + } + + cq_desc = (struct cq_desc *)((u8 *)cq->ring.descs + + cq->ring.desc_size * cq->to_clean); + cq_desc_dec(cq_desc, &type, &color, + &q_number, &completed_index); + + work_done++; + if (work_done >= work_to_do) + break; + } + + return work_done; +} + +void vnic_cq_free(struct vnic_cq *cq); +int vnic_cq_alloc(struct vnic_dev *vdev, struct vnic_cq *cq, unsigned int index, + unsigned int desc_count, unsigned int desc_size); +void vnic_cq_init(struct vnic_cq *cq, unsigned int flow_control_enable, + unsigned int color_enable, unsigned int cq_head, unsigned int cq_tail, + unsigned int cq_tail_color, unsigned int interrupt_enable, + unsigned int cq_entry_enable, unsigned int message_enable, + unsigned int interrupt_offset, u64 message_addr); +void vnic_cq_clean(struct vnic_cq *cq); + +#endif /* _VNIC_CQ_H_ */ diff --git a/drivers/scsi/fnic/vnic_cq_copy.h b/drivers/scsi/fnic/vnic_cq_copy.h new file mode 100644 index 00000000000..7901ce255a8 --- /dev/null +++ b/drivers/scsi/fnic/vnic_cq_copy.h @@ -0,0 +1,62 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _VNIC_CQ_COPY_H_ +#define _VNIC_CQ_COPY_H_ + +#include "fcpio.h" + +static inline unsigned int vnic_cq_copy_service( + struct vnic_cq *cq, + int (*q_service)(struct vnic_dev *vdev, + unsigned int index, + struct fcpio_fw_req *desc), + unsigned int work_to_do) + +{ + struct fcpio_fw_req *desc; + unsigned int work_done = 0; + u8 color; + + desc = (struct fcpio_fw_req *)((u8 *)cq->ring.descs + + cq->ring.desc_size * cq->to_clean); + fcpio_color_dec(desc, &color); + + while (color != cq->last_color) { + + if ((*q_service)(cq->vdev, cq->index, desc)) + break; + + cq->to_clean++; + if (cq->to_clean == cq->ring.desc_count) { + cq->to_clean = 0; + cq->last_color = cq->last_color ? 0 : 1; + } + + desc = (struct fcpio_fw_req *)((u8 *)cq->ring.descs + + cq->ring.desc_size * cq->to_clean); + fcpio_color_dec(desc, &color); + + work_done++; + if (work_done >= work_to_do) + break; + } + + return work_done; +} + +#endif /* _VNIC_CQ_COPY_H_ */ diff --git a/drivers/scsi/fnic/vnic_dev.c b/drivers/scsi/fnic/vnic_dev.c new file mode 100644 index 00000000000..56677064508 --- /dev/null +++ b/drivers/scsi/fnic/vnic_dev.c @@ -0,0 +1,690 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "vnic_resource.h"
+#include "vnic_devcmd.h"
+#include "vnic_dev.h"
+#include "vnic_stats.h"
+
+/* One discovered BAR0 resource type: its mapped base and instance count */
+struct vnic_res {
+	void __iomem *vaddr;
+	unsigned int count;
+};
+
+/*
+ * Per-device state.  The notify, linkstatus, stats and fw_info buffers
+ * are DMA-coherent allocations; each *_pa member holds the bus address
+ * of the buffer declared just before it.
+ */
+struct vnic_dev {
+	void *priv;
+	struct pci_dev *pdev;
+	struct vnic_res res[RES_TYPE_MAX];
+	enum vnic_dev_intr_mode intr_mode;
+	struct vnic_devcmd __iomem *devcmd;
+	struct vnic_devcmd_notify *notify;
+	struct vnic_devcmd_notify notify_copy;
+	dma_addr_t notify_pa;
+	u32 *linkstatus;
+	dma_addr_t linkstatus_pa;
+	struct vnic_stats *stats;
+	dma_addr_t stats_pa;
+	struct vnic_devcmd_fw_info *fw_info;
+	dma_addr_t fw_info_pa;
+};
+
+#define VNIC_MAX_RES_HDR_SIZE \
+	(sizeof(struct vnic_resource_header) + \
+	sizeof(struct vnic_resource) * RES_TYPE_MAX)
+#define VNIC_RES_STRIDE	128
+
+/* Return the opaque private pointer stored by vnic_dev_register() */
+void *vnic_dev_priv(struct vnic_dev *vdev)
+{
+	return vdev->priv;
+}
+
+/*
+ * Validate the resource header at the start of BAR0, then walk the
+ * resource table that follows it and record base address and count for
+ * every BAR0 resource type handled by the switch below.
+ *
+ * Returns 0 on success, -EINVAL if the BAR is too short or unmapped,
+ * the magic/version do not match, or a strided resource does not fit
+ * inside the BAR.
+ */
+static int vnic_dev_discover_res(struct vnic_dev *vdev,
+	struct vnic_dev_bar *bar)
+{
+	struct vnic_resource_header __iomem *rh;
+	struct vnic_resource __iomem *r;
+	u8 type;
+
+	if (bar->len < VNIC_MAX_RES_HDR_SIZE) {
+		printk(KERN_ERR "vNIC BAR0 res hdr length error\n");
+		return -EINVAL;
+	}
+
+	rh = bar->vaddr;
+	if (!rh) {
+		printk(KERN_ERR "vNIC BAR0 res hdr not mem-mapped\n");
+		return -EINVAL;
+	}
+
+	if (ioread32(&rh->magic) != VNIC_RES_MAGIC ||
+	    ioread32(&rh->version) != VNIC_RES_VERSION) {
+		printk(KERN_ERR "vNIC BAR0 res magic/version error "
+			"exp (%lx/%lx) curr (%x/%x)\n",
+			VNIC_RES_MAGIC, VNIC_RES_VERSION,
+			ioread32(&rh->magic), ioread32(&rh->version));
+		return -EINVAL;
+	}
+
+	/* the resource table starts right after the header */
+	r = (struct vnic_resource __iomem *)(rh + 1);
+
+	while ((type = ioread8(&r->type)) != RES_TYPE_EOL) {
+
+		u8 bar_num = ioread8(&r->bar);
+		u32 bar_offset = ioread32(&r->bar_offset);
+		u32 count = ioread32(&r->count);
+		u32 len;
+
+		r++;
+
+		if (bar_num != 0) /* only mapping in BAR0 resources */
+			continue;
+
+		switch (type) {
+		case RES_TYPE_WQ:
+		case RES_TYPE_RQ:
+		case RES_TYPE_CQ:
+		case RES_TYPE_INTR_CTRL:
+			/* each count is stride bytes long */
+			len = count * VNIC_RES_STRIDE;
+			if (len + bar_offset > bar->len) {
+				printk(KERN_ERR "vNIC BAR0 resource %d "
+					"out-of-bounds, offset 0x%x + "
+					"size 0x%x > bar len 0x%lx\n",
+					type, bar_offset,
+					len,
+					bar->len);
+				return -EINVAL;
+			}
+			break;
+		case RES_TYPE_INTR_PBA_LEGACY:
+		case RES_TYPE_DEVCMD:
+			len = count;
+			break;
+		default:
+			/* unknown types are skipped, not recorded */
+			continue;
+		}
+
+		vdev->res[type].count = count;
+		vdev->res[type].vaddr = (char __iomem *)bar->vaddr + bar_offset;
+	}
+
+	return 0;
+}
+
+/* Number of instances recorded for @type by resource discovery */
+unsigned int vnic_dev_get_res_count(struct vnic_dev *vdev,
+	enum vnic_res_type type)
+{
+	return vdev->res[type].count;
+}
+
+/*
+ * Return the mapped address of instance @index of resource @type, or
+ * NULL if that type was not discovered.  Only WQ/RQ/CQ/INTR_CTRL are
+ * indexed (VNIC_RES_STRIDE bytes apart); for every other type @index
+ * is ignored and the base address is returned.
+ */
+void __iomem *vnic_dev_get_res(struct vnic_dev *vdev, enum vnic_res_type type,
+	unsigned int index)
+{
+	if (!vdev->res[type].vaddr)
+		return NULL;
+
+	switch (type) {
+	case RES_TYPE_WQ:
+	case RES_TYPE_RQ:
+	case RES_TYPE_CQ:
+	case RES_TYPE_INTR_CTRL:
+		return (char __iomem *)vdev->res[type].vaddr +
+			index * VNIC_RES_STRIDE;
+	default:
+		return (char __iomem *)vdev->res[type].vaddr;
+	}
+}
+
+/*
+ * Fill in the size/alignment members of @ring for the requested
+ * geometry and return the number of bytes to allocate (ring size plus
+ * slack for aligning the base).
+ */
+unsigned int vnic_dev_desc_ring_size(struct vnic_dev_ring *ring,
+	unsigned int desc_count,
+	unsigned int desc_size)
+{
+	/* The base address of the desc rings must be 512 byte aligned.
+	 * Descriptor count is aligned to groups of 32 descriptors.  A
+	 * count of 0 means the maximum 4096 descriptors.  Descriptor
+	 * size is aligned to 16 bytes.
+	 */
+
+	unsigned int count_align = 32;
+	unsigned int desc_align = 16;
+
+	ring->base_align = 512;
+
+	if (desc_count == 0)
+		desc_count = 4096;
+
+	ring->desc_count = ALIGN(desc_count, count_align);
+
+	ring->desc_size = ALIGN(desc_size, desc_align);
+
+	ring->size = ring->desc_count * ring->desc_size;
+	ring->size_unaligned = ring->size + ring->base_align;
+
+	return ring->size_unaligned;
+}
+
+/* Zero the aligned descriptor area of @ring */
+void vnic_dev_clear_desc_ring(struct vnic_dev_ring *ring)
+{
+	memset(ring->descs, 0, ring->size);
+}
+
+/*
+ * Allocate a DMA-coherent descriptor ring, align its base to
+ * ring->base_align and zero it.  desc_avail starts at desc_count - 1.
+ * Returns 0 on success or -ENOMEM.
+ */
+int vnic_dev_alloc_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring,
+	unsigned int desc_count, unsigned int desc_size)
+{
+	vnic_dev_desc_ring_size(ring, desc_count, desc_size);
+
+	ring->descs_unaligned = pci_alloc_consistent(vdev->pdev,
+		ring->size_unaligned,
+		&ring->base_addr_unaligned);
+
+	if (!ring->descs_unaligned) {
+		printk(KERN_ERR
+		  "Failed to allocate ring (size=%d), aborting\n",
+			(int)ring->size);
+		return -ENOMEM;
+	}
+
+	/* apply the same offset to the CPU pointer as to the bus address */
+	ring->base_addr = ALIGN(ring->base_addr_unaligned,
+		ring->base_align);
+	ring->descs = (u8 *)ring->descs_unaligned +
+		(ring->base_addr - ring->base_addr_unaligned);
+
+	vnic_dev_clear_desc_ring(ring);
+
+	ring->desc_avail = ring->desc_count - 1;
+
+	return 0;
+}
+
+/* Release a ring from vnic_dev_alloc_desc_ring(); safe to call twice */
+void vnic_dev_free_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring)
+{
+	if (ring->descs) {
+		pci_free_consistent(vdev->pdev,
+			ring->size_unaligned,
+			ring->descs_unaligned,
+			ring->base_addr_unaligned);
+		ring->descs = NULL;
+	}
+}
+
+/*
+ * Issue devcmd @cmd to the firmware.
+ *
+ * For commands with the WRITE direction bit, *a0 and *a1 are written to
+ * the argument registers first.  Commands flagged NOWAIT return 0 as
+ * soon as the command register is written.  Otherwise the status
+ * register is polled up to @wait times, 100us apart; on completion of
+ * a command with the READ direction bit, *a0 and *a1 are updated from
+ * the argument registers.
+ *
+ * Returns 0 on success, -EBUSY if a command is already in flight,
+ * -ETIMEDOUT if the command did not complete, or the negated host errno
+ * for the firmware error code (-EIO for codes without a mapping).
+ */
+int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
+	u64 *a0, u64 *a1, int wait)
+{
+	struct vnic_devcmd __iomem *devcmd = vdev->devcmd;
+	int delay;
+	u32 status;
+	static const int dev_cmd_err[] = {
+		/* convert from fw's version of error.h to host's version */
+		0,	/* ERR_SUCCESS */
+		EINVAL,	/* ERR_EINVAL */
+		EFAULT,	/* ERR_EFAULT */
+		EPERM,	/* ERR_EPERM */
+		EBUSY,	/* ERR_EBUSY */
+	};
+	u64 fw_err;
+	int err;
+
+	status = ioread32(&devcmd->status);
+	if (status & STAT_BUSY) {
+		printk(KERN_ERR "Busy devcmd %d\n", _CMD_N(cmd));
+		return -EBUSY;
+	}
+
+	if (_CMD_DIR(cmd) & _CMD_DIR_WRITE) {
+		writeq(*a0, &devcmd->args[0]);
+		writeq(*a1, &devcmd->args[1]);
+		wmb();
+	}
+
+	iowrite32(cmd, &devcmd->cmd);
+
+	if ((_CMD_FLAGS(cmd) & _CMD_FLAGS_NOWAIT))
+		return 0;
+
+	for (delay = 0; delay < wait; delay++) {
+
+		udelay(100);
+
+		status = ioread32(&devcmd->status);
+		if (!(status & STAT_BUSY)) {
+
+			if (status & STAT_ERROR) {
+				/*
+				 * enum vnic_devcmd_error defines codes
+				 * beyond the end of dev_cmd_err[]; never
+				 * index the table with an unchecked
+				 * value read from the device.
+				 */
+				fw_err = readq(&devcmd->args[0]);
+				if (fw_err < ARRAY_SIZE(dev_cmd_err))
+					err = dev_cmd_err[fw_err];
+				else
+					err = EIO;
+				printk(KERN_ERR "Error %d devcmd %d\n",
+					err, _CMD_N(cmd));
+				return -err;
+			}
+
+			if (_CMD_DIR(cmd) & _CMD_DIR_READ) {
+				rmb();
+				*a0 = readq(&devcmd->args[0]);
+				*a1 = readq(&devcmd->args[1]);
+			}
+
+			return 0;
+		}
+	}
+
+	printk(KERN_ERR "Timedout devcmd %d\n", _CMD_N(cmd));
+	return -ETIMEDOUT;
+}
+
+/*
+ * Return the firmware info block in *fw_info.  The DMA buffer is
+ * allocated and the firmware queried on the first call only; later
+ * calls hand back the cached buffer and return 0.
+ */
+int vnic_dev_fw_info(struct vnic_dev *vdev,
+	struct vnic_devcmd_fw_info **fw_info)
+{
+	u64 a0, a1 = 0;
+	int wait = 1000;
+	int err = 0;
+
+	if (!vdev->fw_info) {
+		vdev->fw_info = pci_alloc_consistent(vdev->pdev,
+			sizeof(struct vnic_devcmd_fw_info),
+			&vdev->fw_info_pa);
+		if (!vdev->fw_info)
+			return -ENOMEM;
+
+		a0 = vdev->fw_info_pa;
+
+		/* only get fw_info once and cache it */
+		err = vnic_dev_cmd(vdev, CMD_MCPU_FW_INFO, &a0, &a1, wait);
+	}
+
+	*fw_info = vdev->fw_info;
+
+	return err;
+}
+
+/*
+ * Read @size bytes (1, 2, 4 or 8) at @offset of the device-specific
+ * block into *value.  Any other @size is a BUG().  *value is written
+ * even when the command fails, so callers must check the return code.
+ */
+int vnic_dev_spec(struct vnic_dev *vdev, unsigned int offset, unsigned int size,
+	void *value)
+{
+	u64 a0, a1;
+	int wait = 1000;
+	int err;
+
+	a0 = offset;
+	a1 = size;
+
+	err = vnic_dev_cmd(vdev, CMD_DEV_SPEC, &a0, &a1, wait);
+
+	switch (size) {
+	case 1:
+		*(u8 *)value = (u8)a0;
+		break;
+	case 2:
+		*(u16 *)value = (u16)a0;
+		break;
+	case 4:
+		*(u32 *)value = (u32)a0;
+		break;
+	case 8:
+		*(u64 *)value = a0;
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	return err;
+}
+
+/* Ask the firmware to clear the vNIC statistics */
+int vnic_dev_stats_clear(struct vnic_dev *vdev)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_STATS_CLEAR, &a0, &a1, wait);
+}
+
+/*
+ * Ask the firmware to dump statistics into the (lazily allocated)
+ * DMA stats buffer and return that buffer in *stats.
+ */
+int vnic_dev_stats_dump(struct vnic_dev *vdev, struct vnic_stats **stats)
+{
+	u64 a0, a1;
+	int wait = 1000;
+
+	if (!vdev->stats) {
+		vdev->stats = pci_alloc_consistent(vdev->pdev,
+			sizeof(struct vnic_stats), &vdev->stats_pa);
+		if (!vdev->stats)
+			return -ENOMEM;
+	}
+
+	*stats = vdev->stats;
+	a0 = vdev->stats_pa;
+	a1 = sizeof(struct vnic_stats);
+
+	return vnic_dev_cmd(vdev, CMD_STATS_DUMP, &a0, &a1, wait);
+}
+
+/* Issue CMD_CLOSE */
+int vnic_dev_close(struct vnic_dev *vdev)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_CLOSE, &a0, &a1, wait);
+}
+
+/* Issue CMD_ENABLE */
+int vnic_dev_enable(struct vnic_dev *vdev)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_ENABLE, &a0, &a1, wait);
+}
+
+/* Issue CMD_DISABLE */
+int vnic_dev_disable(struct vnic_dev *vdev)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_DISABLE, &a0, &a1, wait);
+}
+
+/* Start the open sequence; poll completion with vnic_dev_open_done() */
+int vnic_dev_open(struct vnic_dev *vdev, int arg)
+{
+	u64 a0 = (u32)arg, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_OPEN, &a0, &a1, wait);
+}
+
+/* Set *done to 1 once the firmware reports the open as complete */
+int vnic_dev_open_done(struct vnic_dev *vdev, int *done)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	int err;
+
+	*done = 0;
+
+	err = vnic_dev_cmd(vdev, CMD_OPEN_STATUS, &a0, &a1, wait);
+	if (err)
+		return err;
+
+	*done = (a0 == 0);
+
+	return 0;
+}
+
+/* Start a soft reset; poll with vnic_dev_soft_reset_done() */
+int vnic_dev_soft_reset(struct vnic_dev *vdev, int arg)
+{
+	u64 a0 = (u32)arg, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_SOFT_RESET, &a0, &a1, wait);
+}
+
+/* Set *done to 1 once the firmware reports the soft reset as complete */
+int vnic_dev_soft_reset_done(struct vnic_dev *vdev, int *done)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	int err;
+
+	*done = 0;
+
+	err = vnic_dev_cmd(vdev, CMD_SOFT_RESET_STATUS, &a0, &a1, wait);
+	if (err)
+		return err;
+
+	*done = (a0 == 0);
+
+	return 0;
+}
+
+/* Issue CMD_HANG_NOTIFY (no arguments in either direction) */
+int vnic_dev_hang_notify(struct vnic_dev *vdev)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_HANG_NOTIFY, &a0, &a1, wait);
+}
+
+/*
+ * Read the MAC address into mac_addr[ETH_ALEN].  The buffer is zeroed
+ * first, so it holds all zeroes when an error is returned.
+ */
+int vnic_dev_mac_addr(struct vnic_dev *vdev, u8 *mac_addr)
+{
+	u64 a0, a1;
+	int wait = 1000;
+	int err, i;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		mac_addr[i] = 0;
+
+	err = vnic_dev_cmd(vdev, CMD_MAC_ADDR, &a0, &a1, wait);
+	if (err)
+		return err;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		mac_addr[i] = ((u8 *)&a0)[i];
+
+	return 0;
+}
+
+/* Program the Rx packet filter; failures are only logged */
+void vnic_dev_packet_filter(struct vnic_dev *vdev, int directed, int multicast,
+	int broadcast, int promisc, int allmulti)
+{
+	u64 a0, a1 = 0;
+	int wait = 1000;
+	int err;
+
+	a0 = (directed ? CMD_PFILTER_DIRECTED : 0) |
+	     (multicast ? CMD_PFILTER_MULTICAST : 0) |
+	     (broadcast ? CMD_PFILTER_BROADCAST : 0) |
+	     (promisc ? CMD_PFILTER_PROMISCUOUS : 0) |
+	     (allmulti ? CMD_PFILTER_ALL_MULTICAST : 0);
+
+	err = vnic_dev_cmd(vdev, CMD_PACKET_FILTER, &a0, &a1, wait);
+	if (err)
+		printk(KERN_ERR "Can't set packet filter\n");
+}
+
+/* Add the ETH_ALEN-byte address @addr; failures are only logged */
+void vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	int err;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		((u8 *)&a0)[i] = addr[i];
+
+	err = vnic_dev_cmd(vdev, CMD_ADDR_ADD, &a0, &a1, wait);
+	if (err)
+		printk(KERN_ERR
+			"Can't add addr [%02x:%02x:%02x:%02x:%02x:%02x], %d\n",
+			addr[0], addr[1], addr[2], addr[3], addr[4], addr[5],
+			err);
+}
+
+/* Delete the ETH_ALEN-byte address @addr; failures are only logged */
+void vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	int err;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		((u8 *)&a0)[i] = addr[i];
+
+	err = vnic_dev_cmd(vdev, CMD_ADDR_DEL, &a0, &a1, wait);
+	if (err)
+		printk(KERN_ERR
+			"Can't del addr [%02x:%02x:%02x:%02x:%02x:%02x], %d\n",
+			addr[0], addr[1], addr[2], addr[3], addr[4], addr[5],
+			err);
+}
+
+/*
+ * Register the (lazily allocated) DMA notify buffer with the firmware.
+ * a1 carries the interrupt number in bits 32-47 and the buffer size in
+ * the low 32 bits.
+ */
+int vnic_dev_notify_set(struct vnic_dev *vdev, u16 intr)
+{
+	u64 a0, a1;
+	int wait = 1000;
+
+	if (!vdev->notify) {
+		vdev->notify = pci_alloc_consistent(vdev->pdev,
+			sizeof(struct vnic_devcmd_notify),
+			&vdev->notify_pa);
+		if (!vdev->notify)
+			return -ENOMEM;
+	}
+
+	a0 = vdev->notify_pa;
+	a1 = ((u64)intr << 32) & 0x0000ffff00000000ULL;
+	a1 += sizeof(struct vnic_devcmd_notify);
+
+	return vnic_dev_cmd(vdev, CMD_NOTIFY, &a0, &a1, wait);
+}
+
+/* Tell the firmware to stop using the notify buffer and interrupt */
+void vnic_dev_notify_unset(struct vnic_dev *vdev)
+{
+	u64 a0, a1;
+	int wait = 1000;
+
+	a0 = 0;  /* paddr = 0 to unset notify buffer */
+	a1 = 0x0000ffff00000000ULL; /* intr num = -1 to unreg for intr */
+	a1 += sizeof(struct vnic_devcmd_notify);
+
+	vnic_dev_cmd(vdev, CMD_NOTIFY, &a0, &a1, wait);
+}
+
+/*
+ * Snapshot the notify buffer into vdev->notify_copy.  The copy is
+ * repeated until the sum of words 1..n-1 equals the checksum in word 0.
+ * Returns 0 if no notify buffer is set up, 1 once a consistent copy
+ * has been taken.
+ *
+ * NOTE(review): the retry loop has no upper bound; a buffer whose
+ * checksum never matches would spin here forever.
+ */
+static int vnic_dev_notify_ready(struct vnic_dev *vdev)
+{
+	u32 *words;
+	unsigned int nwords = sizeof(struct vnic_devcmd_notify) / 4;
+	unsigned int i;
+	u32 csum;
+
+	if (!vdev->notify)
+		return 0;
+
+	do {
+		csum = 0;
+		memcpy(&vdev->notify_copy, vdev->notify,
+			sizeof(struct vnic_devcmd_notify));
+		words = (u32 *)&vdev->notify_copy;
+		for (i = 1; i < nwords; i++)
+			csum += words[i];
+	} while (csum != words[0]);
+
+	return 1;
+}
+
+/* Issue CMD_INIT with @arg in a0 */
+int vnic_dev_init(struct vnic_dev *vdev, int arg)
+{
+	u64 a0 = (u32)arg, a1 = 0;
+	int wait = 1000;
+	return vnic_dev_cmd(vdev, CMD_INIT, &a0, &a1, wait);
+}
+
+/*
+ * Link state: taken from the linkstatus buffer when one is set,
+ * otherwise from the notify block (0 if no notify buffer exists).
+ */
+int vnic_dev_link_status(struct vnic_dev *vdev)
+{
+	if (vdev->linkstatus)
+		return *vdev->linkstatus;
+
+	if (!vnic_dev_notify_ready(vdev))
+		return 0;
+
+	return vdev->notify_copy.link_state;
+}
+
+/* port_speed from the notify block, 0 if no notify buffer exists */
+u32 vnic_dev_port_speed(struct vnic_dev *vdev)
+{
+	if (!vnic_dev_notify_ready(vdev))
+		return 0;
+
+	return vdev->notify_copy.port_speed;
+}
+
+/* msglvl from the notify block, 0 if no notify buffer exists */
+u32 vnic_dev_msg_lvl(struct vnic_dev *vdev)
+{
+	if (!vnic_dev_notify_ready(vdev))
+		return 0;
+
+	return vdev->notify_copy.msglvl;
+}
+
+/* mtu from the notify block, 0 if no notify buffer exists */
+u32 vnic_dev_mtu(struct vnic_dev *vdev)
+{
+	if (!vnic_dev_notify_ready(vdev))
+		return 0;
+
+	return vdev->notify_copy.mtu;
+}
+
+/* link_down_cnt from the notify block, 0 if no notify buffer exists */
+u32 vnic_dev_link_down_cnt(struct vnic_dev *vdev)
+{
+	if (!vnic_dev_notify_ready(vdev))
+		return 0;
+
+	return vdev->notify_copy.link_down_cnt;
+}
+
+/* Remember the interrupt mode chosen by the driver */
+void vnic_dev_set_intr_mode(struct vnic_dev *vdev,
+	enum vnic_dev_intr_mode intr_mode)
+{
+	vdev->intr_mode = intr_mode;
+}
+
+/* Return the interrupt mode stored by vnic_dev_set_intr_mode() */
+enum vnic_dev_intr_mode vnic_dev_get_intr_mode(
+	struct vnic_dev *vdev)
+{
+	return vdev->intr_mode;
+}
+
+/*
+ * Free every DMA buffer owned by @vdev and then @vdev itself.  A NULL
+ * @vdev is ignored.  Each free must pass the same size that was used
+ * for the matching pci_alloc_consistent() call.
+ */
+void vnic_dev_unregister(struct vnic_dev *vdev)
+{
+	if (vdev) {
+		if (vdev->notify)
+			pci_free_consistent(vdev->pdev,
+				sizeof(struct vnic_devcmd_notify),
+				vdev->notify,
+				vdev->notify_pa);
+		if (vdev->linkstatus)
+			pci_free_consistent(vdev->pdev,
+				sizeof(u32),
+				vdev->linkstatus,
+				vdev->linkstatus_pa);
+		if (vdev->stats)
+			pci_free_consistent(vdev->pdev,
+				sizeof(struct vnic_stats),
+				vdev->stats, vdev->stats_pa);
+		if (vdev->fw_info)
+			pci_free_consistent(vdev->pdev,
+				sizeof(struct vnic_devcmd_fw_info),
+				vdev->fw_info, vdev->fw_info_pa);
+		kfree(vdev);
+	}
+}
+
+/*
+ * Set up @vdev (allocating it when NULL is passed), discover the BAR0
+ * resources and locate the devcmd region.  Returns the device on
+ * success or NULL on failure.
+ *
+ * NOTE(review): the error path calls vnic_dev_unregister(), which
+ * kfree()s @vdev even when the caller supplied it.
+ */
+struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev,
+	void *priv, struct pci_dev *pdev, struct vnic_dev_bar *bar)
+{
+	if (!vdev) {
+		vdev = kzalloc(sizeof(struct vnic_dev), GFP_KERNEL);
+		if (!vdev)
+			return NULL;
+	}
+
+	vdev->priv = priv;
+	vdev->pdev = pdev;
+
+	if (vnic_dev_discover_res(vdev, bar))
+		goto err_out;
+
+	vdev->devcmd = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD, 0);
+	if (!vdev->devcmd)
+		goto err_out;
+
+	return vdev;
+
+err_out:
+	vnic_dev_unregister(vdev);
+	return NULL;
+}
diff --git a/drivers/scsi/fnic/vnic_dev.h b/drivers/scsi/fnic/vnic_dev.h
new file mode 100644
index 00000000000..f9935a8a5a0
--- /dev/null
+++ b/drivers/scsi/fnic/vnic_dev.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2008 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _VNIC_DEV_H_
+#define _VNIC_DEV_H_
+
+#include "vnic_resource.h"
+#include "vnic_devcmd.h"
+
+/*
+ * These defines avoid symbol clash between fnic and enic (Cisco 10G Eth
+ * Driver) when both are built with CONFIG options =y
+ */
+#define vnic_dev_priv	fnic_dev_priv
+#define vnic_dev_get_res_count	fnic_dev_get_res_count
+#define vnic_dev_get_res	fnic_dev_get_res
+#define vnic_dev_desc_ring_size	fnic_dev_desc_ring_siz
+#define vnic_dev_clear_desc_ring	fnic_dev_clear_desc_ring
+#define vnic_dev_alloc_desc_ring	fnic_dev_alloc_desc_ring
+#define vnic_dev_free_desc_ring	fnic_dev_free_desc_ring
+#define vnic_dev_cmd	fnic_dev_cmd
+#define vnic_dev_fw_info	fnic_dev_fw_info
+#define vnic_dev_spec	fnic_dev_spec
+#define vnic_dev_stats_clear	fnic_dev_stats_clear
+#define vnic_dev_stats_dump	fnic_dev_stats_dump
+#define vnic_dev_hang_notify	fnic_dev_hang_notify
+#define vnic_dev_packet_filter	fnic_dev_packet_filter
+#define vnic_dev_add_addr	fnic_dev_add_addr
+#define vnic_dev_del_addr	fnic_dev_del_addr
+#define vnic_dev_mac_addr	fnic_dev_mac_addr
+#define vnic_dev_notify_set	fnic_dev_notify_set
+#define vnic_dev_notify_unset	fnic_dev_notify_unset
+#define vnic_dev_link_status	fnic_dev_link_status
+#define vnic_dev_port_speed	fnic_dev_port_speed
+#define vnic_dev_msg_lvl	fnic_dev_msg_lvl
+#define vnic_dev_mtu	fnic_dev_mtu
+#define vnic_dev_link_down_cnt	fnic_dev_link_down_cnt
+#define vnic_dev_close	fnic_dev_close
+#define vnic_dev_enable	fnic_dev_enable
+#define vnic_dev_disable	fnic_dev_disable
+#define vnic_dev_open	fnic_dev_open
+#define vnic_dev_open_done	fnic_dev_open_done
+#define vnic_dev_init	fnic_dev_init
+#define vnic_dev_soft_reset	fnic_dev_soft_reset
+#define vnic_dev_soft_reset_done	fnic_dev_soft_reset_done
+#define vnic_dev_set_intr_mode	fnic_dev_set_intr_mode
+#define vnic_dev_get_intr_mode	fnic_dev_get_intr_mode
+#define vnic_dev_unregister	fnic_dev_unregister
+#define vnic_dev_register	fnic_dev_register
+
+#ifndef VNIC_PADDR_TARGET
+#define VNIC_PADDR_TARGET	0x0000000000000000ULL
+#endif
+
+/*
+ * Fallback 64-bit MMIO accessors for builds where readq is not already
+ * a macro.  Each is built from two 32-bit accesses (low word at reg,
+ * high word at reg + 4), so the 64-bit access is not a single bus
+ * transaction.
+ */
+#ifndef readq
+static inline u64 readq(void __iomem *reg)
+{
+	return ((u64)readl(reg + 0x4UL) << 32) | (u64)readl(reg);
+}
+
+static inline void writeq(u64 val, void __iomem *reg)
+{
+	writel(val & 0xffffffff, reg);
+	writel(val >> 32, reg + 0x4UL);
+}
+#endif
+
+/* Interrupt delivery mode recorded per device */
+enum vnic_dev_intr_mode {
+	VNIC_DEV_INTR_MODE_UNKNOWN,
+	VNIC_DEV_INTR_MODE_INTX,
+	VNIC_DEV_INTR_MODE_MSI,
+	VNIC_DEV_INTR_MODE_MSIX,
+};
+
+/* Description of the BAR handed to vnic_dev_register() */
+struct vnic_dev_bar {
+	void __iomem *vaddr;
+	dma_addr_t bus_addr;
+	unsigned long len;
+};
+
+/*
+ * Descriptor ring bookkeeping.  The *_unaligned members describe the
+ * raw allocation; descs/base_addr are the same memory moved up to a
+ * base_align boundary.
+ */
+struct vnic_dev_ring {
+	void *descs;
+	size_t size;
+	dma_addr_t base_addr;
+	size_t base_align;
+	void *descs_unaligned;
+	size_t size_unaligned;
+	dma_addr_t base_addr_unaligned;
+	unsigned int desc_size;
+	unsigned int desc_count;
+	unsigned int desc_avail;
+};
+
+/* struct vnic_dev is opaque to users of this header */
+struct vnic_dev;
+struct vnic_stats;
+
+void *vnic_dev_priv(struct vnic_dev *vdev);
+unsigned int vnic_dev_get_res_count(struct vnic_dev *vdev,
+	enum vnic_res_type type);
+void __iomem *vnic_dev_get_res(struct vnic_dev *vdev, enum vnic_res_type type,
+	unsigned int index);
+unsigned int vnic_dev_desc_ring_size(struct vnic_dev_ring *ring,
+	unsigned int desc_count,
+	unsigned int desc_size);
+void vnic_dev_clear_desc_ring(struct vnic_dev_ring *ring);
+int vnic_dev_alloc_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring,
+	unsigned int desc_count, unsigned int desc_size);
+void vnic_dev_free_desc_ring(struct vnic_dev *vdev,
+	struct vnic_dev_ring *ring);
+int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
+	u64 *a0, u64 *a1, int wait);
+int vnic_dev_fw_info(struct vnic_dev *vdev,
+	struct vnic_devcmd_fw_info **fw_info);
+int vnic_dev_spec(struct vnic_dev *vdev, unsigned int offset,
+	unsigned int size, void *value);
+int vnic_dev_stats_clear(struct vnic_dev *vdev);
+int vnic_dev_stats_dump(struct vnic_dev *vdev, struct vnic_stats **stats);
+int vnic_dev_hang_notify(struct vnic_dev *vdev);
+void vnic_dev_packet_filter(struct vnic_dev *vdev, int directed, int multicast,
+	int broadcast, int promisc, int allmulti);
+void vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr);
+void vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr);
+int vnic_dev_mac_addr(struct vnic_dev *vdev, u8 *mac_addr);
+int vnic_dev_notify_set(struct vnic_dev *vdev, u16 intr);
+void vnic_dev_notify_unset(struct vnic_dev *vdev);
+int vnic_dev_link_status(struct vnic_dev *vdev);
+u32 vnic_dev_port_speed(struct vnic_dev *vdev);
+u32 vnic_dev_msg_lvl(struct vnic_dev *vdev);
+u32 vnic_dev_mtu(struct vnic_dev *vdev);
+u32 vnic_dev_link_down_cnt(struct vnic_dev *vdev);
+int vnic_dev_close(struct vnic_dev *vdev);
+int vnic_dev_enable(struct vnic_dev *vdev);
+int vnic_dev_disable(struct vnic_dev *vdev);
+int vnic_dev_open(struct vnic_dev *vdev, int arg);
+int vnic_dev_open_done(struct vnic_dev *vdev, int *done);
+int vnic_dev_init(struct vnic_dev *vdev, int arg);
+int vnic_dev_soft_reset(struct vnic_dev *vdev, int arg);
+int vnic_dev_soft_reset_done(struct vnic_dev *vdev, int *done);
+void vnic_dev_set_intr_mode(struct vnic_dev *vdev,
+	enum vnic_dev_intr_mode intr_mode);
+enum vnic_dev_intr_mode vnic_dev_get_intr_mode(struct vnic_dev *vdev);
+void vnic_dev_unregister(struct vnic_dev *vdev);
+struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev,
+	void *priv, struct pci_dev *pdev,
+	struct vnic_dev_bar *bar);
+
+#endif /* _VNIC_DEV_H_ */
diff --git a/drivers/scsi/fnic/vnic_devcmd.h b/drivers/scsi/fnic/vnic_devcmd.h
new file mode 100644
index 00000000000..d62b9061bf1
--- /dev/null
+++ b/drivers/scsi/fnic/vnic_devcmd.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2008 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _VNIC_DEVCMD_H_
+#define _VNIC_DEVCMD_H_
+
+/*
+ * Layout of the 32-bit command word, as implied by the widths and
+ * shifts below:
+ *   bits  0-13  command number
+ *   bits 14-23  vNIC type mask
+ *   bits 24-29  flags
+ *   bits 30-31  direction
+ */
+#define _CMD_NBITS	14
+#define _CMD_VTYPEBITS	10
+#define _CMD_FLAGSBITS	6
+#define _CMD_DIRBITS	2
+
+#define _CMD_NMASK	((1 << _CMD_NBITS)-1)
+#define _CMD_VTYPEMASK	((1 << _CMD_VTYPEBITS)-1)
+#define _CMD_FLAGSMASK	((1 << _CMD_FLAGSBITS)-1)
+#define _CMD_DIRMASK	((1 << _CMD_DIRBITS)-1)
+
+#define _CMD_NSHIFT	0
+#define _CMD_VTYPESHIFT	(_CMD_NSHIFT+_CMD_NBITS)
+#define _CMD_FLAGSSHIFT	(_CMD_VTYPESHIFT+_CMD_VTYPEBITS)
+#define _CMD_DIRSHIFT	(_CMD_FLAGSSHIFT+_CMD_FLAGSBITS)
+
+/*
+ * Direction bits (from host perspective).
+ */
+#define _CMD_DIR_NONE	0U
+#define _CMD_DIR_WRITE	1U
+#define _CMD_DIR_READ	2U
+#define _CMD_DIR_RW	(_CMD_DIR_WRITE | _CMD_DIR_READ)
+
+/*
+ * Flag bits.
+ */
+#define _CMD_FLAGS_NONE	0U
+#define _CMD_FLAGS_NOWAIT	1U
+
+/*
+ * vNIC type bits.
+ */
+#define _CMD_VTYPE_NONE	0U
+#define _CMD_VTYPE_ENET	1U
+#define _CMD_VTYPE_FC	2U
+#define _CMD_VTYPE_SCSI	4U
+#define _CMD_VTYPE_ALL	(_CMD_VTYPE_ENET | _CMD_VTYPE_FC | _CMD_VTYPE_SCSI)
+
+/*
+ * Used to create cmds..
+ * _CMDC builds a command with no flags, _CMDCNW one with
+ * _CMD_FLAGS_NOWAIT set.
+*/
+#define _CMDCF(dir, flags, vtype, nr) \
+	(((dir) << _CMD_DIRSHIFT) | \
+	((flags) << _CMD_FLAGSSHIFT) | \
+	((vtype) << _CMD_VTYPESHIFT) | \
+	((nr) << _CMD_NSHIFT))
+#define _CMDC(dir, vtype, nr)	_CMDCF(dir, 0, vtype, nr)
+#define _CMDCNW(dir, vtype, nr)	_CMDCF(dir, _CMD_FLAGS_NOWAIT, vtype, nr)
+
+/*
+ * Used to decode cmds..
+*/
+#define _CMD_DIR(cmd)	(((cmd) >> _CMD_DIRSHIFT) & _CMD_DIRMASK)
+#define _CMD_FLAGS(cmd)	(((cmd) >> _CMD_FLAGSSHIFT) & _CMD_FLAGSMASK)
+#define _CMD_VTYPE(cmd)	(((cmd) >> _CMD_VTYPESHIFT) & _CMD_VTYPEMASK)
+#define _CMD_N(cmd)	(((cmd) >> _CMD_NSHIFT) & _CMD_NMASK)
+
+enum vnic_devcmd_cmd {
+	CMD_NONE = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_NONE, 0),
+
+	/* mcpu fw info in mem: (u64)a0=paddr to struct vnic_devcmd_fw_info */
+	CMD_MCPU_FW_INFO = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 1),
+
+	/* dev-specific block member:
+	 * in: (u16)a0=offset,(u8)a1=size
+	 * out: a0=value */
+	CMD_DEV_SPEC = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 2),
+
+	/* stats clear */
+	CMD_STATS_CLEAR = _CMDCNW(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 3),
+
+	/* stats dump in mem: (u64)a0=paddr to stats area,
+	 * (u16)a1=sizeof stats area */
+	CMD_STATS_DUMP = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 4),
+
+	/* set Rx packet filter: (u32)a0=filters (see CMD_PFILTER_*) */
+	CMD_PACKET_FILTER = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 7),
+
+	/* hang detection notification */
+	CMD_HANG_NOTIFY = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 8),
+
+	/* MAC address in (u48)a0 */
+	CMD_MAC_ADDR = _CMDC(_CMD_DIR_READ,
+		_CMD_VTYPE_ENET | _CMD_VTYPE_FC, 9),
+
+	/* disable/enable promisc mode: (u8)a0=0/1 */
+/***** XXX DEPRECATED *****/
+	CMD_PROMISC_MODE = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 10),
+
+	/* disable/enable all-multi mode: (u8)a0=0/1 */
+/***** XXX DEPRECATED *****/
+	CMD_ALLMULTI_MODE = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 11),
+
+	/* add addr from (u48)a0 */
+	CMD_ADDR_ADD = _CMDCNW(_CMD_DIR_WRITE,
+		_CMD_VTYPE_ENET | _CMD_VTYPE_FC, 12),
+
+	/* del addr from (u48)a0 */
+	CMD_ADDR_DEL = _CMDCNW(_CMD_DIR_WRITE,
+		_CMD_VTYPE_ENET | _CMD_VTYPE_FC, 13),
+
+	/* add VLAN id in (u16)a0 */
+	CMD_VLAN_ADD = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 14),
+
+	/* del VLAN id in (u16)a0 */
+	CMD_VLAN_DEL = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 15),
+
+	/* nic_cfg in (u32)a0 */
+	CMD_NIC_CFG = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 16),
+
+	/* union vnic_rss_key in mem: (u64)a0=paddr, (u16)a1=len */
+	CMD_RSS_KEY = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 17),
+
+	/* union vnic_rss_cpu in mem: (u64)a0=paddr, (u16)a1=len */
+	CMD_RSS_CPU = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 18),
+
+	/* initiate softreset */
+	CMD_SOFT_RESET = _CMDCNW(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 19),
+
+	/* softreset status:
+	 * out: a0=0 reset complete, a0=1 reset in progress */
+	CMD_SOFT_RESET_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 20),
+
+	/* set struct vnic_devcmd_notify buffer in mem:
+	 * in:
+	 *   (u64)a0=paddr to notify (set paddr=0 to unset)
+	 *   (u32)a1 & 0x00000000ffffffff=sizeof(struct vnic_devcmd_notify)
+	 *   (u16)a1 & 0x0000ffff00000000=intr num (-1 for no intr)
+	 * out:
+	 *   (u32)a1 = effective size
+	 */
+	CMD_NOTIFY = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 21),
+
+	/* UNDI API: (u64)a0=paddr to s_PXENV_UNDI_ struct,
+	 * (u8)a1=PXENV_UNDI_xxx */
+	CMD_UNDI = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 22),
+
+	/* initiate open sequence (u32)a0=flags (see CMD_OPENF_*) */
+	CMD_OPEN = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 23),
+
+	/* open status:
+	 * out: a0=0 open complete, a0=1 open in progress */
+	CMD_OPEN_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 24),
+
+	/* close vnic */
+	CMD_CLOSE = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 25),
+
+	/* initialize virtual link: (u32)a0=flags (see CMD_INITF_*)
+	 * NOTE(review): encoded as _CMD_DIR_READ although a0 is described
+	 * as an input; vnic_dev_cmd() only writes the arg registers for
+	 * _CMD_DIR_WRITE commands, so the flags do not appear to reach
+	 * the device - confirm against the firmware interface. */
+	CMD_INIT = _CMDCNW(_CMD_DIR_READ, _CMD_VTYPE_ALL, 26),
+
+	/* variant of CMD_INIT, with provisioning info
+	 * (u64)a0=paddr of vnic_devcmd_provinfo
+	 * (u32)a1=sizeof provision info */
+	CMD_INIT_PROV_INFO = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 27),
+
+	/* enable virtual link */
+	CMD_ENABLE = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 28),
+
+	/* disable virtual link */
+	CMD_DISABLE = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 29),
+
+	/* stats dump all vnics on uplink in mem: (u64)a0=paddr (u32)a1=uif */
+	CMD_STATS_DUMP_ALL = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 30),
+
+	/* init status:
+	 * out: a0=0 init complete, a0=1 init in progress
+	 * if a0=0, a1=errno */
+	CMD_INIT_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 31),
+
+	/* INT13 API: (u64)a0=paddr to vnic_int13_params struct
+	 * (u8)a1=INT13_CMD_xxx */
+	CMD_INT13 = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_FC, 32),
+
+	/* logical uplink enable/disable: (u64)a0: 0/1=disable/enable */
+	CMD_LOGICAL_UPLINK = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 33),
+
+	/* undo initialize of virtual link */
+	CMD_DEINIT = _CMDCNW(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 34),
+};
+
+/* flags for CMD_OPEN */
+#define CMD_OPENF_OPROM	0x1	/* open coming from option rom */
+
+/* flags for CMD_INIT */
+#define CMD_INITF_DEFAULT_MAC	0x1	/* init with default mac addr */
+
+/* flags for CMD_PACKET_FILTER */
+#define CMD_PFILTER_DIRECTED	0x01
+#define CMD_PFILTER_MULTICAST	0x02
+#define CMD_PFILTER_BROADCAST	0x04
+#define CMD_PFILTER_PROMISCUOUS	0x08
+#define CMD_PFILTER_ALL_MULTICAST	0x10
+
+/* Bits of the devcmd status register */
+enum vnic_devcmd_status {
+	STAT_NONE = 0,
+	STAT_BUSY = 1 << 0,	/* cmd in progress */
+	STAT_ERROR = 1 << 1,	/* last cmd caused error (code in a0) */
+};
+
+/* Error codes the firmware places in args[0] when STAT_ERROR is set */
+enum vnic_devcmd_error {
+	ERR_SUCCESS = 0,
+	ERR_EINVAL = 1,
+	ERR_EFAULT = 2,
+	ERR_EPERM = 3,
+	ERR_EBUSY = 4,
+	ERR_ECMDUNKNOWN = 5,
+	ERR_EBADSTATE = 6,
+	ERR_ENOMEM = 7,
+	ERR_ETIMEDOUT = 8,
+	ERR_ELINKDOWN = 9,
+};
+
+/* Buffer filled in by CMD_MCPU_FW_INFO: four 32-byte text fields */
+struct vnic_devcmd_fw_info {
+	char fw_version[32];
+	char fw_build[32];
+	char hw_version[32];
+	char hw_serial_number[32];
+};
+
+struct vnic_devcmd_notify {
+	u32 csum;	/* checksum over following words */
+
+	u32 link_state;	/* link up == 1 */
+	u32 port_speed;	/* effective port speed (rate limit) */
+	u32 mtu;	/* MTU */
+	u32 msglvl;	/* requested driver msg lvl */
+	u32 uif;	/* uplink interface */
+	u32 status;	/* status bits (see VNIC_STF_*) */
+	u32 error;	/* error code (see ERR_*) for first ERR */
+	u32 link_down_cnt;	/* running count of link down transitions */
+};
+#define VNIC_STF_FATAL_ERR	0x0001	/* fatal fw error */
+
+/* Provisioning info header; data[] is a variable-length trailer */
+struct vnic_devcmd_provinfo {
+	u8 oui[3];
+	u8 type;
+	u8 data[0];
+};
+
+/*
+ * Writing cmd register causes STAT_BUSY to get set in status register.
+ * When cmd completes, STAT_BUSY will be cleared.
+ *
+ * If cmd completed successfully STAT_ERROR will be clear
+ * and args registers contain cmd-specific results.
+ *
+ * If cmd error, STAT_ERROR will be set and args[0] contains error code.
+ *
+ * status register is read-only. While STAT_BUSY is set,
+ * all other register contents are read-only.
+ */
+
+/* Make sizeof(vnic_devcmd) a power-of-2 for I/O BAR.
+ * (4 + 4 + 15 * 8 = 128 bytes) */
+#define VNIC_DEVCMD_NARGS	15
+struct vnic_devcmd {
+	u32 status;	/* RO */
+	u32 cmd;	/* RW */
+	u64 args[VNIC_DEVCMD_NARGS];	/* RW cmd args (little-endian) */
+};
+
+#endif /* _VNIC_DEVCMD_H_ */
diff --git a/drivers/scsi/fnic/vnic_intr.c b/drivers/scsi/fnic/vnic_intr.c
new file mode 100644
index 00000000000..4f4dc8793d2
--- /dev/null
+++ b/drivers/scsi/fnic/vnic_intr.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2008 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "vnic_dev.h"
+#include "vnic_intr.h"
+
+/* Drop the reference to the interrupt control block; nothing is freed */
+void vnic_intr_free(struct vnic_intr *intr)
+{
+	intr->ctrl = NULL;
+}
+
+/*
+ * Bind @intr to interrupt control block @index of @vdev.
+ * Returns 0 on success, -EINVAL if the resource is not available.
+ */
+int vnic_intr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr,
+	unsigned int index)
+{
+	struct vnic_intr_ctrl __iomem *ctrl;
+
+	ctrl = vnic_dev_get_res(vdev, RES_TYPE_INTR_CTRL, index);
+
+	intr->index = index;
+	intr->vdev = vdev;
+	intr->ctrl = ctrl;
+
+	if (ctrl)
+		return 0;
+
+	printk(KERN_ERR "Failed to hook INTR[%d].ctrl resource\n",
+		index);
+	return -EINVAL;
+}
+
+/* Program the coalescing and masking registers and clear the credits */
+void vnic_intr_init(struct vnic_intr *intr, unsigned int coalescing_timer,
+	unsigned int coalescing_type, unsigned int mask_on_assertion)
+{
+	struct vnic_intr_ctrl __iomem *ctrl = intr->ctrl;
+
+	iowrite32(coalescing_timer, &ctrl->coalescing_timer);
+	iowrite32(coalescing_type, &ctrl->coalescing_type);
+	iowrite32(mask_on_assertion, &ctrl->mask_on_assertion);
+	iowrite32(0, &ctrl->int_credits);
+}
+
+/* Reset the interrupt credit count to zero */
+void vnic_intr_clean(struct vnic_intr *intr)
+{
+	struct vnic_intr_ctrl __iomem *ctrl = intr->ctrl;
+
+	iowrite32(0, &ctrl->int_credits);
+}
diff --git a/drivers/scsi/fnic/vnic_intr.h b/drivers/scsi/fnic/vnic_intr.h
new file mode 100644
index 00000000000..d5fb40e7c98
--- /dev/null
+++ b/drivers/scsi/fnic/vnic_intr.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2008 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _VNIC_INTR_H_
+#define _VNIC_INTR_H_
+
+#include
+#include "vnic_dev.h"
+
+/*
+ * These defines avoid symbol clash between fnic and enic (Cisco 10G Eth
+ * Driver) when both are built with CONFIG options =y
+ */
+#define vnic_intr_unmask	fnic_intr_unmask
+#define vnic_intr_mask	fnic_intr_mask
+#define vnic_intr_return_credits	fnic_intr_return_credits
+#define vnic_intr_credits	fnic_intr_credits
+#define vnic_intr_return_all_credits	fnic_intr_return_all_credits
+#define vnic_intr_legacy_pba	fnic_intr_legacy_pba
+#define vnic_intr_free	fnic_intr_free
+#define vnic_intr_alloc	fnic_intr_alloc
+#define vnic_intr_init	fnic_intr_init
+#define vnic_intr_clean	fnic_intr_clean
+
+#define VNIC_INTR_TIMER_MAX	0xffff
+
+#define VNIC_INTR_TIMER_TYPE_ABS	0
+#define VNIC_INTR_TIMER_TYPE_QUIET	1
+
+/* Interrupt control */
+/* Each 32-bit register is followed by a 32-bit pad (8-byte spacing) */
+struct vnic_intr_ctrl {
+	u32 coalescing_timer;	/* 0x00 */
+	u32 pad0;
+	u32 coalescing_value;	/* 0x08 */
+	u32 pad1;
+	u32 coalescing_type;	/* 0x10 */
+	u32 pad2;
+	u32 mask_on_assertion;	/* 0x18 */
+	u32 pad3;
+	u32 mask;	/* 0x20 */
+	u32 pad4;
+	u32 int_credits;	/* 0x28 */
+	u32 pad5;
+	u32 int_credit_return;	/* 0x30 */
+	u32 pad6;
+};
+
+/* Host-side handle for one interrupt control block */
+struct vnic_intr {
+	unsigned int index;
+	struct vnic_dev *vdev;
+	struct vnic_intr_ctrl __iomem *ctrl;	/* memory-mapped */
+};
+
+/* Write 0 to the mask register */
+static inline void vnic_intr_unmask(struct vnic_intr *intr)
+{
+	iowrite32(0, &intr->ctrl->mask);
+}
+
+/* Write 1 to the mask register */
+static inline void vnic_intr_mask(struct vnic_intr *intr)
+{
+	iowrite32(1, &intr->ctrl->mask);
+}
+
+/*
+ * Write the credit-return register: bits 0-15 carry @credits, bit 16 is
+ * set when @unmask is non-zero, bit 17 when @reset_timer is non-zero.
+ */
+static inline void vnic_intr_return_credits(struct vnic_intr *intr,
+	unsigned int credits, int unmask, int reset_timer)
+{
+#define VNIC_INTR_UNMASK_SHIFT	16
+#define VNIC_INTR_RESET_TIMER_SHIFT	17
+
+	u32 int_credit_return = (credits & 0xffff) |
+		(unmask ? (1 << VNIC_INTR_UNMASK_SHIFT) : 0) |
+		(reset_timer ? (1 << VNIC_INTR_RESET_TIMER_SHIFT) : 0);
+
+	iowrite32(int_credit_return, &intr->ctrl->int_credit_return);
+}
+
+/* Read the current value of the int_credits register */
+static inline unsigned int vnic_intr_credits(struct vnic_intr *intr)
+{
+	return ioread32(&intr->ctrl->int_credits);
+}
+
+/* Return every outstanding credit, with unmask and timer reset set */
+static inline void vnic_intr_return_all_credits(struct vnic_intr *intr)
+{
+	unsigned int credits = vnic_intr_credits(intr);
+	int unmask = 1;
+	int reset_timer = 1;
+
+	vnic_intr_return_credits(intr, credits, unmask, reset_timer);
+}
+
+static inline u32 vnic_intr_legacy_pba(u32 __iomem *legacy_pba)
+{
+	/* read PBA without clearing */
+	return ioread32(legacy_pba);
+}
+
+void vnic_intr_free(struct vnic_intr *intr);
+int vnic_intr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr,
+	unsigned int index);
+void vnic_intr_init(struct vnic_intr *intr, unsigned int coalescing_timer,
+	unsigned int coalescing_type, unsigned int mask_on_assertion);
+void vnic_intr_clean(struct vnic_intr *intr);
+
+#endif /* _VNIC_INTR_H_ */
diff --git a/drivers/scsi/fnic/vnic_nic.h b/drivers/scsi/fnic/vnic_nic.h
new file mode 100644
index 00000000000..f15b83eeace
--- /dev/null
+++ b/drivers/scsi/fnic/vnic_nic.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2008 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _VNIC_NIC_H_ +#define _VNIC_NIC_H_ + +/* + * These defines avoid symbol clash between fnic and enic (Cisco 10G Eth + * Driver) when both are built with CONFIG options =y + */ +#define vnic_set_nic_cfg fnic_set_nic_cfg + +#define NIC_CFG_RSS_DEFAULT_CPU_MASK_FIELD 0xffUL +#define NIC_CFG_RSS_DEFAULT_CPU_SHIFT 0 +#define NIC_CFG_RSS_HASH_TYPE (0xffUL << 8) +#define NIC_CFG_RSS_HASH_TYPE_MASK_FIELD 0xffUL +#define NIC_CFG_RSS_HASH_TYPE_SHIFT 8 +#define NIC_CFG_RSS_HASH_BITS (7UL << 16) +#define NIC_CFG_RSS_HASH_BITS_MASK_FIELD 7UL +#define NIC_CFG_RSS_HASH_BITS_SHIFT 16 +#define NIC_CFG_RSS_BASE_CPU (7UL << 19) +#define NIC_CFG_RSS_BASE_CPU_MASK_FIELD 7UL +#define NIC_CFG_RSS_BASE_CPU_SHIFT 19 +#define NIC_CFG_RSS_ENABLE (1UL << 22) +#define NIC_CFG_RSS_ENABLE_MASK_FIELD 1UL +#define NIC_CFG_RSS_ENABLE_SHIFT 22 +#define NIC_CFG_TSO_IPID_SPLIT_EN (1UL << 23) +#define NIC_CFG_TSO_IPID_SPLIT_EN_MASK_FIELD 1UL +#define NIC_CFG_TSO_IPID_SPLIT_EN_SHIFT 23 +#define NIC_CFG_IG_VLAN_STRIP_EN (1UL << 24) +#define NIC_CFG_IG_VLAN_STRIP_EN_MASK_FIELD 1UL +#define NIC_CFG_IG_VLAN_STRIP_EN_SHIFT 24 + +static inline void vnic_set_nic_cfg(u32 *nic_cfg, + u8 rss_default_cpu, u8 rss_hash_type, + u8 rss_hash_bits, u8 rss_base_cpu, + u8 rss_enable, u8 tso_ipid_split_en, + u8 ig_vlan_strip_en) +{ + *nic_cfg = (rss_default_cpu & NIC_CFG_RSS_DEFAULT_CPU_MASK_FIELD) | + ((rss_hash_type & NIC_CFG_RSS_HASH_TYPE_MASK_FIELD) + << NIC_CFG_RSS_HASH_TYPE_SHIFT) | + ((rss_hash_bits & NIC_CFG_RSS_HASH_BITS_MASK_FIELD) + << NIC_CFG_RSS_HASH_BITS_SHIFT) | + ((rss_base_cpu & NIC_CFG_RSS_BASE_CPU_MASK_FIELD) + << NIC_CFG_RSS_BASE_CPU_SHIFT) | + ((rss_enable & NIC_CFG_RSS_ENABLE_MASK_FIELD) + << NIC_CFG_RSS_ENABLE_SHIFT) 
| + ((tso_ipid_split_en & NIC_CFG_TSO_IPID_SPLIT_EN_MASK_FIELD) + << NIC_CFG_TSO_IPID_SPLIT_EN_SHIFT) | + ((ig_vlan_strip_en & NIC_CFG_IG_VLAN_STRIP_EN_MASK_FIELD) + << NIC_CFG_IG_VLAN_STRIP_EN_SHIFT); +} + +#endif /* _VNIC_NIC_H_ */ diff --git a/drivers/scsi/fnic/vnic_resource.h b/drivers/scsi/fnic/vnic_resource.h new file mode 100644 index 00000000000..2d842f79d41 --- /dev/null +++ b/drivers/scsi/fnic/vnic_resource.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _VNIC_RESOURCE_H_ +#define _VNIC_RESOURCE_H_ + +#define VNIC_RES_MAGIC 0x766E6963L /* 'vnic' */ +#define VNIC_RES_VERSION 0x00000000L + +/* vNIC resource types */ +enum vnic_res_type { + RES_TYPE_EOL, /* End-of-list */ + RES_TYPE_WQ, /* Work queues */ + RES_TYPE_RQ, /* Receive queues */ + RES_TYPE_CQ, /* Completion queues */ + RES_TYPE_RSVD1, + RES_TYPE_NIC_CFG, /* Enet NIC config registers */ + RES_TYPE_RSVD2, + RES_TYPE_RSVD3, + RES_TYPE_RSVD4, + RES_TYPE_RSVD5, + RES_TYPE_INTR_CTRL, /* Interrupt ctrl table */ + RES_TYPE_INTR_TABLE, /* MSI/MSI-X Interrupt table */ + RES_TYPE_INTR_PBA, /* MSI/MSI-X PBA table */ + RES_TYPE_INTR_PBA_LEGACY, /* Legacy intr status */ + RES_TYPE_RSVD6, + RES_TYPE_RSVD7, + RES_TYPE_DEVCMD, /* Device command region */ + RES_TYPE_PASS_THRU_PAGE, /* Pass-thru page */ + + RES_TYPE_MAX, /* Count of resource types */ +}; + +struct vnic_resource_header { + u32 magic; + u32 version; +}; + +struct vnic_resource { + u8 type; + u8 bar; + u8 pad[2]; + u32 bar_offset; + u32 count; +}; + +#endif /* _VNIC_RESOURCE_H_ */ diff --git a/drivers/scsi/fnic/vnic_rq.c b/drivers/scsi/fnic/vnic_rq.c new file mode 100644 index 00000000000..bedd0d28563 --- /dev/null +++ b/drivers/scsi/fnic/vnic_rq.c @@ -0,0 +1,196 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "vnic_dev.h" +#include "vnic_rq.h" + +static int vnic_rq_alloc_bufs(struct vnic_rq *rq) +{ + struct vnic_rq_buf *buf; + struct vnic_dev *vdev; + unsigned int i, j, count = rq->ring.desc_count; + unsigned int blks = VNIC_RQ_BUF_BLKS_NEEDED(count); + + vdev = rq->vdev; + + for (i = 0; i < blks; i++) { + rq->bufs[i] = kzalloc(VNIC_RQ_BUF_BLK_SZ, GFP_ATOMIC); + if (!rq->bufs[i]) { + printk(KERN_ERR "Failed to alloc rq_bufs\n"); + return -ENOMEM; + } + } + + for (i = 0; i < blks; i++) { + buf = rq->bufs[i]; + for (j = 0; j < VNIC_RQ_BUF_BLK_ENTRIES; j++) { + buf->index = i * VNIC_RQ_BUF_BLK_ENTRIES + j; + buf->desc = (u8 *)rq->ring.descs + + rq->ring.desc_size * buf->index; + if (buf->index + 1 == count) { + buf->next = rq->bufs[0]; + break; + } else if (j + 1 == VNIC_RQ_BUF_BLK_ENTRIES) { + buf->next = rq->bufs[i + 1]; + } else { + buf->next = buf + 1; + buf++; + } + } + } + + rq->to_use = rq->to_clean = rq->bufs[0]; + rq->buf_index = 0; + + return 0; +} + +void vnic_rq_free(struct vnic_rq *rq) +{ + struct vnic_dev *vdev; + unsigned int i; + + vdev = rq->vdev; + + vnic_dev_free_desc_ring(vdev, &rq->ring); + + for (i = 0; i < VNIC_RQ_BUF_BLKS_MAX; i++) { + kfree(rq->bufs[i]); + rq->bufs[i] = NULL; + } + + rq->ctrl = NULL; +} + +int vnic_rq_alloc(struct vnic_dev *vdev, struct vnic_rq *rq, unsigned int index, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + rq->index = index; + rq->vdev = vdev; + + rq->ctrl = vnic_dev_get_res(vdev, RES_TYPE_RQ, index); + if (!rq->ctrl) { + printk(KERN_ERR "Failed to hook RQ[%d] resource\n", index); + return -EINVAL; + } + + vnic_rq_disable(rq); + + err = vnic_dev_alloc_desc_ring(vdev, 
&rq->ring, desc_count, desc_size); + if (err) + return err; + + err = vnic_rq_alloc_bufs(rq); + if (err) { + vnic_rq_free(rq); + return err; + } + + return 0; +} + +void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + u64 paddr; + u32 fetch_index; + + paddr = (u64)rq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &rq->ctrl->ring_base); + iowrite32(rq->ring.desc_count, &rq->ctrl->ring_size); + iowrite32(cq_index, &rq->ctrl->cq_index); + iowrite32(error_interrupt_enable, &rq->ctrl->error_interrupt_enable); + iowrite32(error_interrupt_offset, &rq->ctrl->error_interrupt_offset); + iowrite32(0, &rq->ctrl->dropped_packet_count); + iowrite32(0, &rq->ctrl->error_status); + + /* Use current fetch_index as the ring starting point */ + fetch_index = ioread32(&rq->ctrl->fetch_index); + rq->to_use = rq->to_clean = + &rq->bufs[fetch_index / VNIC_RQ_BUF_BLK_ENTRIES] + [fetch_index % VNIC_RQ_BUF_BLK_ENTRIES]; + iowrite32(fetch_index, &rq->ctrl->posted_index); + + rq->buf_index = 0; +} + +unsigned int vnic_rq_error_status(struct vnic_rq *rq) +{ + return ioread32(&rq->ctrl->error_status); +} + +void vnic_rq_enable(struct vnic_rq *rq) +{ + iowrite32(1, &rq->ctrl->enable); +} + +int vnic_rq_disable(struct vnic_rq *rq) +{ + unsigned int wait; + + iowrite32(0, &rq->ctrl->enable); + + /* Wait for HW to ACK disable request */ + for (wait = 0; wait < 100; wait++) { + if (!(ioread32(&rq->ctrl->running))) + return 0; + udelay(1); + } + + printk(KERN_ERR "Failed to disable RQ[%d]\n", rq->index); + + return -ETIMEDOUT; +} + +void vnic_rq_clean(struct vnic_rq *rq, + void (*buf_clean)(struct vnic_rq *rq, struct vnic_rq_buf *buf)) +{ + struct vnic_rq_buf *buf; + u32 fetch_index; + + BUG_ON(ioread32(&rq->ctrl->enable)); + + buf = rq->to_clean; + + while (vnic_rq_desc_used(rq) > 0) { + + (*buf_clean)(rq, buf); + + buf = rq->to_clean = buf->next; + rq->ring.desc_avail++; + } + + /* Use current 
fetch_index as the ring starting point */ + fetch_index = ioread32(&rq->ctrl->fetch_index); + rq->to_use = rq->to_clean = + &rq->bufs[fetch_index / VNIC_RQ_BUF_BLK_ENTRIES] + [fetch_index % VNIC_RQ_BUF_BLK_ENTRIES]; + iowrite32(fetch_index, &rq->ctrl->posted_index); + + rq->buf_index = 0; + + vnic_dev_clear_desc_ring(&rq->ring); +} + diff --git a/drivers/scsi/fnic/vnic_rq.h b/drivers/scsi/fnic/vnic_rq.h new file mode 100644 index 00000000000..aebdfbd6ad3 --- /dev/null +++ b/drivers/scsi/fnic/vnic_rq.h @@ -0,0 +1,235 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _VNIC_RQ_H_ +#define _VNIC_RQ_H_ + +#include +#include "vnic_dev.h" +#include "vnic_cq.h" + +/* + * These defines avoid symbol clash between fnic and enic (Cisco 10G Eth + * Driver) when both are built with CONFIG options =y + */ +#define vnic_rq_desc_avail fnic_rq_desc_avail +#define vnic_rq_desc_used fnic_rq_desc_used +#define vnic_rq_next_desc fnic_rq_next_desc +#define vnic_rq_next_index fnic_rq_next_index +#define vnic_rq_next_buf_index fnic_rq_next_buf_index +#define vnic_rq_post fnic_rq_post +#define vnic_rq_posting_soon fnic_rq_posting_soon +#define vnic_rq_return_descs fnic_rq_return_descs +#define vnic_rq_service fnic_rq_service +#define vnic_rq_fill fnic_rq_fill +#define vnic_rq_free fnic_rq_free +#define vnic_rq_alloc fnic_rq_alloc +#define vnic_rq_init fnic_rq_init +#define vnic_rq_error_status fnic_rq_error_status +#define vnic_rq_enable fnic_rq_enable +#define vnic_rq_disable fnic_rq_disable +#define vnic_rq_clean fnic_rq_clean + +/* Receive queue control */ +struct vnic_rq_ctrl { + u64 ring_base; /* 0x00 */ + u32 ring_size; /* 0x08 */ + u32 pad0; + u32 posted_index; /* 0x10 */ + u32 pad1; + u32 cq_index; /* 0x18 */ + u32 pad2; + u32 enable; /* 0x20 */ + u32 pad3; + u32 running; /* 0x28 */ + u32 pad4; + u32 fetch_index; /* 0x30 */ + u32 pad5; + u32 error_interrupt_enable; /* 0x38 */ + u32 pad6; + u32 error_interrupt_offset; /* 0x40 */ + u32 pad7; + u32 error_status; /* 0x48 */ + u32 pad8; + u32 dropped_packet_count; /* 0x50 */ + u32 pad9; + u32 dropped_packet_count_rc; /* 0x58 */ + u32 pad10; +}; + +/* Break the vnic_rq_buf allocations into blocks of 64 entries */ +#define VNIC_RQ_BUF_BLK_ENTRIES 64 +#define VNIC_RQ_BUF_BLK_SZ \ + (VNIC_RQ_BUF_BLK_ENTRIES * sizeof(struct vnic_rq_buf)) +#define VNIC_RQ_BUF_BLKS_NEEDED(entries) \ + DIV_ROUND_UP(entries, VNIC_RQ_BUF_BLK_ENTRIES) +#define VNIC_RQ_BUF_BLKS_MAX VNIC_RQ_BUF_BLKS_NEEDED(4096) + +struct vnic_rq_buf { + struct vnic_rq_buf *next; + dma_addr_t dma_addr; + void *os_buf; + unsigned 
int os_buf_index; + unsigned int len; + unsigned int index; + void *desc; +}; + +struct vnic_rq { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_rq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + struct vnic_rq_buf *bufs[VNIC_RQ_BUF_BLKS_MAX]; + struct vnic_rq_buf *to_use; + struct vnic_rq_buf *to_clean; + void *os_buf_head; + unsigned int buf_index; + unsigned int pkts_outstanding; +}; + +static inline unsigned int vnic_rq_desc_avail(struct vnic_rq *rq) +{ + /* how many does SW own? */ + return rq->ring.desc_avail; +} + +static inline unsigned int vnic_rq_desc_used(struct vnic_rq *rq) +{ + /* how many does HW own? */ + return rq->ring.desc_count - rq->ring.desc_avail - 1; +} + +static inline void *vnic_rq_next_desc(struct vnic_rq *rq) +{ + return rq->to_use->desc; +} + +static inline unsigned int vnic_rq_next_index(struct vnic_rq *rq) +{ + return rq->to_use->index; +} + +static inline unsigned int vnic_rq_next_buf_index(struct vnic_rq *rq) +{ + return rq->buf_index++; +} + +static inline void vnic_rq_post(struct vnic_rq *rq, + void *os_buf, unsigned int os_buf_index, + dma_addr_t dma_addr, unsigned int len) +{ + struct vnic_rq_buf *buf = rq->to_use; + + buf->os_buf = os_buf; + buf->os_buf_index = os_buf_index; + buf->dma_addr = dma_addr; + buf->len = len; + + buf = buf->next; + rq->to_use = buf; + rq->ring.desc_avail--; + + /* Move the posted_index every nth descriptor + */ + +#ifndef VNIC_RQ_RETURN_RATE +#define VNIC_RQ_RETURN_RATE 0xf /* keep 2^n - 1 */ +#endif + + if ((buf->index & VNIC_RQ_RETURN_RATE) == 0) { + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. 
+ */ + wmb(); + iowrite32(buf->index, &rq->ctrl->posted_index); + } +} + +static inline int vnic_rq_posting_soon(struct vnic_rq *rq) +{ + return (rq->to_use->index & VNIC_RQ_RETURN_RATE) == 0; +} + +static inline void vnic_rq_return_descs(struct vnic_rq *rq, unsigned int count) +{ + rq->ring.desc_avail += count; +} + +enum desc_return_options { + VNIC_RQ_RETURN_DESC, + VNIC_RQ_DEFER_RETURN_DESC, +}; + +static inline void vnic_rq_service(struct vnic_rq *rq, + struct cq_desc *cq_desc, u16 completed_index, + int desc_return, void (*buf_service)(struct vnic_rq *rq, + struct cq_desc *cq_desc, struct vnic_rq_buf *buf, + int skipped, void *opaque), void *opaque) +{ + struct vnic_rq_buf *buf; + int skipped; + + buf = rq->to_clean; + while (1) { + + skipped = (buf->index != completed_index); + + (*buf_service)(rq, cq_desc, buf, skipped, opaque); + + if (desc_return == VNIC_RQ_RETURN_DESC) + rq->ring.desc_avail++; + + rq->to_clean = buf->next; + + if (!skipped) + break; + + buf = rq->to_clean; + } +} + +static inline int vnic_rq_fill(struct vnic_rq *rq, + int (*buf_fill)(struct vnic_rq *rq)) +{ + int err; + + while (vnic_rq_desc_avail(rq) > 1) { + + err = (*buf_fill)(rq); + if (err) + return err; + } + + return 0; +} + +void vnic_rq_free(struct vnic_rq *rq); +int vnic_rq_alloc(struct vnic_dev *vdev, struct vnic_rq *rq, unsigned int index, + unsigned int desc_count, unsigned int desc_size); +void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset); +unsigned int vnic_rq_error_status(struct vnic_rq *rq); +void vnic_rq_enable(struct vnic_rq *rq); +int vnic_rq_disable(struct vnic_rq *rq); +void vnic_rq_clean(struct vnic_rq *rq, + void (*buf_clean)(struct vnic_rq *rq, struct vnic_rq_buf *buf)); + +#endif /* _VNIC_RQ_H_ */ diff --git a/drivers/scsi/fnic/vnic_scsi.h b/drivers/scsi/fnic/vnic_scsi.h new file mode 100644 index 00000000000..46baa525400 --- /dev/null +++ b/drivers/scsi/fnic/vnic_scsi.h @@ 
-0,0 +1,99 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _VNIC_SCSI_H_ +#define _VNIC_SCSI_H_ + +#define VNIC_FNIC_WQ_COPY_COUNT_MIN 1 +#define VNIC_FNIC_WQ_COPY_COUNT_MAX 1 + +#define VNIC_FNIC_WQ_DESCS_MIN 64 +#define VNIC_FNIC_WQ_DESCS_MAX 128 + +#define VNIC_FNIC_WQ_COPY_DESCS_MIN 64 +#define VNIC_FNIC_WQ_COPY_DESCS_MAX 512 + +#define VNIC_FNIC_RQ_DESCS_MIN 64 +#define VNIC_FNIC_RQ_DESCS_MAX 128 + +#define VNIC_FNIC_EDTOV_MIN 1000 +#define VNIC_FNIC_EDTOV_MAX 255000 +#define VNIC_FNIC_EDTOV_DEF 2000 + +#define VNIC_FNIC_RATOV_MIN 1000 +#define VNIC_FNIC_RATOV_MAX 255000 + +#define VNIC_FNIC_MAXDATAFIELDSIZE_MIN 256 +#define VNIC_FNIC_MAXDATAFIELDSIZE_MAX 2112 + +#define VNIC_FNIC_FLOGI_RETRIES_MIN 0 +#define VNIC_FNIC_FLOGI_RETRIES_MAX 0xffffffff +#define VNIC_FNIC_FLOGI_RETRIES_DEF 0xffffffff + +#define VNIC_FNIC_FLOGI_TIMEOUT_MIN 1000 +#define VNIC_FNIC_FLOGI_TIMEOUT_MAX 255000 + +#define VNIC_FNIC_PLOGI_RETRIES_MIN 0 +#define VNIC_FNIC_PLOGI_RETRIES_MAX 255 +#define VNIC_FNIC_PLOGI_RETRIES_DEF 8 + +#define VNIC_FNIC_PLOGI_TIMEOUT_MIN 1000 +#define VNIC_FNIC_PLOGI_TIMEOUT_MAX 255000 + +#define VNIC_FNIC_IO_THROTTLE_COUNT_MIN 256 +#define VNIC_FNIC_IO_THROTTLE_COUNT_MAX 
4096 + +#define VNIC_FNIC_LINK_DOWN_TIMEOUT_MIN 0 +#define VNIC_FNIC_LINK_DOWN_TIMEOUT_MAX 240000 + +#define VNIC_FNIC_PORT_DOWN_TIMEOUT_MIN 0 +#define VNIC_FNIC_PORT_DOWN_TIMEOUT_MAX 240000 + +#define VNIC_FNIC_PORT_DOWN_IO_RETRIES_MIN 0 +#define VNIC_FNIC_PORT_DOWN_IO_RETRIES_MAX 255 + +#define VNIC_FNIC_LUNS_PER_TARGET_MIN 1 +#define VNIC_FNIC_LUNS_PER_TARGET_MAX 1024 + +/* Device-specific region: scsi configuration */ +struct vnic_fc_config { + u64 node_wwn; + u64 port_wwn; + u32 flags; + u32 wq_enet_desc_count; + u32 wq_copy_desc_count; + u32 rq_desc_count; + u32 flogi_retries; + u32 flogi_timeout; + u32 plogi_retries; + u32 plogi_timeout; + u32 io_throttle_count; + u32 link_down_timeout; + u32 port_down_timeout; + u32 port_down_io_retries; + u32 luns_per_tgt; + u16 maxdatafieldsize; + u16 ed_tov; + u16 ra_tov; + u16 intr_timer; + u8 intr_timer_type; +}; + +#define VFCF_FCP_SEQ_LVL_ERR 0x1 /* Enable FCP-2 Error Recovery */ +#define VFCF_PERBI 0x2 /* persistent binding info available */ + +#endif /* _VNIC_SCSI_H_ */ diff --git a/drivers/scsi/fnic/vnic_stats.h b/drivers/scsi/fnic/vnic_stats.h new file mode 100644 index 00000000000..5372e23c1cb --- /dev/null +++ b/drivers/scsi/fnic/vnic_stats.h @@ -0,0 +1,68 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _VNIC_STATS_H_ +#define _VNIC_STATS_H_ + +/* Tx statistics */ +struct vnic_tx_stats { + u64 tx_frames_ok; + u64 tx_unicast_frames_ok; + u64 tx_multicast_frames_ok; + u64 tx_broadcast_frames_ok; + u64 tx_bytes_ok; + u64 tx_unicast_bytes_ok; + u64 tx_multicast_bytes_ok; + u64 tx_broadcast_bytes_ok; + u64 tx_drops; + u64 tx_errors; + u64 tx_tso; + u64 rsvd[16]; +}; + +/* Rx statistics */ +struct vnic_rx_stats { + u64 rx_frames_ok; + u64 rx_frames_total; + u64 rx_unicast_frames_ok; + u64 rx_multicast_frames_ok; + u64 rx_broadcast_frames_ok; + u64 rx_bytes_ok; + u64 rx_unicast_bytes_ok; + u64 rx_multicast_bytes_ok; + u64 rx_broadcast_bytes_ok; + u64 rx_drop; + u64 rx_no_bufs; + u64 rx_errors; + u64 rx_rss; + u64 rx_crc_errors; + u64 rx_frames_64; + u64 rx_frames_127; + u64 rx_frames_255; + u64 rx_frames_511; + u64 rx_frames_1023; + u64 rx_frames_1518; + u64 rx_frames_to_max; + u64 rsvd[16]; +}; + +struct vnic_stats { + struct vnic_tx_stats tx; + struct vnic_rx_stats rx; +}; + +#endif /* _VNIC_STATS_H_ */ diff --git a/drivers/scsi/fnic/vnic_wq.c b/drivers/scsi/fnic/vnic_wq.c new file mode 100644 index 00000000000..1f9ea790d13 --- /dev/null +++ b/drivers/scsi/fnic/vnic_wq.c @@ -0,0 +1,182 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "vnic_dev.h" +#include "vnic_wq.h" + +static int vnic_wq_alloc_bufs(struct vnic_wq *wq) +{ + struct vnic_wq_buf *buf; + struct vnic_dev *vdev; + unsigned int i, j, count = wq->ring.desc_count; + unsigned int blks = VNIC_WQ_BUF_BLKS_NEEDED(count); + + vdev = wq->vdev; + + for (i = 0; i < blks; i++) { + wq->bufs[i] = kzalloc(VNIC_WQ_BUF_BLK_SZ, GFP_ATOMIC); + if (!wq->bufs[i]) { + printk(KERN_ERR "Failed to alloc wq_bufs\n"); + return -ENOMEM; + } + } + + for (i = 0; i < blks; i++) { + buf = wq->bufs[i]; + for (j = 0; j < VNIC_WQ_BUF_BLK_ENTRIES; j++) { + buf->index = i * VNIC_WQ_BUF_BLK_ENTRIES + j; + buf->desc = (u8 *)wq->ring.descs + + wq->ring.desc_size * buf->index; + if (buf->index + 1 == count) { + buf->next = wq->bufs[0]; + break; + } else if (j + 1 == VNIC_WQ_BUF_BLK_ENTRIES) { + buf->next = wq->bufs[i + 1]; + } else { + buf->next = buf + 1; + buf++; + } + } + } + + wq->to_use = wq->to_clean = wq->bufs[0]; + + return 0; +} + +void vnic_wq_free(struct vnic_wq *wq) +{ + struct vnic_dev *vdev; + unsigned int i; + + vdev = wq->vdev; + + vnic_dev_free_desc_ring(vdev, &wq->ring); + + for (i = 0; i < VNIC_WQ_BUF_BLKS_MAX; i++) { + kfree(wq->bufs[i]); + wq->bufs[i] = NULL; + } + + wq->ctrl = NULL; + +} + +int vnic_wq_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, unsigned int index, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + wq->index = index; + wq->vdev = vdev; + + wq->ctrl = 
vnic_dev_get_res(vdev, RES_TYPE_WQ, index); + if (!wq->ctrl) { + printk(KERN_ERR "Failed to hook WQ[%d] resource\n", index); + return -EINVAL; + } + + vnic_wq_disable(wq); + + err = vnic_dev_alloc_desc_ring(vdev, &wq->ring, desc_count, desc_size); + if (err) + return err; + + err = vnic_wq_alloc_bufs(wq); + if (err) { + vnic_wq_free(wq); + return err; + } + + return 0; +} + +void vnic_wq_init(struct vnic_wq *wq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + u64 paddr; + + paddr = (u64)wq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &wq->ctrl->ring_base); + iowrite32(wq->ring.desc_count, &wq->ctrl->ring_size); + iowrite32(0, &wq->ctrl->fetch_index); + iowrite32(0, &wq->ctrl->posted_index); + iowrite32(cq_index, &wq->ctrl->cq_index); + iowrite32(error_interrupt_enable, &wq->ctrl->error_interrupt_enable); + iowrite32(error_interrupt_offset, &wq->ctrl->error_interrupt_offset); + iowrite32(0, &wq->ctrl->error_status); +} + +unsigned int vnic_wq_error_status(struct vnic_wq *wq) +{ + return ioread32(&wq->ctrl->error_status); +} + +void vnic_wq_enable(struct vnic_wq *wq) +{ + iowrite32(1, &wq->ctrl->enable); +} + +int vnic_wq_disable(struct vnic_wq *wq) +{ + unsigned int wait; + + iowrite32(0, &wq->ctrl->enable); + + /* Wait for HW to ACK disable request */ + for (wait = 0; wait < 100; wait++) { + if (!(ioread32(&wq->ctrl->running))) + return 0; + udelay(1); + } + + printk(KERN_ERR "Failed to disable WQ[%d]\n", wq->index); + + return -ETIMEDOUT; +} + +void vnic_wq_clean(struct vnic_wq *wq, + void (*buf_clean)(struct vnic_wq *wq, struct vnic_wq_buf *buf)) +{ + struct vnic_wq_buf *buf; + + BUG_ON(ioread32(&wq->ctrl->enable)); + + buf = wq->to_clean; + + while (vnic_wq_desc_used(wq) > 0) { + + (*buf_clean)(wq, buf); + + buf = wq->to_clean = buf->next; + wq->ring.desc_avail++; + } + + wq->to_use = wq->to_clean = wq->bufs[0]; + + iowrite32(0, &wq->ctrl->fetch_index); + iowrite32(0, &wq->ctrl->posted_index); + 
iowrite32(0, &wq->ctrl->error_status); + + vnic_dev_clear_desc_ring(&wq->ring); +} diff --git a/drivers/scsi/fnic/vnic_wq.h b/drivers/scsi/fnic/vnic_wq.h new file mode 100644 index 00000000000..5cd094f7928 --- /dev/null +++ b/drivers/scsi/fnic/vnic_wq.h @@ -0,0 +1,175 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _VNIC_WQ_H_ +#define _VNIC_WQ_H_ + +#include +#include "vnic_dev.h" +#include "vnic_cq.h" + +/* + * These defines avoid symbol clash between fnic and enic (Cisco 10G Eth + * Driver) when both are built with CONFIG options =y + */ +#define vnic_wq_desc_avail fnic_wq_desc_avail +#define vnic_wq_desc_used fnic_wq_desc_used +#define vnic_wq_next_desc fni_cwq_next_desc +#define vnic_wq_post fnic_wq_post +#define vnic_wq_service fnic_wq_service +#define vnic_wq_free fnic_wq_free +#define vnic_wq_alloc fnic_wq_alloc +#define vnic_wq_init fnic_wq_init +#define vnic_wq_error_status fnic_wq_error_status +#define vnic_wq_enable fnic_wq_enable +#define vnic_wq_disable fnic_wq_disable +#define vnic_wq_clean fnic_wq_clean + +/* Work queue control */ +struct vnic_wq_ctrl { + u64 ring_base; /* 0x00 */ + u32 ring_size; /* 0x08 */ + u32 pad0; + u32 posted_index; /* 0x10 */ + u32 pad1; + u32 cq_index; /* 0x18 */ + u32 pad2; + u32 enable; /* 0x20 */ + u32 pad3; + u32 running; /* 0x28 */ + u32 pad4; + u32 fetch_index; /* 0x30 */ + u32 pad5; + u32 dca_value; /* 0x38 */ + u32 pad6; + u32 error_interrupt_enable; /* 0x40 */ + u32 pad7; + u32 error_interrupt_offset; /* 0x48 */ + u32 pad8; + u32 error_status; /* 0x50 */ + u32 pad9; +}; + +struct vnic_wq_buf { + struct vnic_wq_buf *next; + dma_addr_t dma_addr; + void *os_buf; + unsigned int len; + unsigned int index; + int sop; + void *desc; +}; + +/* Break the vnic_wq_buf allocations into blocks of 64 entries */ +#define VNIC_WQ_BUF_BLK_ENTRIES 64 +#define VNIC_WQ_BUF_BLK_SZ \ + (VNIC_WQ_BUF_BLK_ENTRIES * sizeof(struct vnic_wq_buf)) +#define VNIC_WQ_BUF_BLKS_NEEDED(entries) \ + DIV_ROUND_UP(entries, VNIC_WQ_BUF_BLK_ENTRIES) +#define VNIC_WQ_BUF_BLKS_MAX VNIC_WQ_BUF_BLKS_NEEDED(4096) + +struct vnic_wq { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_wq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + struct vnic_wq_buf *bufs[VNIC_WQ_BUF_BLKS_MAX]; + struct vnic_wq_buf *to_use; + struct 
vnic_wq_buf *to_clean; + unsigned int pkts_outstanding; +}; + +static inline unsigned int vnic_wq_desc_avail(struct vnic_wq *wq) +{ + /* how many does SW own? */ + return wq->ring.desc_avail; +} + +static inline unsigned int vnic_wq_desc_used(struct vnic_wq *wq) +{ + /* how many does HW own? */ + return wq->ring.desc_count - wq->ring.desc_avail - 1; +} + +static inline void *vnic_wq_next_desc(struct vnic_wq *wq) +{ + return wq->to_use->desc; +} + +static inline void vnic_wq_post(struct vnic_wq *wq, + void *os_buf, dma_addr_t dma_addr, + unsigned int len, int sop, int eop) +{ + struct vnic_wq_buf *buf = wq->to_use; + + buf->sop = sop; + buf->os_buf = eop ? os_buf : NULL; + buf->dma_addr = dma_addr; + buf->len = len; + + buf = buf->next; + if (eop) { + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. + */ + wmb(); + iowrite32(buf->index, &wq->ctrl->posted_index); + } + wq->to_use = buf; + + wq->ring.desc_avail--; +} + +static inline void vnic_wq_service(struct vnic_wq *wq, + struct cq_desc *cq_desc, u16 completed_index, + void (*buf_service)(struct vnic_wq *wq, + struct cq_desc *cq_desc, struct vnic_wq_buf *buf, void *opaque), + void *opaque) +{ + struct vnic_wq_buf *buf; + + buf = wq->to_clean; + while (1) { + + (*buf_service)(wq, cq_desc, buf, opaque); + + wq->ring.desc_avail++; + + wq->to_clean = buf->next; + + if (buf->index == completed_index) + break; + + buf = wq->to_clean; + } +} + +void vnic_wq_free(struct vnic_wq *wq); +int vnic_wq_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, unsigned int index, + unsigned int desc_count, unsigned int desc_size); +void vnic_wq_init(struct vnic_wq *wq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset); +unsigned int vnic_wq_error_status(struct vnic_wq *wq); +void vnic_wq_enable(struct vnic_wq *wq); +int 
vnic_wq_disable(struct vnic_wq *wq); +void vnic_wq_clean(struct vnic_wq *wq, + void (*buf_clean)(struct vnic_wq *wq, struct vnic_wq_buf *buf)); + +#endif /* _VNIC_WQ_H_ */ diff --git a/drivers/scsi/fnic/vnic_wq_copy.c b/drivers/scsi/fnic/vnic_wq_copy.c new file mode 100644 index 00000000000..9eab7e7caf3 --- /dev/null +++ b/drivers/scsi/fnic/vnic_wq_copy.c @@ -0,0 +1,117 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "vnic_wq_copy.h" + +void vnic_wq_copy_enable(struct vnic_wq_copy *wq) +{ + iowrite32(1, &wq->ctrl->enable); +} + +int vnic_wq_copy_disable(struct vnic_wq_copy *wq) +{ + unsigned int wait; + + iowrite32(0, &wq->ctrl->enable); + + /* Wait for HW to ACK disable request */ + for (wait = 0; wait < 100; wait++) { + if (!(ioread32(&wq->ctrl->running))) + return 0; + udelay(1); + } + + printk(KERN_ERR "Failed to disable Copy WQ[%d]," + " fetch index=%d, posted_index=%d\n", + wq->index, ioread32(&wq->ctrl->fetch_index), + ioread32(&wq->ctrl->posted_index)); + + return -ENODEV; +} + +void vnic_wq_copy_clean(struct vnic_wq_copy *wq, + void (*q_clean)(struct vnic_wq_copy *wq, + struct fcpio_host_req *wq_desc)) +{ + BUG_ON(ioread32(&wq->ctrl->enable)); + + if (vnic_wq_copy_desc_in_use(wq)) + vnic_wq_copy_service(wq, -1, q_clean); + + wq->to_use_index = wq->to_clean_index = 0; + + iowrite32(0, &wq->ctrl->fetch_index); + iowrite32(0, &wq->ctrl->posted_index); + iowrite32(0, &wq->ctrl->error_status); + + vnic_dev_clear_desc_ring(&wq->ring); +} + +void vnic_wq_copy_free(struct vnic_wq_copy *wq) +{ + struct vnic_dev *vdev; + + vdev = wq->vdev; + vnic_dev_free_desc_ring(vdev, &wq->ring); + wq->ctrl = NULL; +} + +int vnic_wq_copy_alloc(struct vnic_dev *vdev, struct vnic_wq_copy *wq, + unsigned int index, unsigned int desc_count, + unsigned int desc_size) +{ + int err; + + wq->index = index; + wq->vdev = vdev; + wq->to_use_index = wq->to_clean_index = 0; + wq->ctrl = vnic_dev_get_res(vdev, RES_TYPE_WQ, index); + if (!wq->ctrl) { + printk(KERN_ERR "Failed to hook COPY WQ[%d] resource\n", index); + return -EINVAL; + } + + vnic_wq_copy_disable(wq); + + err = vnic_dev_alloc_desc_ring(vdev, &wq->ring, desc_count, desc_size); + if (err) + return err; + + return 0; +} + +void vnic_wq_copy_init(struct vnic_wq_copy *wq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + u64 paddr; + + 
paddr = (u64)wq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &wq->ctrl->ring_base); + iowrite32(wq->ring.desc_count, &wq->ctrl->ring_size); + iowrite32(0, &wq->ctrl->fetch_index); + iowrite32(0, &wq->ctrl->posted_index); + iowrite32(cq_index, &wq->ctrl->cq_index); + iowrite32(error_interrupt_enable, &wq->ctrl->error_interrupt_enable); + iowrite32(error_interrupt_offset, &wq->ctrl->error_interrupt_offset); +} + diff --git a/drivers/scsi/fnic/vnic_wq_copy.h b/drivers/scsi/fnic/vnic_wq_copy.h new file mode 100644 index 00000000000..6aff9740c3d --- /dev/null +++ b/drivers/scsi/fnic/vnic_wq_copy.h @@ -0,0 +1,128 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _VNIC_WQ_COPY_H_ +#define _VNIC_WQ_COPY_H_ + +#include +#include "vnic_wq.h" +#include "fcpio.h" + +#define VNIC_WQ_COPY_MAX 1 + +struct vnic_wq_copy { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_wq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + unsigned to_use_index; + unsigned to_clean_index; +}; + +static inline unsigned int vnic_wq_copy_desc_avail(struct vnic_wq_copy *wq) +{ + return wq->ring.desc_avail; +} + +static inline unsigned int vnic_wq_copy_desc_in_use(struct vnic_wq_copy *wq) +{ + return wq->ring.desc_count - 1 - wq->ring.desc_avail; +} + +static inline void *vnic_wq_copy_next_desc(struct vnic_wq_copy *wq) +{ + struct fcpio_host_req *desc = wq->ring.descs; + return &desc[wq->to_use_index]; +} + +static inline void vnic_wq_copy_post(struct vnic_wq_copy *wq) +{ + + ((wq->to_use_index + 1) == wq->ring.desc_count) ? + (wq->to_use_index = 0) : (wq->to_use_index++); + wq->ring.desc_avail--; + + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. 
+ */ + wmb(); + + iowrite32(wq->to_use_index, &wq->ctrl->posted_index); +} + +static inline void vnic_wq_copy_desc_process(struct vnic_wq_copy *wq, u16 index) +{ + unsigned int cnt; + + if (wq->to_clean_index <= index) + cnt = (index - wq->to_clean_index) + 1; + else + cnt = wq->ring.desc_count - wq->to_clean_index + index + 1; + + wq->to_clean_index = ((index + 1) % wq->ring.desc_count); + wq->ring.desc_avail += cnt; + +} + +static inline void vnic_wq_copy_service(struct vnic_wq_copy *wq, + u16 completed_index, + void (*q_service)(struct vnic_wq_copy *wq, + struct fcpio_host_req *wq_desc)) +{ + struct fcpio_host_req *wq_desc = wq->ring.descs; + unsigned int curr_index; + + while (1) { + + if (q_service) + (*q_service)(wq, &wq_desc[wq->to_clean_index]); + + wq->ring.desc_avail++; + + curr_index = wq->to_clean_index; + + /* increment the to-clean index so that we start + * with an unprocessed index next time we enter the loop + */ + ((wq->to_clean_index + 1) == wq->ring.desc_count) ? + (wq->to_clean_index = 0) : (wq->to_clean_index++); + + if (curr_index == completed_index) + break; + + /* we have cleaned all the entries */ + if ((completed_index == (u16)-1) && + (wq->to_clean_index == wq->to_use_index)) + break; + } +} + +void vnic_wq_copy_enable(struct vnic_wq_copy *wq); +int vnic_wq_copy_disable(struct vnic_wq_copy *wq); +void vnic_wq_copy_free(struct vnic_wq_copy *wq); +int vnic_wq_copy_alloc(struct vnic_dev *vdev, struct vnic_wq_copy *wq, + unsigned int index, unsigned int desc_count, unsigned int desc_size); +void vnic_wq_copy_init(struct vnic_wq_copy *wq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset); +void vnic_wq_copy_clean(struct vnic_wq_copy *wq, + void (*q_clean)(struct vnic_wq_copy *wq, + struct fcpio_host_req *wq_desc)); + +#endif /* _VNIC_WQ_COPY_H_ */ diff --git a/drivers/scsi/fnic/wq_enet_desc.h b/drivers/scsi/fnic/wq_enet_desc.h new file mode 100644 index 00000000000..b121cbad18b --- /dev/null 
+++ b/drivers/scsi/fnic/wq_enet_desc.h @@ -0,0 +1,96 @@ +/* + * Copyright 2008 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _WQ_ENET_DESC_H_ +#define _WQ_ENET_DESC_H_ + +/* Ethernet work queue descriptor: 16B */ +struct wq_enet_desc { + __le64 address; + __le16 length; + __le16 mss_loopback; + __le16 header_length_flags; + __le16 vlan_tag; +}; + +#define WQ_ENET_ADDR_BITS 64 +#define WQ_ENET_LEN_BITS 14 +#define WQ_ENET_LEN_MASK ((1 << WQ_ENET_LEN_BITS) - 1) +#define WQ_ENET_MSS_BITS 14 +#define WQ_ENET_MSS_MASK ((1 << WQ_ENET_MSS_BITS) - 1) +#define WQ_ENET_MSS_SHIFT 2 +#define WQ_ENET_LOOPBACK_SHIFT 1 +#define WQ_ENET_HDRLEN_BITS 10 +#define WQ_ENET_HDRLEN_MASK ((1 << WQ_ENET_HDRLEN_BITS) - 1) +#define WQ_ENET_FLAGS_OM_BITS 2 +#define WQ_ENET_FLAGS_OM_MASK ((1 << WQ_ENET_FLAGS_OM_BITS) - 1) +#define WQ_ENET_FLAGS_EOP_SHIFT 12 +#define WQ_ENET_FLAGS_CQ_ENTRY_SHIFT 13 +#define WQ_ENET_FLAGS_FCOE_ENCAP_SHIFT 14 +#define WQ_ENET_FLAGS_VLAN_TAG_INSERT_SHIFT 15 + +#define WQ_ENET_OFFLOAD_MODE_CSUM 0 +#define WQ_ENET_OFFLOAD_MODE_RESERVED 1 +#define WQ_ENET_OFFLOAD_MODE_CSUM_L4 2 +#define WQ_ENET_OFFLOAD_MODE_TSO 3 + +static inline void wq_enet_desc_enc(struct wq_enet_desc *desc, + u64 address, u16 length, 
u16 mss, u16 header_length, + u8 offload_mode, u8 eop, u8 cq_entry, u8 fcoe_encap, + u8 vlan_tag_insert, u16 vlan_tag, u8 loopback) +{ + desc->address = cpu_to_le64(address); + desc->length = cpu_to_le16(length & WQ_ENET_LEN_MASK); + desc->mss_loopback = cpu_to_le16((mss & WQ_ENET_MSS_MASK) << + WQ_ENET_MSS_SHIFT | (loopback & 1) << WQ_ENET_LOOPBACK_SHIFT); + desc->header_length_flags = cpu_to_le16( + (header_length & WQ_ENET_HDRLEN_MASK) | + (offload_mode & WQ_ENET_FLAGS_OM_MASK) << WQ_ENET_HDRLEN_BITS | + (eop & 1) << WQ_ENET_FLAGS_EOP_SHIFT | + (cq_entry & 1) << WQ_ENET_FLAGS_CQ_ENTRY_SHIFT | + (fcoe_encap & 1) << WQ_ENET_FLAGS_FCOE_ENCAP_SHIFT | + (vlan_tag_insert & 1) << WQ_ENET_FLAGS_VLAN_TAG_INSERT_SHIFT); + desc->vlan_tag = cpu_to_le16(vlan_tag); +} + +static inline void wq_enet_desc_dec(struct wq_enet_desc *desc, + u64 *address, u16 *length, u16 *mss, u16 *header_length, + u8 *offload_mode, u8 *eop, u8 *cq_entry, u8 *fcoe_encap, + u8 *vlan_tag_insert, u16 *vlan_tag, u8 *loopback) +{ + *address = le64_to_cpu(desc->address); + *length = le16_to_cpu(desc->length) & WQ_ENET_LEN_MASK; + *mss = (le16_to_cpu(desc->mss_loopback) >> WQ_ENET_MSS_SHIFT) & + WQ_ENET_MSS_MASK; + *loopback = (u8)((le16_to_cpu(desc->mss_loopback) >> + WQ_ENET_LOOPBACK_SHIFT) & 1); + *header_length = le16_to_cpu(desc->header_length_flags) & + WQ_ENET_HDRLEN_MASK; + *offload_mode = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_HDRLEN_BITS) & WQ_ENET_FLAGS_OM_MASK); + *eop = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_EOP_SHIFT) & 1); + *cq_entry = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_CQ_ENTRY_SHIFT) & 1); + *fcoe_encap = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_FCOE_ENCAP_SHIFT) & 1); + *vlan_tag_insert = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_VLAN_TAG_INSERT_SHIFT) & 1); + *vlan_tag = le16_to_cpu(desc->vlan_tag); +} + +#endif /* _WQ_ENET_DESC_H_ */ From 
c53a284f8be23735dc6b53929640a987055f2933 Mon Sep 17 00:00:00 2001 From: Edward Goggin Date: Thu, 9 Apr 2009 10:02:22 -0700 Subject: [PATCH 449/900] [SCSI] initialize max_target_blocked in scsi_alloc_target This patch initializes the max_target_blocked field of a scsi target structure so that a queuecommand return value of SCSI_MLQUEUE_TARGET_BUSY will actually result in scsi_queue_insert blocking the device queue before requeuing the command and running the queue. Otherwise, this can and does cause livelock on single-CPU configurations if/when the open-iSCSI software initiator's command PDU window fills. Signed-off-by: Ed Goggin Signed-off-by: James Bottomley --- drivers/scsi/scsi_scan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 6f51ca485f3..e2b50d8f57a 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -425,6 +425,7 @@ static struct scsi_target *scsi_alloc_target(struct device *parent, INIT_LIST_HEAD(&starget->devices); starget->state = STARGET_CREATED; starget->scsi_level = SCSI_2; + starget->max_target_blocked = SCSI_DEFAULT_TARGET_BLOCKED; retry: spin_lock_irqsave(shost->host_lock, flags); From 29a679754b1a2581ee456eada6c2de7ce95068bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:19:09 -0400 Subject: [PATCH 450/900] x86/stacktrace: return 0 instead of -1 for stack ops If we return -1 in the ops->stack for the stacktrace saving, we end up breaking out of the loop if the stack we are tracing is in the exception stack. 
This causes traces like: <idle>-0 [002] 34263.745825: raise_softirq_irqoff <-__blk_complete_request <idle>-0 [002] 34263.745826: <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 By returning "0" instead, the irq stack is saved as well, and we see: <idle>-0 [003] 883.280992: raise_softirq_irqoff <-__hrtimer_start_range_ns <idle>-0 [003] 883.280992: <= hrtimer_start_range_ns <= tick_nohz_restart_sched_tick <= cpu_idle <= start_secondary <= <= 0 <= 0 [ Impact: record stacks from interrupts ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2e37d..4aaf7e48394 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) static int save_stack_stack(void *data, char *name) { - return -1; + return 0; } static void save_stack_address(void *data, unsigned long addr, int reliable) From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:40:06 -0400 Subject: [PATCH 451/900] tracing: stop stack trace on first empty entry The stack tracer stores eight entries in the ring buffer when an event traces the stack. The output code prints all eight entries regardless of how many entries were recorded. This patch breaks out of the loop when a null entry is discovered. 
[ Impact: only print the stack that is recorded ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8bd9a2c1a46..489c0e8ada0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (!field->caller[i]) + break; if (i) { if (!trace_seq_puts(s, " <= ")) goto partial; From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:27 +0800 Subject: [PATCH 452/900] tracing/filters: add missing unlock in a failure path [ Impact: fix deadlock in a rare case we fail to allocate memory ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 85ad6a8939a..22c29984fe0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_disable_preds(call); replace_filter_string(call->filter, filter_string); @@ -1101,7 +1102,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; @@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); 
@@ -1145,7 +1147,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:56 +0800 Subject: [PATCH 453/900] tracing/filters: fix off-by-one bug We should leave the last slot for the ending '\0'. [ Impact: fix possible crash when the length of an operand is 128 ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 22c29984fe0..a7430b16d24 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps) static inline int append_operand_char(struct filter_parse_state *ps, char c) { - if (ps->operand.tail == MAX_FILTER_STR_VAL) + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) return -EINVAL; ps->operand.string[ps->operand.tail++] = c; From f1a11e0576c7a73d759d05d776692b2b2d37172b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2009 19:21:40 +0200 Subject: [PATCH 454/900] futex: remove the wait queue The waitqueue which is used in struct futex_q is a leftover from the futexfd implementation. There is no need to use a waitqueue at all, as the waiting task is the only user of it. The waitqueue just adds additional locking and a loop in the wake up path which both can be avoided. We have already a task reference in struct futex_q which is used for PI futexes. Use it for normal futexes as well and just wake up the task directly. The logic of signalling the futex wakeup via setting q->lock_ptr to NULL is kept with the difference that we set it NULL before doing the wakeup. This opens an exit race window vs. 
a non futex wake up of the to be woken up task, which we prevent with get_task_struct / put_task_struct on the waiter. [ Impact: simplification ] Signed-off-by: Thomas Gleixner --- kernel/futex.c | 60 ++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index aec8bf89bf4..157bfcd725b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -100,8 +100,8 @@ struct futex_pi_state { */ struct futex_q { struct plist_node list; - /* There can only be a single waiter */ - wait_queue_head_t waiter; + /* Waiter reference */ + struct task_struct *task; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -111,7 +111,6 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - struct task_struct *task; /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; @@ -694,22 +693,29 @@ retry: */ static void wake_futex(struct futex_q *q) { + struct task_struct *p = q->task; + + /* + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. + */ + get_task_struct(p); + plist_del(&q->list, &q->list.plist); /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. - */ - wake_up(&q->waiter); - /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store to - * lock_ptr from getting ahead of the wakeup. Clearing the lock at the - * end of wake_up() does not prevent this store from moving. + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. 
A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. */ smp_wmb(); q->lock_ptr = NULL; + + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -1003,7 +1009,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; - wake_up(&q->waiter); + wake_up_state(q->task, TASK_NORMAL); } /** @@ -1280,8 +1286,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiter); - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1575,11 +1579,9 @@ out: * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout - * @wait: the wait_queue to add to the futex_q after queueing in the hb */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout, - wait_queue_t *wait) + struct hrtimer_sleeper *timeout) { queue_me(q, hb); @@ -1587,19 +1589,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to + * queueing ourselves into the futex hash. This code thus has to * rely on the futex_wake() code removing us from hash when it * wakes us up. */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - - /* - * Add current as the futex_q waiter. We don't remove ourselves from - * the wait_queue because we are the only user of it. 
- */ - add_wait_queue(&q->waiter, wait); + set_current_state(TASK_INTERRUPTIBLE); /* Arm the timer */ if (timeout) { @@ -1704,7 +1698,6 @@ static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct hrtimer_sleeper timeout, *to = NULL; - DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q; @@ -1733,7 +1726,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to, &wait); + futex_wait_queue_me(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; @@ -2147,7 +2140,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; struct futex_hash_bucket *hb; union futex_key key2; @@ -2191,7 +2183,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to, &wait); + futex_wait_queue_me(hb, &q, to); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Apr 2009 10:43:41 +0200 Subject: [PATCH 455/900] sched, timers: move calc_load() to scheduler Dimitri Sivanich noticed that xtime_lock is held write locked across calc_load() which iterates over all online CPUs. That can cause long latencies for xtime_lock readers on large SMP systems. The load average calculation is a rough estimate anyway so there is no real need to protect the readers vs. the update. It's not a problem when the avenrun array is updated while a reader copies the values. 
Instead of iterating over all online CPUs let the scheduler_tick code update the number of active tasks shortly before the avenrun update happens. The avenrun update itself is handled by the CPU which calls do_timer(). [ Impact: reduce xtime_lock write locked section ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- include/linux/sched.h | 2 +- kernel/sched.c | 86 ++++++++++++++++++++++++++++++++++----- kernel/sched_idletask.c | 3 +- kernel/time/timekeeping.c | 2 +- kernel/timer.c | 54 +----------------------- 5 files changed, 81 insertions(+), 66 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..6eb4892efe4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); -extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern void calc_global_load(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched.c b/kernel/sched.c index 8908d190a34..f4eb88153bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static unsigned long 
+calc_load(unsigned long load, unsigned long exp, unsigned long active) { - unsigned long i, running = 0, uninterruptible = 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); } - - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; - - return running + uninterruptible; } /* @@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. 
+ */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -9059,6 +9118,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9166,6 +9227,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". 
*/ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..52a8bf8931f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..6a21d7af962 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1122,47 +1122,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. 
- */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - /* * This function runs timers and the timer-tq in bottom half context. */ @@ -1186,16 +1145,6 @@ void run_local_timers(void) softlockup_tick(); } -/* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: [PATCH 456/900] sched, timers: cleanup avenrun users avenrun is a rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Clean up the users. 
[ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- fs/proc/loadavg.c | 18 ++++++------------ include/linux/sched.h | 1 + kernel/sched.c | 15 +++++++++++++++ kernel/timer.c | 32 ++++++-------------------------- 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bca39cf99e..1afa4dd4cae 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -12,20 +12,14 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { - int a, b, c; - unsigned long seq; + unsigned long avnrun[3]; - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); + get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, task_active_pid_ns(current)->last_pid); return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb4892efe4..de7b3b21777 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -116,6 +116,7 @@ struct fs_struct; * 11 bit fractions. */ extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. 
- */ + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); From d9bcc01d58d18ed287091707b0b45c6ac888a11a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:06:12 +0530 Subject: [PATCH 457/900] x86, mtrr: replace MTRRcap_MSR with msr-index's MSR_MTRRcap Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0b776c09aff..de9c20ffa0d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -306,7 +306,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -703,7 +703,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c..8fc248b5aea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347..5d37fb14523 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,7 +5,6 @@ #include #include -#define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff #define MTRRfix64K_00000_MSR 0x250 From a036c7a358cc9d7ed28a188480b9a4d709e09b24 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:10:43 +0530 Subject: [PATCH 458/900] x86, mtrr: replace MTRRfix64K_00000_MSR with msr-index's MSR_MTRRfix64K_00000 Use standard msr-index.h's MSR declaration and no need to declare again. 
[ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index de9c20ffa0d..8b115c0e590 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,7 +20,7 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} @@ -194,7 +194,7 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5d37fb14523..7f23caede71 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 From 7d9d55e449089df8463bca2045d702ae6cda64a2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:15:32 +0530 Subject: [PATCH 459/900] x86, mtrr: replace MTRRfix16K_80000_MSR with msr-index's MSR_MTRRfix16K_80000 Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 8b115c0e590..00437c2e8dd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -21,7 +21,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} }; @@ -197,7 +197,7 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7f23caede71..712b60524e6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 #define MTRRfix4K_C8000_MSR 0x269 From 654ac05801ae806661c8fdeb3b5c420a31cbc5b1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:21:54 +0530 Subject: [PATCH 460/900] x86, mtrr: remove mtrr MSRs double declaration Removed MTRR MSR from mtrr/mtrr.h as these are already declared in msr-index.h and nobody is using them: MTRRfix16K_A0000_MSR MTRRfix4K_C8000_MSR MTRRfix4K_D0000_MSR MTRRfix4K_D8000_MSR MTRRfix4K_E0000_MSR MTRRfix4K_E8000_MSR MTRRfix4K_F0000_MSR MTRRfix4K_F8000_MSR Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput 
Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/mtrr.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 712b60524e6..5053793f912 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,15 +7,7 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 From ba5673ff1ff5f428256db4cedd4b05b7be008bb6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:29:25 +0530 Subject: [PATCH 461/900] x86, mtrr: replace MTRRfix4K_C0000_MSR with msr-index's MSR_MTRRfix4K_C0000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 00437c2e8dd..3cf58e26534 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -22,7 +22,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -199,7 +199,7 @@ get_fixed_ranges(mtrr_type * frs) for (i = 0; i < 2; i++) rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5053793f912..e5ee686d2c3 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,8 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix4K_C0000_MSR 0x268 - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 From 52650257ea06bb15c2e2bbe854cbdf463726141a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:35:46 +0530 Subject: [PATCH 462/900] x86, mtrr: replace MTRRdefType_MSR with msr-index's MSR_MTRRdefType Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++---- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- arch/x86/kernel/cpu/mtrr/state.c | 6 +++--- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04..1d584a18a50 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3cf58e26534..e930a311770 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -314,7 +314,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -579,10 +579,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -591,7 +591,7 @@ static void post_set(void) __releases(set_atomicity_lock) 
__flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index e5ee686d2c3..7538b767f20 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,8 +5,6 @@ #include #include -#define MTRRdefType_MSR 0x2ff - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685..1f5fb1588d1 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); From 9a1a69a1f41cbefebf3172761f197db6aba71e68 Mon Sep 17 00:00:00 2001 From: Andrew Vasquez Date: Wed, 29 Apr 2009 13:12:39 -0500 
Subject: [PATCH 463/900] [SCSI] fc-transport: Close state transition-window during rport deletion. Andrew Vasquez wrote: > fc-transport: Close state transition-window during rport deletion. > > After an rport's state has transitioned to FC_PORTSTATE_BLOCKED, > but, prior to making the upcall to 'block' the scsi-target > associated with an rport, queued commands can recycle and > ultimately run out of retries causing failures to propagate to > upper-level drivers. Close this transition-window by returning > the non-'retries' modifying DID_IMM_RETRY status for submitted > I/Os. The same can happen for iscsi when transitioning from logged in to failed and blocking the sdevs. This patch converts iscsi and fc's transitions back to use DID_IMM_RETRY instead of DID_TRANSPORT_DISRUPTED which has a limited number of retries that we do not want to use for handling this race. Signed-off-by: Andrew Vasquez [Addition of iscsi and fc port online devloss case conversion by Mike Christie] Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/scsi_transport_iscsi.c | 2 +- include/scsi/scsi_transport_fc.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 09479545529..0a2ce7b6325 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -357,7 +357,7 @@ int iscsi_session_chkready(struct iscsi_cls_session *session) err = 0; break; case ISCSI_SESSION_FAILED: - err = DID_TRANSPORT_DISRUPTED << 16; + err = DID_IMM_RETRY << 16; break; case ISCSI_SESSION_FREE: err = DID_TRANSPORT_FAILFAST << 16; diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index c9184f756ca..68a8d873bbd 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -680,7 +680,7 @@ fc_remote_port_chkready(struct fc_rport *rport) if (rport->roles & FC_PORT_ROLE_FCP_TARGET) result = 0; else if (rport->flags 
& FC_RPORT_DEVLOSS_PENDING) - result = DID_TRANSPORT_DISRUPTED << 16; + result = DID_IMM_RETRY << 16; else result = DID_NO_CONNECT << 16; break; @@ -688,7 +688,7 @@ fc_remote_port_chkready(struct fc_rport *rport) if (rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT) result = DID_TRANSPORT_FAILFAST << 16; else - result = DID_TRANSPORT_DISRUPTED << 16; + result = DID_IMM_RETRY << 16; break; default: result = DID_NO_CONNECT << 16; From 5e43754fd949193252ecb470d7fb08b547a1e310 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Apr 2009 19:13:41 -0700 Subject: [PATCH 464/900] [SCSI] ses: fix problems caused by empty SES provided name We use the name provided by SES to name objects. An empty name is legal in SES but causes problems in our generic device hierarchy. Fix this by falling back to a number if the name is either NULL or empty. Also fix a secondary bug spotted in that dev_set_name(dev, name) uses a string format and so would go wrong if name contained a '%'. Signed-off-by: Yinghai Lu Signed-off-by: James Bottomley --- drivers/misc/enclosure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 3cf61ece71d..348443bdb23 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -119,7 +119,7 @@ enclosure_register(struct device *dev, const char *name, int components, edev->edev.class = &enclosure_class; edev->edev.parent = get_device(dev); edev->cb = cb; - dev_set_name(&edev->edev, name); + dev_set_name(&edev->edev, "%s", name); err = device_register(&edev->edev); if (err) goto err; @@ -255,8 +255,8 @@ enclosure_component_register(struct enclosure_device *edev, ecomp->number = number; cdev = &ecomp->cdev; cdev->parent = get_device(&edev->edev); - if (name) - dev_set_name(cdev, name); + if (name && name[0]) + dev_set_name(cdev, "%s", name); else dev_set_name(cdev, "%u", number); From 8454e9888cb0316dd296fd5d47c612248ed5e1d1 Mon Sep 17 00:00:00 2001 From: adam radford Date: Tue, 
5 May 2009 11:45:37 -0700 Subject: [PATCH 465/900] [SCSI] 3w-9xxx: scsi_dma_unmap fix This patch fixes the following regression that occurred during the scsi_dma_map()/unmap() changes: 3w-9xxx 0001:45:00.0: DMA-API: device driver tries to free DMA memory it has not allocated [device address=0x0000000000000000] [size=36 bytes] Signed-off-by: Adam Radford Signed-off-by: James Bottomley --- drivers/scsi/3w-9xxx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 8b7983aba8f..36c21b19e5d 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -1978,7 +1978,8 @@ static void twa_unmap_scsi_data(TW_Device_Extension *tw_dev, int request_id) { struct scsi_cmnd *cmd = tw_dev->srb[request_id]; - scsi_dma_unmap(cmd); + if (cmd->SCp.phase == TW_PHASE_SGLIST) + scsi_dma_unmap(cmd); } /* End twa_unmap_scsi_data() */ /* scsi_host_template initializer */ From 7b14f58ad65f9d74e4273fb45360cfea824495aa Mon Sep 17 00:00:00 2001 From: adam radford Date: Mon, 11 May 2009 14:55:55 -0700 Subject: [PATCH 466/900] [SCSI] 3w-xxxx: scsi_dma_unmap fix This patch fixes the following regression that occurred during the scsi_dma_map()/unmap() changes when compiling with CONFIG_DMA_API_DEBUG=y : WARNING: at lib/dma-debug.c:496 check_unmap+0x142/0x542() Hardware name: 3w-xxxx 0000:02:02.0: DMA-API: device driver tries to free DMA memory it has not allocated [device address=0x0000000000000000] [size=36 bytes] Signed-off-by: Adam Radford Signed-off-by: James Bottomley --- drivers/scsi/3w-xxxx.c | 5 +++-- drivers/scsi/3w-xxxx.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index c03f1d2c9e2..faa0fcfed71 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -6,7 +6,7 @@ Arnaldo Carvalho de Melo Brad Strand - Copyright (C) 1999-2007 3ware Inc. + Copyright (C) 1999-2009 3ware Inc.
Kernel compatiblity By: Andre Hedrick Non-Copyright (C) 2000 Andre Hedrick @@ -1294,7 +1294,8 @@ static void tw_unmap_scsi_data(struct pci_dev *pdev, struct scsi_cmnd *cmd) { dprintk(KERN_WARNING "3w-xxxx: tw_unmap_scsi_data()\n"); - scsi_dma_unmap(cmd); + if (cmd->SCp.phase == TW_PHASE_SGLIST) + scsi_dma_unmap(cmd); } /* End tw_unmap_scsi_data() */ /* This function will reset a device extension */ diff --git a/drivers/scsi/3w-xxxx.h b/drivers/scsi/3w-xxxx.h index 8e71e5e122b..a5a2ba2561d 100644 --- a/drivers/scsi/3w-xxxx.h +++ b/drivers/scsi/3w-xxxx.h @@ -6,7 +6,7 @@ Arnaldo Carvalho de Melo Brad Strand - Copyright (C) 1999-2007 3ware Inc. + Copyright (C) 1999-2009 3ware Inc. Kernel compatiblity By: Andre Hedrick Non-Copyright (C) 2000 Andre Hedrick From e5198075c67a22ec9a09565b1ce88d3d3f5ba855 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: [PATCH 467/900] x86, apic: introduce io_apic_irq_attr according to Ingo, io_apic irq-setup related functions have too many parameters with a repetitive signature. So reduce related funcs to get less params by passing a pointer to a newly defined io_apic_irq_attr structure. 
v2: io_apic_irq ==> irq_attr triggering ==> trigger v3: add set_io_apic_irq_attr [ Impact: cleanup ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A08ACD3.2070401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 21 +++++++++++++-- arch/x86/include/asm/io_apic.h | 5 ++-- arch/x86/kernel/acpi/boot.c | 22 +++++++++------- arch/x86/kernel/apic/io_apic.c | 43 ++++++++++++++++++------------- arch/x86/pci/irq.c | 16 ++++-------- drivers/pci/hotplug/ibmphp_core.c | 6 ++--- 6 files changed, 66 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 26a40ab7013..a7d14bbae11 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -63,9 +63,26 @@ extern unsigned long io_apic_irqs; extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); + +struct io_apic_irq_attr { + int ioapic; + int ioapic_pin; + int trigger; + int polarity; +}; + +static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, + int ioapic, int ioapic_pin, + int trigger, int polarity) +{ + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; +} + extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, - int *ioapic, int *ioapic_pin, - int *trigger, int *polarity); + struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 6fd99f96eb0..daf866ed061 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -156,8 +156,9 @@ extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); #endif /* CONFIG_ACPI */ -extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, - int irq, int edge_level, int 
active_high_low); +struct io_apic_irq_attr; +extern int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dcfbc3ab9e4..4af63dfb0f0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -523,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -533,14 +533,14 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (triggering == ACPI_LEVEL_SENSITIVE) + if (trigger == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } #endif #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1156,7 +1156,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -1181,7 +1181,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, /* print the entry should happen on mptable identically */ mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | (polarity == ACPI_ACTIVE_HIGH ? 
1 : 3); mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); @@ -1194,10 +1194,11 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, return 0; } -int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { int ioapic; int ioapic_pin; + struct io_apic_irq_attr irq_attr; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1225,11 +1226,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) ioapic_pin); return gsi; } - mp_config_acpi_gsi(dev, gsi, triggering, polarity); + mp_config_acpi_gsi(dev, gsi, trigger, polarity); - io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + io_apic_set_pci_routing(dev, gsi, &irq_attr); return gsi; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74d2b480a20..ce1ac74baa7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1096,8 +1096,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Not an __init, possibly needed by modules */ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, - int *ioapic, int *ioapic_pin, - int *trigger, int *polarity) + struct io_apic_irq_attr *irq_attr) { int apic, i, best_guess = -1; @@ -1127,10 +1126,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, continue; if (pin == (mp_irqs[i].srcbusirq & 3)) { - *ioapic = apic; - *ioapic_pin = mp_irqs[i].dstirq; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); return irq; } /* @@ -1138,10 +1137,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, 
int pin, * best-guess fuzzy result for broken mptables. */ if (best_guess < 0) { - *ioapic = apic; - *ioapic_pin = mp_irqs[i].dstirq; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); best_guess = irq; } } @@ -3865,13 +3864,16 @@ int __init arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { struct irq_desc *desc; struct irq_cfg *cfg; int node; + int ioapic, pin; + int trigger, polarity; + ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); @@ -3889,6 +3891,10 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in return 0; } + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + /* * IRQs < 16 are already in the irq_2_pin[] map */ @@ -3897,20 +3903,22 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in add_pin_to_irq_node(cfg, node, ioapic, pin); } - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); return 0; } -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { - + int ioapic, pin; /* * Avoid pin reprogramming. PRTs typically include entries * with redundant pin->gsi mappings (but unique PCI devices); * we only program the IOAPIC on the first. 
*/ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapics[ioapic].apicid, pin); @@ -3918,8 +3926,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, } set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - return __io_apic_set_pci_routing(dev, ioapic, pin, irq, - triggering, polarity); + return __io_apic_set_pci_routing(dev, irq, irq_attr); } /* -------------------------------------------------------------------------- diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 2f3e192615c..0696d506c4a 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1200,14 +1200,11 @@ static int pirq_enable_irq(struct pci_dev *dev) #ifdef CONFIG_X86_IO_APIC struct pci_dev *temp_dev; int irq; - int ioapic = -1, ioapic_pin = -1; - int triggering, polarity; + struct io_apic_irq_attr irq_attr; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), - pin - 1, - &ioapic, &ioapic_pin, - &triggering, &polarity); + pin - 1, &irq_attr); /* * Busses behind bridges are typically not listed in the MP-table. 
* In this case we have to look up the IRQ based on the parent bus, @@ -1221,9 +1218,7 @@ static int pirq_enable_irq(struct pci_dev *dev) pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), - pin - 1, - &ioapic, &ioapic_pin, - &triggering, &polarity); + pin - 1, &irq_attr); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", @@ -1233,9 +1228,8 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - io_apic_set_pci_routing(&dev->dev, ioapic, - ioapic_pin, irq, - triggering, polarity); + io_apic_set_pci_routing(&dev->dev, irq, + &irq_attr); dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c index 79901a0db88..42e4260c3b1 100644 --- a/drivers/pci/hotplug/ibmphp_core.c +++ b/drivers/pci/hotplug/ibmphp_core.c @@ -155,15 +155,13 @@ int ibmphp_init_devno(struct slot **cur_slot) for (loop = 0; loop < len; loop++) { if ((*cur_slot)->number == rtable->slots[loop].slot && (*cur_slot)->bus == rtable->slots[loop].bus) { - int ioapic = -1, ioapic_pin = -1; - int triggering, polarity; + struct io_apic_irq_attr irq_attr; (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn); for (i = 0; i < 4; i++) (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus, (int) (*cur_slot)->device, i, - &ioapic, &ioapic_pin, - &triggering, &polarity); + &irq_attr); debug("(*cur_slot)->irq[0] = %x\n", (*cur_slot)->irq[0]); From 2759c3287de27266e06f1f4e82cbd2d65f6a044c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: [PATCH 468/900] x86: don't call read_apic_id if !cpu_has_apic should not call that if apic is disabled. 
[ Impact: fix crash on certain UP configs ] Signed-off-by: Yinghai Lu Cc: Cyrill Gorcunov LKML-Reference: <4A09CCBB.2000306@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 6 ++++++ arch/x86/kernel/cpu/intel.c | 6 +++--- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 744e6d8af27..d0c99abc26c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return initial_apic_id >> index_msb; } struct apic apic_flat = { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..728b3750a3e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -272,7 +272,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = hard_smp_processor_id(); + unsigned apicid = cpu_has_apic ? 
hard_smp_processor_id() : c->apicid; node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1caefc82e6..017c600e05a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -761,6 +761,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c0..daed39ba261 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); + int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ @@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } /* Work around errata */ - srat_detect_node(); + srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); From 888a589f6be07d624e21e2174d98375e9f95911b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: [PATCH 469/900] mm, x86: remove MEMORY_HOTPLUG_RESERVE related code after: | commit b263295dbffd33b0fbff670720fa178c30e3392a | Author: Christoph Lameter | Date: Wed Jan 30 13:30:47 2008 +0100 | | x86: 64-bit, make sparsemem vmemmap the only memory model we don't have MEMORY_HOTPLUG_RESERVE anymore. 
Historically, x86-64 had an architecture-specific method for memory hotplug whereby it scanned the SRAT for physical memory ranges that could be potentially used for memory hot-add later. By reserving those ranges without physical memory, the memmap would be allocated and left dormant until needed. This depended on the DISCONTIG memory model which has been removed so the code implementing HOTPLUG_RESERVE is now dead. This patch removes the dead code used by MEMORY_HOTPLUG_RESERVE. (Changelog authored by Mel.) v2: updated changelog, and remove hotadd= in doc [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Reviewed-by: Christoph Lameter Reviewed-by: Mel Gorman Workflow-found-OK-by: Andrew Morton LKML-Reference: <4A0C4910.7090508@kernel.org> Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 5 -- arch/x86/include/asm/numa_64.h | 3 - arch/x86/mm/numa_64.c | 5 -- arch/x86/mm/srat_64.c | 63 ++++----------------- include/linux/mm.h | 2 - mm/page_alloc.c | 69 ----------------------- 6 files changed, 12 insertions(+), 135 deletions(-) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a71..2db5893d6c9 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -150,11 +150,6 @@ NUMA Otherwise, the remaining system RAM is allocated to an additional node. - numa=hotadd=percent - Only allow hotadd memory to preallocate page structures upto - percent of already available memory. - numa=hotadd=0 will disable hotadd memory. 
- ACPI acpi=off Don't enable ACPI diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 064ed6df4cb..7feff0648d7 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, extern void numa_init_array(void); extern int numa_off; -extern void srat_reserve_add_area(int nodeid); -extern int hotadd_percent; - extern s16 apicid_to_node[MAX_LOCAL_APIC]; extern unsigned long numa_free_all_bootmem(void); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index fb61d81a656..a6a93c39523 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -272,9 +272,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<> PAGE_SHIFT; unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; + int changed = 0; struct bootnode *nd = &nodes_add[node]; /* I had some trouble with strange memory hotadd regions breaking @@ -210,7 +201,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) mistakes */ if ((signed long)(end - start) < NODE_MIN_SIZE) { printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; + return; } /* This check might be a bit too strict, but I'm keeping it for now. */ @@ -218,12 +209,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug area %lu -> %lu has existing memory\n", s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; + return; } /* Looks good */ @@ -245,11 +231,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. 
Partly ignored\n"); } - ret = update_end_of_memory(nd->end); - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", + nd->start, nd->end); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -310,13 +294,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start, end); e820_register_active_regions(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + update_nodes_add(node, start, end); + /* restore nodes[node] */ *nd = oldnode; if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); @@ -510,26 +491,6 @@ static int null_slit_node_compare(int a, int b) } #endif /* CONFIG_NUMA_EMU */ -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start, - BOOTMEM_DEFAULT); - } -} - int __node_distance(int a, int b) { int index; diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..511b0986709 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1031,8 +1031,6 @@ extern void add_active_range(unsigned int nid, 
unsigned long start_pfn, unsigned long end_pfn); extern void remove_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn); extern void remove_all_active_ranges(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ecf2aa..474c7e9dd51 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve; static int __meminitdata nr_nodemap_entries; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; - static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; @@ -3102,64 +3098,6 @@ void __init sparse_memory_present_with_active_regions(int nid) early_node_map[i].end_pfn); } -/** - * push_node_boundaries - Push node boundaries to at least the requested boundary - * @nid: The nid of the node to push the boundary for - * @start_pfn: The start pfn of the node - * @end_pfn: The end pfn of the node - * - * In reserve-based hot-add, mem_map is allocated that is unused until hotadd - * time. Specifically, on x86_64, SRAT will report ranges that can potentially - * be hotplugged even though no physical memory exists. This function allows - * an arch to push out the node boundaries so mem_map is allocated that can - * be used later. 
- */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering push_node_boundaries(%u, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Initialise the boundary for this node if necessary */ - if (node_boundary_end_pfn[nid] == 0) - node_boundary_start_pfn[nid] = -1UL; - - /* Update the boundaries */ - if (node_boundary_start_pfn[nid] > start_pfn) - node_boundary_start_pfn[nid] = start_pfn; - if (node_boundary_end_pfn[nid] < end_pfn) - node_boundary_end_pfn[nid] = end_pfn; -} - -/* If necessary, push the node boundary out for reserve hotadd */ -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering account_node_boundary(%u, %lu, %lu)\n", - nid, *start_pfn, *end_pfn); - - /* Return if boundary information has not been provided */ - if (node_boundary_end_pfn[nid] == 0) - return; - - /* Check the boundaries and update if necessary */ - if (node_boundary_start_pfn[nid] < *start_pfn) - *start_pfn = node_boundary_start_pfn[nid]; - if (node_boundary_end_pfn[nid] > *end_pfn) - *end_pfn = node_boundary_end_pfn[nid]; -} -#else -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) {} - -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) {} -#endif - - /** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 
@@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, if (*start_pfn == -1UL) *start_pfn = 0; - - /* Push the node boundaries out if requested */ - account_node_boundary(nid, start_pfn, end_pfn); } /* @@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); - memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ } /* Compare two active node_active_regions */ From 7c43769a9776141ec23ca81a1bdd5a9c0512f165 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: [PATCH 470/900] x86, mm: Fix node_possible_map logic Recently there were some changes to the meaning of node_possible_map, and it is quite strange: - the node without memory would be set in node_possible_map - but some node with less NODE_MIN_SIZE will be kicked out of node_possible_map. fix it by adding strict_setup_node_bootmem(). Also, remove unparse_node(). so result will be: 1. cpu_to_node() will return online node only (nearest one) 2. apicid_to_node() still returns the node that could be not online but is set in node_possible_map. 3. node_possible_map will include nodes that mem on it are less NODE_MIN_SIZE v2: after move_cpus_to_node change. 
[ Impact: get node_possible_map right ] Signed-off-by: Yinghai Lu Tested-by: Jack Steiner LKML-Reference: <4A0C49BE.6080800@kernel.org> [ v3: various small cleanups and comment clarifications ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa_64.h | 7 +++++++ arch/x86/mm/numa_64.c | 13 ++++++++++--- arch/x86/mm/srat_64.c | 29 ++--------------------------- 3 files changed, 19 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 7feff0648d7..c4ae822e415 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -24,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); #ifdef CONFIG_NUMA +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + extern void __init init_cpu_to_node(void); extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a6a93c39523..459913beac7 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start, } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end) +void __init +setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; + const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); unsigned long bootmap_start, nodedata_phys; void *bootmap; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; if (!end) return; + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) 
+ return; + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index b0dbbd48e58..2dfcbf9df2a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -36,10 +36,6 @@ static int num_node_memblks __initdata; static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. */ -#define NODE_MIN_SIZE (4*1024*1024) - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -338,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void __init unparse_node(int node) -{ - int i; - node_clear(node, nodes_parsed); - node_clear(node, cpu_nodes_parsed); - for (i = 0; i < MAX_LOCAL_APIC; i++) { - if (apicid_to_node[i] == node) - apicid_to_node[i] = NUMA_NO_NODE; - } -} - void __init acpi_numa_arch_fixup(void) {} /* Use the information discovered above to actually set up the nodes. */ @@ -360,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - /* - * don't confuse VM with a node that doesn't have the - * minimum memory. 
- */ - if (nodes[i].end && - (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { - unparse_node(i); - node_set_offline(i); - } - } if (!nodes_cover_memory(nodes)) { bad_srat(); @@ -404,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; - if (!node_isset(node, node_possible_map)) + if (!node_online(node)) numa_clear_node(i); } numa_init_array(); From 35d5a9a61490bf39d2e48d7f499c8c801a39ebe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: [PATCH 471/900] x86: fix system without memory on node0 Jack found a boot crash on a system which doesn't have memory on node0. It turns out with recent per_cpu changes, node_number for BSP will always be 0, and it is not consistent to cpu_to_node() that might set it to a different (nearer) node already. aka when numa_set_node() for node0 is called early before per_cpu area is setup: two places touched that per_cpu(node_number,): 1. in cpu/common.c::cpu_init() and it is not for BP | #ifdef CONFIG_NUMA | if (cpu != 0 && percpu_read(node_number) == 0 && | cpu_to_node(cpu) != NUMA_NO_NODE) | percpu_write(node_number, cpu_to_node(cpu)); | #endif for BP: traps_init ==> cpu_init for AP: start_secondary ==> cpu_init 2. cpu/intel.c or amd.c::srat_detect_node via numa_set_node() for BP: check_bugs ==> identify_boot_cpu ==> identify_cpu() that is rather later before numa_node_id() is used for BP... 
for AP: start_secondary => smp_callin => smp_store_cpu_info() => => identify_secondary_cpu => identify_cpu() so try to set that for BP earlier in setup_per_cpu_areas(), and don't bother to set that for APs there (it will be updated later and will be used later) (and don't mess the 0 before the copying BP per_cpu data to APs) [ Impact: fix boot crash on memoryless node-0 ] Reported-and-tested-by: Jack Steiner Cc: Tejun Heo Signed-off-by: Yinghai Lu LKML-Reference: <4A0C4A02.7050401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4cf187..3b5f3271e73 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -423,6 +423,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); From 629e15d245f46bef9d26199b450f882f9437a8fe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: [PATCH 472/900] x86, irq: update_mptable needs pci_routeirq To get all device irq routing and to save them. This is basically an implicit pci=routeirq enablement if (and only if) the update_mptable boot option (which is off by default) has been specified.
[ Impact: extend the update_mptable boot option's scope ] Signed-off-by: Yinghai Lu Cc: Jesse Barnes LKML-Reference: <4A0DB7B4.4060702@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index cd2a41a7c45..e6bf9d08e50 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static int __initdata enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -973,6 +977,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; From f1bdb523880c7f6990e9e8e50b0fc972ca475e84 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: [PATCH 473/900] x86, irq: don't call mp_config_acpi_gsi() if update_mptable is not enabled Len expressed concern that the update_mptable feature has side-effects on the ACPI code. Explicitly make sure that the code only ever gets called if the (default disabled) update_mptable boot quirk option is enabled.
[ Impact: isolate the update_mptable feature from ACPI code more ] Signed-off-by: Yinghai Lu Cc: Len Brown LKML-Reference: <4A0DC832.5090200@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec.h | 9 +++++++++ arch/x86/kernel/acpi/boot.c | 4 +++- arch/x86/kernel/mpparse.c | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c34961a45ec..3dcbaaaa363 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -87,6 +87,15 @@ static inline int acpi_probe_gsi(void) } #endif /* CONFIG_ACPI */ +#ifdef CONFIG_X86_MPPARSE +extern int enable_update_mptable; +#else +static inline int enable_update_mptable(void) +{ + return 0; +} +#endif + #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) struct physid_mask { diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4af63dfb0f0..844e5e25213 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1226,7 +1226,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) ioapic_pin); return gsi; } - mp_config_acpi_gsi(dev, gsi, trigger, polarity); + + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e6bf9d08e50..651c93b2886 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -957,7 +957,7 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 17 May 2009 15:31:38 +0800 Subject: [PATCH 474/900] tracing: fix check for return value of register_module_notifier return zero should be correct, so fix it. 
[ Impact: eliminate incorrect syslog message ] Signed-off-by: Ming Lei Acked-by: Frederic Weisbecker Acked-by: Li Zefan Cc: rostedt@goodmis.org LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b606f45b6c..140699a9a8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2826,7 +2826,7 @@ void __init ftrace_init(void) __stop_mcount_loc); ret = register_module_notifier(&ftrace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); return; From 818bc814447a35350ae90a329133e474bf1a2bd7 Mon Sep 17 00:00:00 2001 From: Daniel Ribeiro Date: Sat, 2 May 2009 15:05:59 -0300 Subject: [PATCH 475/900] [ARM] pxa: save/restore PGSR on suspend/resume. Signed-off-by: Daniel Ribeiro Signed-off-by: Eric Miao --- arch/arm/mach-pxa/mfp-pxa2xx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c index 7ffb91d64c3..6ae50604170 100644 --- a/arch/arm/mach-pxa/mfp-pxa2xx.c +++ b/arch/arm/mach-pxa/mfp-pxa2xx.c @@ -322,6 +322,7 @@ static inline void pxa27x_mfp_init(void) {} #ifdef CONFIG_PM static unsigned long saved_gafr[2][4]; static unsigned long saved_gpdr[4]; +static unsigned long saved_pgsr[4]; static int pxa2xx_mfp_suspend(struct sys_device *d, pm_message_t state) { @@ -332,6 +333,7 @@ static int pxa2xx_mfp_suspend(struct sys_device *d, pm_message_t state) saved_gafr[0][i] = GAFR_L(i); saved_gafr[1][i] = GAFR_U(i); saved_gpdr[i] = GPDR(i * 32); + saved_pgsr[i] = PGSR(i); GPDR(i * 32) = gpdr_lpm[i]; } @@ -346,6 +348,7 @@ static int pxa2xx_mfp_resume(struct sys_device *d) GAFR_L(i) = saved_gafr[0][i]; GAFR_U(i) = saved_gafr[1][i]; GPDR(i * 32) = saved_gpdr[i]; + PGSR(i) = saved_pgsr[i]; } PSSR = PSSR_RDH | PSSR_PH; return 0; From 216e3b7abbd05c35d2d1a3f86629ade485351f0d Mon 
Sep 17 00:00:00 2001 From: Daniel Ribeiro Date: Tue, 5 May 2009 22:43:18 -0300 Subject: [PATCH 476/900] [ARM] pxa: allow gpio_reset drive high during normal work I want to reuse tosa/spitz gpio_reset code, but my board needs the reset gpio to be driven high during normal operation. Signed-off-by: Daniel Ribeiro Acked-by: Dmitry Eremin-Solenikov Signed-off-by: Eric Miao --- arch/arm/mach-pxa/include/mach/reset.h | 5 +++-- arch/arm/mach-pxa/reset.c | 4 ++-- arch/arm/mach-pxa/spitz.c | 2 +- arch/arm/mach-pxa/tosa.c | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-pxa/include/mach/reset.h b/arch/arm/mach-pxa/include/mach/reset.h index 31e6a7b6ad8..b6c10556fbc 100644 --- a/arch/arm/mach-pxa/include/mach/reset.h +++ b/arch/arm/mach-pxa/include/mach/reset.h @@ -13,8 +13,9 @@ extern void clear_reset_status(unsigned int mask); /** * init_gpio_reset() - register GPIO as reset generator * @gpio: gpio nr - * @output: set gpio as out/low instead of input during normal work + * @output: set gpio as output instead of input during normal work + * @level: output level */ -extern int init_gpio_reset(int gpio, int output); +extern int init_gpio_reset(int gpio, int output, int level); #endif /* __ASM_ARCH_RESET_H */ diff --git a/arch/arm/mach-pxa/reset.c b/arch/arm/mach-pxa/reset.c index df29d45fb4e..01e9d643394 100644 --- a/arch/arm/mach-pxa/reset.c +++ b/arch/arm/mach-pxa/reset.c @@ -20,7 +20,7 @@ static void do_hw_reset(void); static int reset_gpio = -1; -int init_gpio_reset(int gpio, int output) +int init_gpio_reset(int gpio, int output, int level) { int rc; @@ -31,7 +31,7 @@ int init_gpio_reset(int gpio, int output) } if (output) - rc = gpio_direction_output(gpio, 0); + rc = gpio_direction_output(gpio, level); else rc = gpio_direction_input(gpio); if (rc) { diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index c18e34acafc..cdacea09abf 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -731,7 +731,7 @@ 
static void spitz_restart(char mode, const char *cmd) static void __init common_init(void) { - init_gpio_reset(SPITZ_GPIO_ON_RESET, 1); + init_gpio_reset(SPITZ_GPIO_ON_RESET, 1, 0); pm_power_off = spitz_poweroff; arm_pm_restart = spitz_restart; diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index afac5b6d3d7..a0bd46ef5d3 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -897,7 +897,7 @@ static void __init tosa_init(void) gpio_set_wake(MFP_PIN_GPIO1, 1); /* We can't pass to gpio-keys since it will drop the Reset altfunc */ - init_gpio_reset(TOSA_GPIO_ON_RESET, 0); + init_gpio_reset(TOSA_GPIO_ON_RESET, 0, 0); pm_power_off = tosa_poweroff; arm_pm_restart = tosa_restart; From 866bd435819df8d97767c407f8828a7a2ff971e6 Mon Sep 17 00:00:00 2001 From: Timothy Clacy Date: Thu, 7 May 2009 19:40:33 +0200 Subject: [PATCH 477/900] [ARM] pxa: enable GPIO receivers after configuring pins 'mach-pxa' platforms currently rely on a bootloader to setup GPIO pins and clear RDH (to enable inputs). A kernel loaded by a 'minimal' bootloader, that doesn't touch any pins, will not function correctly; inputs will remain disabled, even after the pins are configured. The following change fixes the issue and has been verified on Gumstix Verdex XL6P and a custom PXA270 platform. 
Signed-off-by: Timothy Clacy Signed-off-by: Eric Miao --- arch/arm/mach-pxa/mfp-pxa2xx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c index 6ae50604170..cf6b720c055 100644 --- a/arch/arm/mach-pxa/mfp-pxa2xx.c +++ b/arch/arm/mach-pxa/mfp-pxa2xx.c @@ -377,6 +377,9 @@ static int __init pxa2xx_mfp_init(void) if (cpu_is_pxa27x()) pxa27x_mfp_init(); + /* clear RDH bit to enable GPIO receivers after reset/sleep exit */ + PSSR = PSSR_RDH; + /* initialize gafr_run[], pgsr_lpm[] from existing values */ for (i = 0; i <= gpio_to_bank(pxa_last_gpio); i++) gpdr_lpm[i] = GPDR(i * 32); From a81b38688f50f51123490d45d51f4a10e8dc1184 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Fri, 15 May 2009 10:11:22 +0400 Subject: [PATCH 478/900] [ARM] pxa/spitz: provide spitz_ohci_exit() that unregisters USB_HOST GPIO Currently spitz_ohci_init() that requests GPIO doesn't have corresponding spitz_ohci_exit() which will gpio_free(). This causes minor problems e.g. during resume when the OHCI device can't be resumed. 
Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Eric Miao --- arch/arm/mach-pxa/spitz.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index cdacea09abf..5a45fe340a1 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -531,9 +531,15 @@ static int spitz_ohci_init(struct device *dev) return gpio_direction_output(SPITZ_GPIO_USB_HOST, 1); } +static void spitz_ohci_exit(struct device *dev) +{ + gpio_free(SPITZ_GPIO_USB_HOST); +} + static struct pxaohci_platform_data spitz_ohci_platform_data = { .port_mode = PMM_NPS_MODE, .init = spitz_ohci_init, + .exit = spitz_ohci_exit, .flags = ENABLE_PORT_ALL | NO_OC_PROTECTION, .power_budget = 150, }; From 2b69a8a2b6e5f5d26a038a6494a88a1a776ac88f Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 18 May 2009 12:57:24 -0600 Subject: [PATCH 479/900] [SCSI] mpt2sas: fix driver version inconsistency In Commit commit 3b8b5c9b1f08660583e5dfe095c24170df62f1d2 Author: Eric Moore Date: Tue Apr 21 15:44:27 2009 -0600 [SCSI] mpt2sas : bump driver version to 01.100.02.00 The MPT2SAS_MAJOR_VERSION didn't get bumped from 00 to 01 so applications will see it incorrectly as 00.100.02.00 driver instead of 01.100.02.00. 
Fix by making MPT2SAS_MAJOR_VERSION match the major number in MPT2SAS_DRIVER_VERSION Signed-off-by: Eric Moore Signed-off-by: James Bottomley --- drivers/scsi/mpt2sas/mpt2sas_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.h b/drivers/scsi/mpt2sas/mpt2sas_base.h index babd4cc0cb2..36b1d1052ba 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_base.h +++ b/drivers/scsi/mpt2sas/mpt2sas_base.h @@ -69,7 +69,7 @@ #define MPT2SAS_AUTHOR "LSI Corporation " #define MPT2SAS_DESCRIPTION "LSI MPT Fusion SAS 2.0 Device Driver" #define MPT2SAS_DRIVER_VERSION "01.100.02.00" -#define MPT2SAS_MAJOR_VERSION 00 +#define MPT2SAS_MAJOR_VERSION 01 #define MPT2SAS_MINOR_VERSION 100 #define MPT2SAS_BUILD_VERSION 02 #define MPT2SAS_RELEASE_VERSION 00 From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 09:22:19 +0200 Subject: [PATCH 480/900] sched: properly define the sched_group::cpumask and sched_domain::span fields Properly document the variable-size structure tricks we are doing wrt. struct sched_group and sched_domain, and use the field[0] GCC extension instead of defining a vla array. Dont use unions for this, as pointed out by Linus. [ Impact: cleanup, un-confuse Sparse and LLVM ] Reported-by: Jeff Garzik Acked-by: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 25 ++++++++++++++++++++++--- kernel/sched.c | 5 +++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index de7b3b21777..dbb1043e865 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -839,7 +839,17 @@ struct sched_group { */ u32 reciprocal_cpu_power; - unsigned long cpumask[]; + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. 
(Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_group' in kernel/sched.c) + */ + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_cpus(struct sched_group *sg) @@ -925,8 +935,17 @@ struct sched_domain { char *name; #endif - /* span of all CPUs in this domain */ - unsigned long span[]; + /* + * Span of all CPUs in this domain. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_domain' in kernel/sched.c) + */ + unsigned long span[0]; }; static inline struct cpumask *sched_domain_span(struct sched_domain *sd) diff --git a/kernel/sched.c b/kernel/sched.c index 497c09ba61e..228acae8821 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; From 143c145e3a475065a4be661468d0df1bd0b25f74 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 19 May 2009 14:43:15 +0800 Subject: [PATCH 481/900] tracing/events: Documentation updates - fix some typos - document the difference between '>' and '>>' - document the 'enable' toggle - remove section "Defining an event-enabled tracepoint", since it's out-dated and sample/trace_events/ already serves this purpose. 
v2: add "Updated by Li Zefan" [ Impact: make documentation up-to-date ] Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: "Theodore Ts'o" LKML-Reference: <4A125503.5060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/trace/events.txt | 129 +++++++++++---------------------- 1 file changed, 42 insertions(+), 87 deletions(-) diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index abdee664c0f..f157d7594ea 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -1,9 +1,10 @@ Event Tracing Documentation written by Theodore Ts'o + Updated by Li Zefan -Introduction -============ +1. Introduction +=============== Tracepoints (see Documentation/trace/tracepoints.txt) can be used without creating custom kernel modules to register probe functions @@ -12,30 +13,37 @@ using the event tracing infrastructure. Not all tracepoints can be traced using the event tracing system; the kernel developer must provide code snippets which define how the tracing information is saved into the tracing buffer, and how the -the tracing information should be printed. +tracing information should be printed. -Using Event Tracing -=================== +2. Using Event Tracing +====================== + +2.1 Via the 'set_event' interface +--------------------------------- The events which are available for tracing can be found in the file -/sys/kernel/debug/tracing/available_events. +/debug/tracing/available_events. To enable a particular event, such as 'sched_wakeup', simply echo it -to /sys/debug/tracing/set_event. For example: +to /debug/tracing/set_event. For example: - # echo sched_wakeup > /sys/kernel/debug/tracing/set_event + # echo sched_wakeup >> /debug/tracing/set_event -[ Note: events can also be enabled/disabled via the 'enabled' toggle - found in the /sys/kernel/tracing/events/ hierarchy of directories. ] +[ Note: '>>' is necessary, otherwise it will firstly disable + all the events. 
] To disable an event, echo the event name to the set_event file prefixed with an exclamation point: - # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event + # echo '!sched_wakeup' >> /debug/tracing/set_event -To disable events, echo an empty line to the set_event file: +To disable all events, echo an empty line to the set_event file: - # echo > /sys/kernel/debug/tracing/set_event + # echo > /debug/tracing/set_event + +To enable all events, echo '*:*' or '*:' to the set_event file: + + # echo *:* > /debug/tracing/set_event The events are organized into subsystems, such as ext4, irq, sched, etc., and a full event name looks like this: :. The @@ -44,92 +52,39 @@ file. All of the events in a subsystem can be specified via the syntax ":*"; for example, to enable all irq events, you can use the command: - # echo 'irq:*' > /sys/kernel/debug/tracing/set_event + # echo 'irq:*' > /debug/tracing/set_event -Defining an event-enabled tracepoint ------------------------------------- +2.2 Via the 'enable' toggle +--------------------------- -A kernel developer which wishes to define an event-enabled tracepoint -must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE. -This is done via two header files in include/trace. For example, to -event-enable the jbd2 subsystem, we must create two files, -include/trace/jbd2.h and include/trace/jbd2_event_types.h. The -include/trace/jbd2.h file should be included by kernel source files that -will have a tracepoint inserted, and might look like this: +The events available are also listed in /debug/tracing/events/ hierarchy +of directories. -#ifndef _TRACE_JBD2_H -#define _TRACE_JBD2_H +To enable event 'sched_wakeup': -#include -#include + # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable -#include +To disable it: -#endif + # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable -In a file that utilizes a jbd2 tracepoint, this header file would be -included. Note that you still have to use DEFINE_TRACE(). 
So for -example, if fs/jbd2/commit.c planned to use the jbd2_start_commit -tracepoint, it would have the following near the beginning of the file: +To enable all events in sched subsystem: -#include + # echo 1 > /debug/tracing/events/sched/enable -DEFINE_TRACE(jbd2_start_commit); +To eanble all events: -Then in the function that would call the tracepoint, it would call the -tracepoint function. (For more information, please see the tracepoint -documentation in Documentation/trace/tracepoints.txt): + # echo 1 > /debug/tracing/events/enable - trace_jbd2_start_commit(journal, commit_transaction); +When reading one of these enable files, there are four results: -The code snippets which allow jbd2_start_commit to be an event-enabled -tracepoint are placed in the file include/trace/jbd2_event_types.h: + 0 - all events this file affects are disabled + 1 - all events this file affects are enabled + X - there is a mixture of events enabled and disabled + ? - this file does not affect any event -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif +3. Defining an event-enabled tracepoint +======================================= -#undef TRACE_SYSTEM -#define TRACE_SYSTEM jbd2 +See The example provided in samples/trace_events -#include - -TRACE_EVENT(jbd2_start_commit, - TP_PROTO(journal_t *journal, transaction_t *commit_transaction), - TP_ARGS(journal, commit_transaction), - TP_STRUCT__entry( - __array( char, devname, BDEVNAME_SIZE+24 ) - __field( int, transaction ) - ), - TP_fast_assign( - memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24); - __entry->transaction = commit_transaction->t_tid; - ), - TP_printk("dev %s transaction %d", - __entry->devname, __entry->transaction) -); - -The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE. The new -arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and -TP_printk. 
- -TP_STRUCT__entry defines the data structure which will be stored in the -trace buffer. Normally, fields in __entry will be arrays or simple -types. It is possible to place data structures in __entry --- however, -pointers in the data structure can not be trusted, since they will be -accessed sometime later by TP_printk, and if the data structure contains -fields that will not or cannot be used by TP_printk, this will waste -space in the trace buffer. In general, data structures should be -avoided, unless they do only contain non-pointer types and all of the -fields will be used by TP_printk. - -TP_fast_assign defines the code snippet which saves information into the -__entry data structure, using the passed-in arguments defined in -TP_PROTO and TP_ARGS. - -Finally, TP_printk will print the __entry data structure. At the time -when the code snippet defined by TP_printk is executed, it will not have -access to the TP_ARGS arguments; it can only use the information saved -in the __entry data structure. From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Tue, 19 May 2009 09:59:08 +0200 Subject: [PATCH 482/900] blktrace: remove debugfs entries on bad path debugfs directory entries for devices are not removed on some of the failure paths in do_blk_trace_setup(). One way to reproduce is to start blktrace on multiple devices with insufficient Vmalloc space: Devices will fail with a message like this: BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error If so, the respective entries in debugfs (e.g. /sys/kernel/debug/block/sdu) will remain and subsequent attempts to start blktrace on the respective devices will not succeed due to existing directories.
[ Impact: fix /debug/tracing file cleanup corner case ] Signed-off-by: Stefan Raspl Acked-by: Li Zefan Cc: Li Zefan Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 05b4747fd87..e3abf55bc8e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); + debugfs_remove(bt->dir); relay_close(bt->rchan); free_percpu(bt->sequence); free_percpu(bt->msg_data); From 4aee2ad461889132bfb5a1518a9580d00e17008c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 19 May 2009 17:07:01 +0530 Subject: [PATCH 483/900] x86: asm/processor.h: remove double declaration Remove double declaration of: extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; they are already defined in the same file. 
[ Impact: cleanup ] Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1242733021.3377.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 34c52370f2f..85628ea9d9b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -403,9 +403,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; struct thread_struct { /* Cached TLS descriptors: */ From 4c6f18fc81565967da20f2d4a3922cdba33f8e2b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 May 2009 10:23:28 -0700 Subject: [PATCH 484/900] x86, io-apic: Don't mark pin_programmed early Peter bisected that: | commit b9c61b70075c87a8612624736faf4a2de5b1ed30 | Date: Wed May 6 10:10:06 2009 -0700 | | x86/pci: update pirq_enable_irq() to setup io apic routing | | So we can set io apic routing only when enabling the device irq. wrecked his opteron box, ata1 interrupts fail to get through. ata1 is using irq 11: [ 1.451839] sata_svw 0000:01:0e.0: version 2.3 [ 1.456333] sata_svw 0000:01:0e.0: PCI INT A -> GSI 11 (level, low) -> IRQ 11 [ 1.463639] scsi0 : sata_svw [ 1.466949] scsi1 : sata_svw [ 1.470022] scsi2 : sata_svw [ 1.473090] scsi3 : sata_svw [ 1.476112] ata1: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe000 irq 11 [ 1.483490] ata2: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe100 irq 11 [ 1.490870] ata3: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe200 irq 11 [ 1.498247] ata4: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe300 irq 11 that pin is overlapped with pin with legacy ones. 
We should not set bits in pin_programmed here, so that those bit could be set later via io_apic_set_pci_routing(). [ Impact: fix boot hang on certain systems ] Reported-by: Peter Zijlstra Signed-off-by: Yinghai Lu Tested-by: Peter Zijlstra Cc: Jack Steiner LKML-Reference: <4A119990.9020606@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce1ac74baa7..ac7f3b6ad58 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1537,7 +1537,10 @@ static void __init setup_IO_APIC_irqs(void) } cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, apic_id, pin); - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } From 8e7d2b2c6ecd3c21a54b877eae3d5be48292e6b5 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Fri, 8 May 2009 16:13:25 -0700 Subject: [PATCH 485/900] drm/i915: allocate large pointer arrays with vmalloc For awhile now, many of the GEM code paths have allocated page or object arrays with the slab allocator. This is nice and fast, but won't work well if memory is fragmented, since the slab allocator works with physically contiguous memory (i.e. order > 2 allocations are likely to fail fairly early after booting and doing some work). This patch works around the issue by falling back to vmalloc for >PAGE_SIZE allocations. This is ugly, but much less work than chaining a bunch of pages together by hand (suprisingly there's not a bunch of generic kernel helpers for this yet afaik). vmalloc space is somewhat precious on 32 bit kernels, but our allocations shouldn't be big enough to cause problems, though they're routinely more than a page. 
Note that this patch doesn't address the unchecked alloc-based-on-ioctl-args in GEM; that needs to be fixed in a separate patch. Also, I've deliberately ignored the DRM's "area" junk. I don't think anyone actually uses it anymore and I'm hoping it gets ripped out soon. [Updated: removed size arg to new free function. We could unify the free functions as well once the DRM mem tracking is ripped out.] fd.o bug #20152 (part 1/3) Signed-off-by: Jesse Barnes Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/i915_gem.c | 38 +++++++++++++-------------------- include/drm/drmP.h | 24 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index b189b49c760..4a24c90fb94 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -349,7 +349,7 @@ i915_gem_shmem_pread_slow(struct drm_device *dev, struct drm_gem_object *obj, last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE; num_pages = last_data_page - first_data_page + 1; - user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); + user_pages = drm_calloc_large(num_pages, sizeof(struct page *)); if (user_pages == NULL) return -ENOMEM; @@ -429,7 +429,7 @@ fail_put_user_pages: SetPageDirty(user_pages[i]); page_cache_release(user_pages[i]); } - kfree(user_pages); + drm_free_large(user_pages); return ret; } @@ -649,7 +649,7 @@ i915_gem_gtt_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj, last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE; num_pages = last_data_page - first_data_page + 1; - user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); + user_pages = drm_calloc_large(num_pages, sizeof(struct page *)); if (user_pages == NULL) return -ENOMEM; @@ -719,7 +719,7 @@ out_unlock: out_unpin_pages: for (i = 0; i < pinned_pages; i++) page_cache_release(user_pages[i]); - kfree(user_pages); + drm_free_large(user_pages); return ret; } @@ -824,7 +824,7 @@ 
i915_gem_shmem_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj, last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE; num_pages = last_data_page - first_data_page + 1; - user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); + user_pages = drm_calloc_large(num_pages, sizeof(struct page *)); if (user_pages == NULL) return -ENOMEM; @@ -902,7 +902,7 @@ fail_unlock: fail_put_user_pages: for (i = 0; i < pinned_pages; i++) page_cache_release(user_pages[i]); - kfree(user_pages); + drm_free_large(user_pages); return ret; } @@ -1408,9 +1408,7 @@ i915_gem_object_put_pages(struct drm_gem_object *obj) } obj_priv->dirty = 0; - drm_free(obj_priv->pages, - page_count * sizeof(struct page *), - DRM_MEM_DRIVER); + drm_free_large(obj_priv->pages); obj_priv->pages = NULL; } @@ -2024,8 +2022,7 @@ i915_gem_object_get_pages(struct drm_gem_object *obj) */ page_count = obj->size / PAGE_SIZE; BUG_ON(obj_priv->pages != NULL); - obj_priv->pages = drm_calloc(page_count, sizeof(struct page *), - DRM_MEM_DRIVER); + obj_priv->pages = drm_calloc_large(page_count, sizeof(struct page *)); if (obj_priv->pages == NULL) { DRM_ERROR("Faled to allocate page list\n"); obj_priv->pages_refcount--; @@ -3111,7 +3108,7 @@ i915_gem_get_relocs_from_user(struct drm_i915_gem_exec_object *exec_list, reloc_count += exec_list[i].relocation_count; } - *relocs = drm_calloc(reloc_count, sizeof(**relocs), DRM_MEM_DRIVER); + *relocs = drm_calloc_large(reloc_count, sizeof(**relocs)); if (*relocs == NULL) return -ENOMEM; @@ -3125,8 +3122,7 @@ i915_gem_get_relocs_from_user(struct drm_i915_gem_exec_object *exec_list, exec_list[i].relocation_count * sizeof(**relocs)); if (ret != 0) { - drm_free(*relocs, reloc_count * sizeof(**relocs), - DRM_MEM_DRIVER); + drm_free_large(*relocs); *relocs = NULL; return -EFAULT; } @@ -3165,7 +3161,7 @@ i915_gem_put_relocs_to_user(struct drm_i915_gem_exec_object *exec_list, } err: - drm_free(relocs, reloc_count * sizeof(*relocs), DRM_MEM_DRIVER); + 
drm_free_large(relocs); return ret; } @@ -3198,10 +3194,8 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, return -EINVAL; } /* Copy in the exec list from userland */ - exec_list = drm_calloc(sizeof(*exec_list), args->buffer_count, - DRM_MEM_DRIVER); - object_list = drm_calloc(sizeof(*object_list), args->buffer_count, - DRM_MEM_DRIVER); + exec_list = drm_calloc_large(sizeof(*exec_list), args->buffer_count); + object_list = drm_calloc_large(sizeof(*object_list), args->buffer_count); if (exec_list == NULL || object_list == NULL) { DRM_ERROR("Failed to allocate exec or object list " "for %d buffers\n", @@ -3462,10 +3456,8 @@ err: } pre_mutex_err: - drm_free(object_list, sizeof(*object_list) * args->buffer_count, - DRM_MEM_DRIVER); - drm_free(exec_list, sizeof(*exec_list) * args->buffer_count, - DRM_MEM_DRIVER); + drm_free_large(object_list); + drm_free_large(exec_list); drm_free(cliprects, sizeof(*cliprects) * args->num_cliprects, DRM_MEM_DRIVER); diff --git a/include/drm/drmP.h b/include/drm/drmP.h index c8c42215143..b84d8ae35e6 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -1519,6 +1519,30 @@ static __inline__ void *drm_calloc(size_t nmemb, size_t size, int area) { return kcalloc(nmemb, size, GFP_KERNEL); } + +static __inline__ void *drm_calloc_large(size_t nmemb, size_t size) +{ + u8 *addr; + + if (size <= PAGE_SIZE) + return kcalloc(nmemb, size, GFP_KERNEL); + + addr = vmalloc(nmemb * size); + if (!addr) + return NULL; + + memset(addr, 0, nmemb * size); + + return addr; +} + +static __inline void drm_free_large(void *ptr) +{ + if (!is_vmalloc_addr(ptr)) + return kfree(ptr); + + vfree(ptr); +} #else extern void *drm_alloc(size_t size, int area); extern void drm_free(void *pt, size_t size, int area); From b3bad72e494fb2ff0c81be4ca2ddb94adf6a47c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Sun, 17 May 2009 20:17:06 +0200 Subject: [PATCH 486/900] PCI PM: Fix initialization and kexec breakage for some devices Recent PCI PM changes introduced a bug that causes some devices to be mishandled after kexec and during early initialization. The failure scenario in the kexec case is the following: * Assume a PCI device is not power-manageable by the platform and has PCI_PM_CTRL_NO_SOFT_RESET set in PMCSR. * The device is put into D3 before kexec (using the native PCI PM). * After kexec, pci_setup_device() sets the device's power state to PCI_UNKNOWN. * pci_set_power_state(dev, PCI_D0) is called by the device's driver. * __pci_start_power_transition(dev, PCI_D0) is called and since the device is not power-manageable by the platform, it causes pci_update_current_state(dev, PCI_D0) to be called. As a result the device's current_state field is updated to PCI_D3, in accordance with the contents of its PCI PM registers. * pci_raw_set_power_state() is called and it changes the device power state to D0. *However*, it should also call pci_restore_bars() to reinitialize the device, but it doesn't, because the device's current_state field has been modified earlier. To prevent this from happening, modify pci_platform_power_transition() so that it doesn't use pci_update_current_state() to update the current_state field for devices that aren't power-manageable by the platform. Instead, this field should be updated directly for devices that don't support the native PCI PM. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 34bf0fdf504..1a91bf9687a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -557,7 +557,8 @@ static int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state) } else { error = -ENODEV; /* Fall back to PCI_D0 if native PM is not supported */ - pci_update_current_state(dev, PCI_D0); + if (!dev->pm_cap) + dev->current_state = PCI_D0; } return error; From ff71338ed31398384b2e5992623d52f9aaba1da1 Mon Sep 17 00:00:00 2001 From: Daniel Ribeiro Date: Fri, 15 May 2009 06:33:50 -0300 Subject: [PATCH 487/900] [ARM] pxa/ezx: fix pin configuration for low power mode Fix LPM configuration on ezx.c Signed-off-by: Daniel Ribeiro Signed-off-by: Eric Miao --- arch/arm/mach-pxa/ezx.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm/mach-pxa/ezx.c b/arch/arm/mach-pxa/ezx.c index 92ba16e1b6f..7db966dc29c 100644 --- a/arch/arm/mach-pxa/ezx.c +++ b/arch/arm/mach-pxa/ezx.c @@ -111,9 +111,9 @@ static unsigned long ezx_pin_config[] __initdata = { GPIO25_SSP1_TXD, GPIO26_SSP1_RXD, GPIO24_GPIO, /* pcap chip select */ - GPIO1_GPIO, /* pcap interrupt */ - GPIO4_GPIO, /* WDI_AP */ - GPIO55_GPIO, /* SYS_RESTART */ + GPIO1_GPIO | WAKEUP_ON_EDGE_RISE, /* pcap interrupt */ + GPIO4_GPIO | MFP_LPM_DRIVE_HIGH, /* WDI_AP */ + GPIO55_GPIO | MFP_LPM_DRIVE_HIGH, /* SYS_RESTART */ /* MMC */ GPIO32_MMC_CLK, @@ -144,20 +144,20 @@ static unsigned long ezx_pin_config[] __initdata = { #if defined(CONFIG_MACH_EZX_A780) || defined(CONFIG_MACH_EZX_E680) static unsigned long gen1_pin_config[] __initdata = { /* flip / lockswitch */ - GPIO12_GPIO, + GPIO12_GPIO | WAKEUP_ON_EDGE_BOTH, /* bluetooth (bcm2035) */ - GPIO14_GPIO | WAKEUP_ON_LEVEL_HIGH, /* HOSTWAKE */ + GPIO14_GPIO | WAKEUP_ON_EDGE_RISE, /* HOSTWAKE */ GPIO48_GPIO, /* RESET */ GPIO28_GPIO, /* WAKEUP */ /* 
Neptune handshake */ - GPIO0_GPIO | WAKEUP_ON_LEVEL_HIGH, /* BP_RDY */ - GPIO57_GPIO, /* AP_RDY */ - GPIO13_GPIO | WAKEUP_ON_LEVEL_HIGH, /* WDI */ - GPIO3_GPIO | WAKEUP_ON_LEVEL_HIGH, /* WDI2 */ - GPIO82_GPIO, /* RESET */ - GPIO99_GPIO, /* TC_MM_EN */ + GPIO0_GPIO | WAKEUP_ON_EDGE_FALL, /* BP_RDY */ + GPIO57_GPIO | MFP_LPM_DRIVE_HIGH, /* AP_RDY */ + GPIO13_GPIO | WAKEUP_ON_EDGE_BOTH, /* WDI */ + GPIO3_GPIO | WAKEUP_ON_EDGE_BOTH, /* WDI2 */ + GPIO82_GPIO | MFP_LPM_DRIVE_HIGH, /* RESET */ + GPIO99_GPIO | MFP_LPM_DRIVE_HIGH, /* TC_MM_EN */ /* sound */ GPIO52_SSP3_SCLK, @@ -199,21 +199,21 @@ static unsigned long gen1_pin_config[] __initdata = { defined(CONFIG_MACH_EZX_E2) || defined(CONFIG_MACH_EZX_E6) static unsigned long gen2_pin_config[] __initdata = { /* flip / lockswitch */ - GPIO15_GPIO, + GPIO15_GPIO | WAKEUP_ON_EDGE_BOTH, /* EOC */ - GPIO10_GPIO, + GPIO10_GPIO | WAKEUP_ON_EDGE_RISE, /* bluetooth (bcm2045) */ - GPIO13_GPIO | WAKEUP_ON_LEVEL_HIGH, /* HOSTWAKE */ + GPIO13_GPIO | WAKEUP_ON_EDGE_RISE, /* HOSTWAKE */ GPIO37_GPIO, /* RESET */ GPIO57_GPIO, /* WAKEUP */ /* Neptune handshake */ - GPIO0_GPIO | WAKEUP_ON_LEVEL_HIGH, /* BP_RDY */ - GPIO96_GPIO, /* AP_RDY */ - GPIO3_GPIO | WAKEUP_ON_LEVEL_HIGH, /* WDI */ - GPIO116_GPIO, /* RESET */ + GPIO0_GPIO | WAKEUP_ON_EDGE_FALL, /* BP_RDY */ + GPIO96_GPIO | MFP_LPM_DRIVE_HIGH, /* AP_RDY */ + GPIO3_GPIO | WAKEUP_ON_EDGE_FALL, /* WDI */ + GPIO116_GPIO | MFP_LPM_DRIVE_HIGH, /* RESET */ GPIO41_GPIO, /* BP_FLASH */ /* sound */ From c8b15a706d921baed3195407e4f55270112bb3c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2009 09:18:50 +0200 Subject: [PATCH 488/900] futex: cleanup error exit Reuse the put_key_ref(key2) call in the exit path. 
Signed-off-by: Thomas Gleixner --- kernel/futex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 476603afd14..381125a9f1e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2185,10 +2185,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); - if (ret) { - put_futex_key(fshared, &key2); - goto out; - } + if (ret) + goto out_key2; /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); @@ -2282,6 +2280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, out_put_keys: put_futex_key(fshared, &q.key); +out_key2: put_futex_key(fshared, &key2); out: From 1c840c14906d4ddf66c1f4f5daea059aad951c82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2009 09:22:40 +0200 Subject: [PATCH 489/900] futex: fix restart for early wakeup in futex_wait_requeue_pi() The futex_wait_requeue_pi op should restart unconditionally like futex_lock_pi. The user of that function e.g. pthread_cond_wait can not be interrupted so we do not care about the SA_RESTART flag of the signal. Clean up the FIXMEs. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 381125a9f1e..2aa216e5b59 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2060,7 +2060,7 @@ pi_faulted: * * Returns * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) + * <0 - -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2087,15 +2087,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, if (timeout && !timeout->task) ret = -ETIMEDOUT; - else { - /* - * We expect signal_pending(current), but another - * thread may have handled it for us already. 
- */ - /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if - * the user specified SA_RESTART or not? */ - ret = -ERESTARTSYS; - } + else + ret = -ERESTARTNOINTR; } return ret; } From 2070887fdeacd9c13f3e805e3f0086c9f22a4d93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2009 23:04:59 +0200 Subject: [PATCH 490/900] futex: fix restart in wait_requeue_pi If the waiter has been requeued to the outer PI futex and is interrupted by a signal and the thread handles the signal then ERESTART_RESTARTBLOCK is changed to EINTR and the restart block is discarded. That way we return an unexpected EINTR to user space instead of ending up in futex_lock_pi_restart. But we do not need to restart the syscall because we know that the condition has changed since we have been requeued. If we would simply restart the syscall then we would drop out via the comparison of the user space value with EWOULDBLOCK. The user space side needs to handle EWOULDBLOCK anyway as the enqueueing on the inner futex can race with a requeue/wake. So we can simply return EWOULDBLOCK to user space which also signals that we did not take the outer futex and let user space handle it in the same way it has to handle the requeue/wake race.
Signed-off-by: Thomas Gleixner --- kernel/futex.c | 49 +++++++++---------------------------------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 2aa216e5b59..80b5ce71659 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,7 +1507,6 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); -static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1930,21 +1929,6 @@ uaddr_faulted: goto retry; } -static long futex_lock_pi_restart(struct restart_block *restart) -{ - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - ktime_t t, *tp = NULL; - int fshared = restart->futex.flags & FLAGS_SHARED; - - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t.tv64 = restart->futex.time; - tp = &t; - } - restart->fn = do_no_restart_syscall; - - return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); -} - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -2141,12 +2125,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct restart_block *restart; struct futex_hash_bucket *hb; union futex_key key2; struct futex_q q; int res, ret; - u32 uval; if (!bitset) return -EINVAL; @@ -2245,30 +2227,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { - ret = -EFAULT; - if (get_user(uval, uaddr2)) - goto out_put_keys; - /* - * We've already been requeued, so restart by calling - * futex_lock_pi() directly, rather then returning to this - * function. 
+ * We've already been requeued, but we have no way to + * restart by calling futex_lock_pi() directly. We + * could restart the syscall, but that will look at + * the user space value and return right away. So we + * drop back with EWOULDBLOCK to tell user space that + * "val" has been changed. That's the same what the + * restart of the syscall would do in + * futex_wait_setup(). */ - ret = -ERESTART_RESTARTBLOCK; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_lock_pi_restart; - restart->futex.uaddr = (u32 *)uaddr2; - restart->futex.val = uval; - restart->futex.flags = 0; - if (abs_time) { - restart->futex.flags |= FLAGS_HAS_TIMEOUT; - restart->futex.time = abs_time->tv64; - } - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + ret = -EWOULDBLOCK; } out_put_keys: From b3b778b387ed3849ebc4a51baf8617be90df6625 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 20 May 2009 17:05:52 +0200 Subject: [PATCH 491/900] ALSA: pcsp - fix printk format warning again The commit 5a641bcd6398841cc4606b0a732d41a09256fd94 changed the printk format to '%lu', but the value passed seems to be dependent on the architecture. On x86-64, I got a new warning now because an int value is passed actaully. As a workaround, just cast the value always to unsigned long. 
Signed-off-by: Takashi Iwai --- sound/drivers/pcsp/pcsp_mixer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/drivers/pcsp/pcsp_mixer.c b/sound/drivers/pcsp/pcsp_mixer.c index 771955a9be7..199b0337714 100644 --- a/sound/drivers/pcsp/pcsp_mixer.c +++ b/sound/drivers/pcsp/pcsp_mixer.c @@ -51,7 +51,7 @@ static int pcsp_treble_info(struct snd_kcontrol *kcontrol, if (uinfo->value.enumerated.item > chip->max_treble) uinfo->value.enumerated.item = chip->max_treble; sprintf(uinfo->value.enumerated.name, "%lu", - PCSP_CALC_RATE(uinfo->value.enumerated.item)); + (unsigned long)PCSP_CALC_RATE(uinfo->value.enumerated.item)); return 0; } From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 May 2009 23:04:46 +0800 Subject: [PATCH 492/900] ftrace: fix check for return value of register_module_notifier in event_trace_init register_module_notifier() returns zero in the success case. So fix the inverted fail case check in trace events modules handler. [ Impact: fix spurious warning on ftrace initialization] Reported-by: Li Zefan Signed-off-by: Ming Lei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0eec0c55dd8..9e91c4ad7c8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1174,7 +1174,7 @@ static __init int event_trace_init(void) } ret = register_module_notifier(&trace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace events module notifier\n"); return 0; From 2b611cb6eed04062d0a9861c82248e02c844ba3f Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Fri, 27 Mar 2009 17:47:27 -0400 Subject: [PATCH 493/900] ath5k: fix scanning in AR2424 AR5K_PHY_PLL_40MHZ_5413 should not be ORed with AR5K_PHY_MODE_RAD_RF5112 for 5 GHz channels. 
The incorrect PLL value breaks scanning in the countries where 5 GHz channels are allowed. Signed-off-by: Pavel Roskin Acked-by: Nick Kossifidis Signed-off-by: John W. Linville --- drivers/net/wireless/ath5k/reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath5k/reset.c b/drivers/net/wireless/ath5k/reset.c index 7a17d31b2fd..faede828c8f 100644 --- a/drivers/net/wireless/ath5k/reset.c +++ b/drivers/net/wireless/ath5k/reset.c @@ -359,7 +359,7 @@ int ath5k_hw_nic_wakeup(struct ath5k_hw *ah, int flags, bool initial) mode |= AR5K_PHY_MODE_FREQ_5GHZ; if (ah->ah_radio == AR5K_RF5413) - clock |= AR5K_PHY_PLL_40MHZ_5413; + clock = AR5K_PHY_PLL_40MHZ_5413; else clock |= AR5K_PHY_PLL_40MHZ; From 88f16db7a2fa63b9242e8a0fbc40d51722f2e2f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 May 2009 12:04:30 +0200 Subject: [PATCH 494/900] wext: verify buffer size for SIOCSIWENCODEEXT Another design flaw in wireless extensions (is anybody surprised?) in the way it handles the iw_encode_ext structure: The structure is part of the 'extra' memory but contains the key length explicitly, instead of it just being the length of the extra buffer - size of the struct and using the explicit key length only for the get operation (which only writes it). Therefore, we have this layout: extra: +-------------------------+ | struct iw_encode_ext { | | ... | | u16 key_len; | | u8 key[0]; | | }; | +-------------------------+ | key material | +-------------------------+ Now, all drivers I checked use ext->key_len without checking that both key_len and the struct fit into the extra buffer that has been copied from userspace. This leads to a buffer overrun while reading that buffer, depending on the driver it may be possible to specify arbitrary key_len or it may need to be a proper length for the key algorithm specified. 
Thankfully, this is only exploitable by root, but root can actually cause a segfault or use kernel memory as a key (which you can even get back with siocgiwencode or siocgiwencodeext from the key buffer). Fix this by verifying that key_len fits into the buffer along with struct iw_encode_ext. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/wext.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/wireless/wext.c b/net/wireless/wext.c index cb6a5bb85d8..0e59f9ae9b8 100644 --- a/net/wireless/wext.c +++ b/net/wireless/wext.c @@ -786,6 +786,13 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd, err = -EFAULT; goto out; } + + if (cmd == SIOCSIWENCODEEXT) { + struct iw_encode_ext *ee = (void *) extra; + + if (iwp->length < sizeof(*ee) + ee->key_len) + return -EFAULT; + } } err = handler(dev, info, (union iwreq_data *) iwp, extra); From a54be5d43aa2d6febc5a4f8dd3b87b9429b60437 Mon Sep 17 00:00:00 2001 From: Forrest Zhang Date: Wed, 13 May 2009 11:14:39 -0400 Subject: [PATCH 495/900] ath5k: fix exp off-by-one when computing OFDM delta slope Commit e8f055f0c3b ("ath5k: Update reset code") subtly changed the code that computes floating point values for the PHY3_TIMING register such that the exponent is off by a decimal point, which can cause problems with OFDM channel operation. get_bitmask_order() actually returns the highest bit set plus one, whereas the previous code wanted the highest bit set. Instead, use ilog2 which is what this code is really calculating. Also check coef_scaled to handle the (invalid) case where we need log2(0). Signed-off-by: Bob Copeland Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath5k/reset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath5k/reset.c b/drivers/net/wireless/ath5k/reset.c index faede828c8f..5f72c111c2e 100644 --- a/drivers/net/wireless/ath5k/reset.c +++ b/drivers/net/wireless/ath5k/reset.c @@ -26,7 +26,7 @@ \*****************************/ #include /* To determine if a card is pci-e */ -#include /* For get_bitmask_order */ +#include #include "ath5k.h" #include "reg.h" #include "base.h" @@ -69,10 +69,10 @@ static inline int ath5k_hw_write_ofdm_timings(struct ath5k_hw *ah, /* Get exponent * ALGO: coef_exp = 14 - highest set bit position */ - coef_exp = get_bitmask_order(coef_scaled); + coef_exp = ilog2(coef_scaled); /* Doesn't make sense if it's zero*/ - if (!coef_exp) + if (!coef_scaled || !coef_exp) return -EINVAL; /* Note: we've shifted coef_scaled by 24 */ From c6ac4c18fbc92a26df71ece609b082bc3099676b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 20 May 2009 11:26:09 -0700 Subject: [PATCH 496/900] x86, boot: correct the calculation of ZO_INIT_SIZE Correct the calculation of ZO_INIT_SIZE (the amount of memory we need during decompression). One symbol (ZO_startup_32) was missing from zoffset.h, and another (ZO_z_extract_offset) was misspelled. [ Impact: build fix ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 2 +- arch/x86/boot/header.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 75e0301fc69..619d297aa2b 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -95,7 +95,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . 
\(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 68c3bfbaff2..1040f6e8010 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -226,7 +226,7 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_extract_offset) +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE #define INIT_SIZE ZO_INIT_SIZE From fbc9f97bbf5e1eaee562eba93dc60faaff3f3bfa Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 15 May 2009 16:13:46 -0700 Subject: [PATCH 497/900] iwlwifi: do not cancel delayed work inside spin_lock_irqsave Calling cancel_delayed_work() from inside spin_lock_irqsave, introduces a potential deadlock. As explained by Johannes Berg A - lock T - timer phase CPU 1 CPU 2 --------------------------------------------- some place that calls cancel_timer_sync() (which is the | code) lock-irq(A) | "lock-irq"(T) | "unlock"(T) | wait(T) unlock(A) timer softirq "lock"(T) run(T) "unlock"(T) irq handler lock(A) unlock(A) Now all that again, interleaved, leading to deadlock: lock-irq(A) "lock"(T) run(T) IRQ during or maybe before run(T) --> lock(A) "lock-irq"(T) wait(T) We fix this by moving the call to cancel_delayed_work() into workqueue. There are cases where the work may not actually be queued or running at the time we are trying to cancel it, but cancel_delayed_work() is able to deal with this. Also cleanup iwl_set_mode related to this call. This function (iwl_set_mode) is only called when bringing interface up and there will thus not be any scanning done. No need to try to cancel scanning. 
Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13224, which was also reported at http://marc.info/?l=linux-wireless&m=124081921903223&w=2 . Tested-by: Miles Lane Signed-off-by: Reinette Chatre Acked-by: Zhu Yi Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn.c | 7 ------- drivers/net/wireless/iwlwifi/iwl-scan.c | 7 ++++--- drivers/net/wireless/iwlwifi/iwl3945-base.c | 9 ++------- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 3bb28db4a40..f46ba247577 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -669,13 +669,6 @@ static int iwl_set_mode(struct iwl_priv *priv, int mode) if (!iwl_is_ready_rf(priv)) return -EAGAIN; - cancel_delayed_work(&priv->scan_check); - if (iwl_scan_cancel_timeout(priv, 100)) { - IWL_WARN(priv, "Aborted scan still in progress after 100ms\n"); - IWL_DEBUG_MAC80211(priv, "leaving - scan abort failed.\n"); - return -EAGAIN; - } - iwl_commit_rxon(priv); return 0; diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c index e7c65c4f741..6330b91e37c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-scan.c +++ b/drivers/net/wireless/iwlwifi/iwl-scan.c @@ -227,9 +227,6 @@ static void iwl_rx_scan_complete_notif(struct iwl_priv *priv, /* The HW is no longer scanning */ clear_bit(STATUS_SCAN_HW, &priv->status); - /* The scan completion notification came in, so kill that timer... */ - cancel_delayed_work(&priv->scan_check); - IWL_DEBUG_INFO(priv, "Scan pass on %sGHz took %dms\n", (priv->scan_bands & BIT(IEEE80211_BAND_2GHZ)) ? 
"2.4" : "5.2", @@ -712,6 +709,8 @@ static void iwl_bg_request_scan(struct work_struct *data) mutex_lock(&priv->mutex); + cancel_delayed_work(&priv->scan_check); + if (!iwl_is_ready(priv)) { IWL_WARN(priv, "request scan called when driver not ready.\n"); goto done; @@ -925,6 +924,8 @@ void iwl_bg_scan_completed(struct work_struct *work) IWL_DEBUG_SCAN(priv, "SCAN complete scan\n"); + cancel_delayed_work(&priv->scan_check); + ieee80211_scan_completed(priv->hw, false); if (test_bit(STATUS_EXIT_PENDING, &priv->status)) diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 4cce6613350..ff4d0e41d7c 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -782,13 +782,6 @@ static int iwl3945_set_mode(struct iwl_priv *priv, int mode) if (!iwl_is_ready_rf(priv)) return -EAGAIN; - cancel_delayed_work(&priv->scan_check); - if (iwl_scan_cancel_timeout(priv, 100)) { - IWL_WARN(priv, "Aborted scan still in progress after 100ms\n"); - IWL_DEBUG_MAC80211(priv, "leaving - scan abort failed.\n"); - return -EAGAIN; - } - iwl3945_commit_rxon(priv); return 0; @@ -3298,6 +3291,8 @@ static void iwl3945_bg_request_scan(struct work_struct *data) mutex_lock(&priv->mutex); + cancel_delayed_work(&priv->scan_check); + if (!iwl_is_ready(priv)) { IWL_WARN(priv, "request scan called when driver not ready.\n"); goto done; From 875690c378d64d9ee2de15cad8206d3f11ae5096 Mon Sep 17 00:00:00 2001 From: Fabio Rossi Date: Wed, 1 Apr 2009 20:37:50 +0200 Subject: [PATCH 498/900] ath5k: fix interpolation with equal power levels When the EEPROM contains weird values for the power levels we have to fix the interpolation process. Signed-off-by: Fabio Rossi Acked-by: Nick Kossifidis Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath5k/phy.c | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/net/wireless/ath5k/phy.c b/drivers/net/wireless/ath5k/phy.c index 9e2faae5ae9..b48b29dca3d 100644 --- a/drivers/net/wireless/ath5k/phy.c +++ b/drivers/net/wireless/ath5k/phy.c @@ -1487,28 +1487,35 @@ ath5k_get_linear_pcdac_min(const u8 *stepL, const u8 *stepR, { s8 tmp; s16 min_pwrL, min_pwrR; - s16 pwr_i = pwrL[0]; + s16 pwr_i; - do { - pwr_i--; - tmp = (s8) ath5k_get_interpolated_value(pwr_i, - pwrL[0], pwrL[1], - stepL[0], stepL[1]); + if (pwrL[0] == pwrL[1]) + min_pwrL = pwrL[0]; + else { + pwr_i = pwrL[0]; + do { + pwr_i--; + tmp = (s8) ath5k_get_interpolated_value(pwr_i, + pwrL[0], pwrL[1], + stepL[0], stepL[1]); + } while (tmp > 1); - } while (tmp > 1); + min_pwrL = pwr_i; + } - min_pwrL = pwr_i; + if (pwrR[0] == pwrR[1]) + min_pwrR = pwrR[0]; + else { + pwr_i = pwrR[0]; + do { + pwr_i--; + tmp = (s8) ath5k_get_interpolated_value(pwr_i, + pwrR[0], pwrR[1], + stepR[0], stepR[1]); + } while (tmp > 1); - pwr_i = pwrR[0]; - do { - pwr_i--; - tmp = (s8) ath5k_get_interpolated_value(pwr_i, - pwrR[0], pwrR[1], - stepR[0], stepR[1]); - - } while (tmp > 1); - - min_pwrR = pwr_i; + min_pwrR = pwr_i; + } /* Keep the right boundary so that it works for both curves */ return max(min_pwrL, min_pwrR); From 267d493b322b05984048aef8ea9b5b213490bbe0 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 20 May 2009 10:51:41 -0400 Subject: [PATCH 499/900] airo: fix airo_get_encode{,ext} buffer overflow like I mean it... "airo: airo_get_encode{,ext} potential buffer overflow" was actually a no-op, due to an unrecognized type overflow in an assignment. Oddly, gcc only seems to tell me about it when using -Wextra...grrr... Signed-off-by: John W. 
Linville --- drivers/net/wireless/airo.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index d7347573912..9eabf4d1f2e 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -6467,6 +6467,7 @@ static int airo_get_encode(struct net_device *dev, { struct airo_info *local = dev->ml_priv; int index = (dwrq->flags & IW_ENCODE_INDEX) - 1; + int wep_key_len; u8 buf[16]; if (!local->wep_capable) @@ -6500,11 +6501,13 @@ static int airo_get_encode(struct net_device *dev, dwrq->flags |= index + 1; /* Copy the key to the user buffer */ - dwrq->length = get_wep_key(local, index, &buf[0], sizeof(buf)); - if (dwrq->length != -1) - memcpy(extra, buf, dwrq->length); - else + wep_key_len = get_wep_key(local, index, &buf[0], sizeof(buf)); + if (wep_key_len < 0) { dwrq->length = 0; + } else { + dwrq->length = wep_key_len; + memcpy(extra, buf, dwrq->length); + } return 0; } @@ -6617,7 +6620,7 @@ static int airo_get_encodeext(struct net_device *dev, struct airo_info *local = dev->ml_priv; struct iw_point *encoding = &wrqu->encoding; struct iw_encode_ext *ext = (struct iw_encode_ext *)extra; - int idx, max_key_len; + int idx, max_key_len, wep_key_len; u8 buf[16]; if (!local->wep_capable) @@ -6661,11 +6664,13 @@ static int airo_get_encodeext(struct net_device *dev, memset(extra, 0, 16); /* Copy the key to the user buffer */ - ext->key_len = get_wep_key(local, idx, &buf[0], sizeof(buf)); - if (ext->key_len != -1) - memcpy(extra, buf, ext->key_len); - else + wep_key_len = get_wep_key(local, idx, &buf[0], sizeof(buf)); + if (wep_key_len < 0) { ext->key_len = 0; + } else { + ext->key_len = wep_key_len; + memcpy(extra, buf, ext->key_len); + } return 0; } From 5078b2e32ad4b1f753b1c837c15892202f753c97 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Wed, 13 May 2009 17:04:42 -0400 Subject: [PATCH 500/900] cfg80211: fix race between core hint and driver's custom apply It's possible for cfg80211 to have scheduled the work and for the global workqueue to not have kicked in prior to a cfg80211 driver's regulatory hint or wiphy_apply_custom_regulatory(). Although this is very unlikely, it's possible and we should fix this race. When this race would happen you are expected to have hit a null pointer dereference panic. Cc: stable@kernel.org Signed-off-by: Luis R. Rodriguez Tested-by: Alan Jenkins Signed-off-by: John W. Linville --- net/wireless/reg.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 08265ca1578..487cb627ddb 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1551,6 +1551,13 @@ static int regulatory_hint_core(const char *alpha2) queue_regulatory_request(request); + /* + * This ensures last_request is populated once modules + * come swinging in and calling regulatory hints and + * wiphy_apply_custom_regulatory(). + */ + flush_scheduled_work(); + return 0; } From c9d2fbf36df5e04efa226614093bb1bacc6fe131 Mon Sep 17 00:00:00 2001 From: Jay Sternberg Date: Tue, 19 May 2009 14:56:36 -0700 Subject: [PATCH 501/900] iwlwifi: update 5000 ucode support to version 2 of API enable iwl driver to support 5000 ucode having version 2 of API Signed-off-by: Jay Sternberg Signed-off-by: Reinette Chatre Signed-off-by: John W.
Linville --- drivers/net/wireless/iwlwifi/iwl-5000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c index e5ca2511a81..9452461ce86 100644 --- a/drivers/net/wireless/iwlwifi/iwl-5000.c +++ b/drivers/net/wireless/iwlwifi/iwl-5000.c @@ -46,7 +46,7 @@ #include "iwl-6000-hw.h" /* Highest firmware API version supported */ -#define IWL5000_UCODE_API_MAX 1 +#define IWL5000_UCODE_API_MAX 2 #define IWL5150_UCODE_API_MAX 2 /* Lowest firmware API version supported */ From a6c67339784db5763d6f20ae1881aeebe8c5a9f4 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 20 May 2009 02:12:56 +0200 Subject: [PATCH 502/900] wireless: beyond ARRAY_SIZE of intf->crypto_stats Do not go beyond ARRAY_SIZE of intf->crypto_stats Signed-off-by: Roel Kluin Acked-by: Ivo van Doorn Signed-off-by: John W. Linville --- drivers/net/wireless/rt2x00/rt2x00debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/rt2x00/rt2x00debug.c b/drivers/net/wireless/rt2x00/rt2x00debug.c index 07d378ef0b4..7b3ee8c2eae 100644 --- a/drivers/net/wireless/rt2x00/rt2x00debug.c +++ b/drivers/net/wireless/rt2x00/rt2x00debug.c @@ -138,7 +138,7 @@ void rt2x00debug_update_crypto(struct rt2x00_dev *rt2x00dev, if (cipher == CIPHER_TKIP_NO_MIC) cipher = CIPHER_TKIP; - if (cipher == CIPHER_NONE || cipher > CIPHER_MAX) + if (cipher == CIPHER_NONE || cipher >= CIPHER_MAX) return; /* Remove CIPHER_NONE index */ From cf8da764fc6959b7efb482f375dfef9830e98205 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2009 18:54:22 +0000 Subject: [PATCH 503/900] net: fix length computation in rt_check_expire() rt_check_expire() computes average and standard deviation of chain lengths, but does not correctly reset length to 0 at beginning of each chain. This probably gives overflows for sum2 (and sum) on loaded machines instead of meaningful results.
Signed-off-by: Eric Dumazet Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/ipv4/route.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c4c60e9f068..869cf1c44b7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -785,7 +785,7 @@ static void rt_check_expire(void) static unsigned int rover; unsigned int i = rover, goal; struct rtable *rth, **rthp; - unsigned long length = 0, samples = 0; + unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; u64 mult; @@ -795,9 +795,9 @@ static void rt_check_expire(void) goal = (unsigned int)mult; if (goal > rt_hash_mask) goal = rt_hash_mask + 1; - length = 0; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; + unsigned long length; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; @@ -809,6 +809,7 @@ static void rt_check_expire(void) if (*rthp == NULL) continue; + length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rt_is_expired(rth)) { From 1ddbcb005c395518c2cd0df504cff3d4b5c85853 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2009 20:14:28 +0000 Subject: [PATCH 504/900] net: fix rtable leak in net/ipv4/route.c Alexander V. Lukyanov found a regression in 2.6.29 and made a complete analysis found in http://bugzilla.kernel.org/show_bug.cgi?id=13339 Quoted here because its a perfect one : begin_of_quotation 2.6.29 patch has introduced flexible route cache rebuilding. Unfortunately the patch has at least one critical flaw, and another problem. rt_intern_hash calculates rthi pointer, which is later used for new entry insertion. The same loop calculates cand pointer which is used to clean the list. If the pointers are the same, rtable leak occurs, as first the cand is removed then the new entry is appended to it. This leak leads to unregister_netdevice problem (usage count > 0). 
Another problem of the patch is that it tries to insert the entries in certain order, to facilitate counting of entries distinct by all but QoS parameters. Unfortunately, referencing an existing rtable entry moves it to list beginning, to speed up further lookups, so the carefully built order is destroyed. For the first problem the simplest patch it to set rthi=0 when rthi==cand, but it will also destroy the ordering. end_of_quotation Problematic commit is 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b (net: implement emergency route cache rebulds when gc_elasticity is exceeded) Trying to keep dst_entries ordered is too complex and breaks the fact that order should depend on the frequency of use for garbage collection. A possible fix is to make rt_intern_hash() simpler, and only make rt_check_expire() a little bit smarter, being able to cope with an arbitrary entries order. The added loop is running on cache hot data, while cpu is prefetching next object, so should be unnoticed. Reported-and-analyzed-by: Alexander V. Lukyanov Signed-off-by: Eric Dumazet Acked-by: Neil Horman Signed-off-by: David S.
Miller --- net/ipv4/route.c | 55 +++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 869cf1c44b7..28205e5bfa9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -784,7 +784,7 @@ static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; + struct rtable *rth, *aux, **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; u64 mult; @@ -812,6 +812,7 @@ static void rt_check_expire(void) length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { + prefetch(rth->u.dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); @@ -820,33 +821,30 @@ static void rt_check_expire(void) if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ if (time_before_eq(jiffies, rth->u.dst.expires)) { +nofree: tmo >>= 1; rthp = &rth->u.dst.rt_next; /* - * Only bump our length if the hash - * inputs on entries n and n+1 are not - * the same, we only count entries on + * We only count entries on * a chain with equal hash inputs once * so that entries for different QOS * levels, and other non-hash input * attributes don't unfairly skew * the length computation */ - if ((*rthp == NULL) || - !compare_hash_inputs(&(*rthp)->fl, - &rth->fl)) - length += ONE; + for (aux = rt_hash_table[i].chain;;) { + if (aux == rth) { + length += ONE; + break; + } + if (compare_hash_inputs(&aux->fl, &rth->fl)) + break; + aux = aux->u.dst.rt_next; + } continue; } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - if ((*rthp == NULL) || - !compare_hash_inputs(&(*rthp)->fl, - &rth->fl)) - length += ONE; - continue; - } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) + goto nofree; /* Cleanup aged off entries. 
*/ *rthp = rth->u.dst.rt_next; @@ -1069,7 +1067,6 @@ out: return 0; static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) { struct rtable *rth, **rthp; - struct rtable *rthi; unsigned long now; struct rtable *cand, **candp; u32 min_score; @@ -1089,7 +1086,6 @@ restart: } rthp = &rt_hash_table[hash].chain; - rthi = NULL; spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { @@ -1135,17 +1131,6 @@ restart: chain_length++; rthp = &rth->u.dst.rt_next; - - /* - * check to see if the next entry in the chain - * contains the same hash input values as rt. If it does - * This is where we will insert into the list, instead of - * at the head. This groups entries that differ by aspects not - * relvant to the hash function together, which we use to adjust - * our chain length - */ - if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl)) - rthi = rth; } if (cand) { @@ -1206,10 +1191,7 @@ restart: } } - if (rthi) - rt->u.dst.rt_next = rthi->u.dst.rt_next; - else - rt->u.dst.rt_next = rt_hash_table[hash].chain; + rt->u.dst.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 if (rt->u.dst.rt_next) { @@ -1225,10 +1207,7 @@ restart: * previous writes to rt are comitted to memory * before making rt visible to other CPUS. 
*/ - if (rthi) - rcu_assign_pointer(rthi->u.dst.rt_next, rt); - else - rcu_assign_pointer(rt_hash_table[hash].chain, rt); + rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; From 4f72427998b105392e60bae7a6798a0c96fe4f0a Mon Sep 17 00:00:00 2001 From: Jean-Mickael Guerin Date: Wed, 20 May 2009 17:38:59 -0700 Subject: [PATCH 505/900] IPv6: set RTPROT_KERNEL to initial route The use of unspecified protocol in IPv6 initial route prevents quagga to install IPv6 default route: # show ipv6 route S ::/0 [1/0] via fe80::1, eth1_0 K>* ::/0 is directly connected, lo, rej C>* ::1/128 is directly connected, lo C>* fe80::/64 is directly connected, eth1_0 # ip -6 route fe80::/64 dev eth1_0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit -1 ff00::/8 dev eth1_0 metric 256 mtu 1500 advmss 1440 hoplimit -1 unreachable default dev lo proto none metric -1 error -101 hoplimit 255 The attached patch ensures RTPROT_KERNEL to the default initial route and fixes the problem for quagga. This is similar to "ipv6: protocol for address routes" f410a1fba7afa79d2992620e874a343fdba28332. # show ipv6 route S>* ::/0 [1/0] via fe80::1, eth1_0 C>* ::1/128 is directly connected, lo C>* fe80::/64 is directly connected, eth1_0 # ip -6 route fe80::/64 dev eth1_0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit -1 fe80::/64 dev eth1_0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit -1 ff00::/8 dev eth1_0 metric 256 mtu 1500 advmss 1440 hoplimit -1 default via fe80::1 dev eth1_0 proto zebra metric 1024 mtu 1500 advmss 1440 hoplimit -1 unreachable default dev lo proto kernel metric -1 error -101 hoplimit 255 Signed-off-by: Jean-Mickael Guerin Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1394ddb6e35..032a5ec391c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -137,6 +137,7 @@ static struct rt6_info ip6_null_entry_template = { } }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), }; @@ -159,6 +160,7 @@ static struct rt6_info ip6_prohibit_entry_template = { } }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), }; @@ -176,6 +178,7 @@ static struct rt6_info ip6_blk_hole_entry_template = { } }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), }; From 385aa9e7012d35b017981e67b3464aef4e1e7108 Mon Sep 17 00:00:00 2001 From: Thomas Reitmayr Date: Tue, 19 May 2009 19:35:26 +0200 Subject: [PATCH 506/900] [ARM] Kirkwood: Correct MPP for SATA activity/presence LEDs of QNAP TS-119/TS-219. For the QNAP TS-119 and TS-219 the wrong MPPs were used for the SATA activity/presence LEDs. The new settings make these LEDs work as expected. 
Signed-off-by: Thomas Reitmayr Tested-by: Martin Michlmayr Signed-off-by: Nicolas Pitre --- arch/arm/mach-kirkwood/ts219-setup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-kirkwood/ts219-setup.c b/arch/arm/mach-kirkwood/ts219-setup.c index dda5743cf3e..01aa213c0a6 100644 --- a/arch/arm/mach-kirkwood/ts219-setup.c +++ b/arch/arm/mach-kirkwood/ts219-setup.c @@ -142,6 +142,8 @@ static unsigned int qnap_ts219_mpp_config[] __initdata = { MPP1_SPI_MOSI, MPP2_SPI_SCK, MPP3_SPI_MISO, + MPP4_SATA1_ACTn, + MPP5_SATA0_ACTn, MPP8_TW_SDA, MPP9_TW_SCK, MPP10_UART0_TXD, @@ -150,10 +152,6 @@ static unsigned int qnap_ts219_mpp_config[] __initdata = { MPP14_UART1_RXD, /* PIC controller */ MPP15_GPIO, /* USB Copy button */ MPP16_GPIO, /* Reset button */ - MPP20_SATA1_ACTn, - MPP21_SATA0_ACTn, - MPP22_SATA1_PRESENTn, - MPP23_SATA0_PRESENTn, 0 }; From 98f32602d42951e61a059685f842aa7d778ffab0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 21 May 2009 20:33:58 +0100 Subject: [PATCH 507/900] hugh: update email address My old address will shut down in a few days time: remove it from the tree, and add a tmpfs (shmem filesystem) maintainer entry with the new address. Signed-off-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- Documentation/filesystems/tmpfs.txt | 2 +- MAINTAINERS | 8 ++++++++ mm/rmap.c | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index 222437efd75..3015da0c6b2 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -133,4 +133,4 @@ RAM/SWAP in 10240 inodes and it is only accessible by root. 
Author: Christoph Rohland , 1.12.01 Updated: - Hugh Dickins , 4 June 2007 + Hugh Dickins, 4 June 2007 diff --git a/MAINTAINERS b/MAINTAINERS index 2b349ba4add..64ea80e45e3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5579,6 +5579,14 @@ M: ian@mnementh.co.uk S: Maintained F: drivers/mmc/host/tmio_mmc.* +TMPFS (SHMEM FILESYSTEM) +P: Hugh Dickins +M: hugh.dickins@tiscali.co.uk +L: linux-mm@kvack.org +S: Maintained +F: include/linux/shmem_fs.h +F: mm/shmem.c + TPM DEVICE DRIVER P: Debora Velarde M: debora@linux.vnet.ibm.com diff --git a/mm/rmap.c b/mm/rmap.c index 16521664010..23122af3261 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -14,7 +14,7 @@ * Original design by Rik van Riel 2001 * File methods by Dave McCracken 2003, 2004 * Anonymous methods by Andrea Arcangeli 2004 - * Contributions by Hugh Dickins 2003, 2004 + * Contributions by Hugh Dickins 2003, 2004 */ /* From 85bc26211c6a2c6e82c2403697f8ce44e9587215 Mon Sep 17 00:00:00 2001 From: Martin Michlmayr Date: Tue, 19 May 2009 12:30:52 +0200 Subject: [PATCH 508/900] [ARM] Orion: Remove explicit name for platform device resources Remove explicit names from platform device resources since they will automatically be named after the platform device they're associated with. 
Signed-off-by: Martin Michlmayr Acked-by: Russell King Signed-off-by: Nicolas Pitre --- arch/arm/mach-kirkwood/common.c | 2 -- arch/arm/mach-mv78xx0/common.c | 4 ---- arch/arm/mach-orion5x/common.c | 2 -- 3 files changed, 8 deletions(-) diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index eeb00240d78..3fab82a4c8f 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -386,12 +386,10 @@ static struct mv64xxx_i2c_pdata kirkwood_i2c_pdata = { static struct resource kirkwood_i2c_resources[] = { { - .name = "i2c", .start = I2C_PHYS_BASE, .end = I2C_PHYS_BASE + 0x1f, .flags = IORESOURCE_MEM, }, { - .name = "i2c", .start = IRQ_KIRKWOOD_TWSI, .end = IRQ_KIRKWOOD_TWSI, .flags = IORESOURCE_IRQ, diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index 9ba595083da..0d88eea6a09 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -532,12 +532,10 @@ static struct mv64xxx_i2c_pdata mv78xx0_i2c_0_pdata = { static struct resource mv78xx0_i2c_0_resources[] = { { - .name = "i2c 0 base", .start = I2C_0_PHYS_BASE, .end = I2C_0_PHYS_BASE + 0x1f, .flags = IORESOURCE_MEM, }, { - .name = "i2c 0 irq", .start = IRQ_MV78XX0_I2C_0, .end = IRQ_MV78XX0_I2C_0, .flags = IORESOURCE_IRQ, @@ -567,12 +565,10 @@ static struct mv64xxx_i2c_pdata mv78xx0_i2c_1_pdata = { static struct resource mv78xx0_i2c_1_resources[] = { { - .name = "i2c 1 base", .start = I2C_1_PHYS_BASE, .end = I2C_1_PHYS_BASE + 0x1f, .flags = IORESOURCE_MEM, }, { - .name = "i2c 1 irq", .start = IRQ_MV78XX0_I2C_1, .end = IRQ_MV78XX0_I2C_1, .flags = IORESOURCE_IRQ, diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 6af99ddabdf..a51fb9dd65a 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -248,12 +248,10 @@ static struct mv64xxx_i2c_pdata orion5x_i2c_pdata = { static struct resource orion5x_i2c_resources[] = { { - .name = "i2c base", .start = 
I2C_PHYS_BASE, .end = I2C_PHYS_BASE + 0x1f, .flags = IORESOURCE_MEM, }, { - .name = "i2c irq", .start = IRQ_ORION5X_I2C, .end = IRQ_ORION5X_I2C, .flags = IORESOURCE_IRQ, From c40499e04b2005e61f989824251f9343b55f96bb Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 21 May 2009 15:04:15 -0700 Subject: [PATCH 509/900] gigaset: beyond ARRAY_SIZE of iwb->data Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/isdn/gigaset/isocdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/isdn/gigaset/isocdata.c b/drivers/isdn/gigaset/isocdata.c index b171e75cb52..29808c4fb1c 100644 --- a/drivers/isdn/gigaset/isocdata.c +++ b/drivers/isdn/gigaset/isocdata.c @@ -175,7 +175,7 @@ int gigaset_isowbuf_getbytes(struct isowbuf_t *iwb, int size) return -EINVAL; } src = iwb->read; - if (unlikely(limit > BAS_OUTBUFSIZE + BAS_OUTBUFPAD || + if (unlikely(limit >= BAS_OUTBUFSIZE + BAS_OUTBUFPAD || (read < src && limit >= src))) { pr_err("isoc write buffer frame reservation violated\n"); return -EFAULT; From 5b5f792a6a9a2f9ae812d151ed621f72e99b1725 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 May 2009 15:07:12 -0700 Subject: [PATCH 510/900] pktgen: do not access flows[] beyond its length typo -- pkt_dev->nflows is for stats only, the number of concurrent flows is stored in cflows. Reported-By: Vladimir Ivashchenko Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 3779c1438c1..0666a827bc6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2447,7 +2447,7 @@ static inline void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i = 0; - for (; i < pkt_dev->nflows; i++){ + for (; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; if (x) { xfrm_state_put(x); From 3ed18d76d959e5cbfa5d70c8f7ba95476582a556 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 21 May 2009 15:20:59 -0700 Subject: [PATCH 511/900] ipv4: Fix oops with FIB_TRIE It seems we can fix this by disabling preemption while we re-balance the trie. This is with the CONFIG_CLASSIC_RCU. It's been stress-tested at high loads continuously taking a full BGP table up/down via iproute -batch. Note. fib_trie is not updated for CONFIG_PREEMPT_RCU Reported-by: Andrei Popa Signed-off-by: Robert Olsson Signed-off-by: David S.
Miller --- net/ipv4/fib_trie.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ec0ae490f0b..33c7c85dfe4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key) static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { int wasfull; - t_key cindex, key = tn->key; + t_key cindex, key; struct tnode *tp; + preempt_disable(); + key = tn->key; + while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); @@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); + preempt_enable(); return (struct node *)tn; } From 0975ecba3b670df7c488a5e0e6fe9f1f370a8ad8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 May 2009 15:22:02 -0700 Subject: [PATCH 512/900] RxRPC: Error handling for rxrpc_alloc_connection() rxrpc_alloc_connection() doesn't return an error code on failure, it just returns NULL. IS_ERR(NULL) is false. Signed-off-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: David S. 
Miller --- net/rxrpc/ar-connection.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 0f1218b8d28..67e38a05624 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -343,9 +343,9 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, /* not yet present - create a candidate for a new connection * and then redo the check */ conn = rxrpc_alloc_connection(gfp); - if (IS_ERR(conn)) { - _leave(" = %ld", PTR_ERR(conn)); - return PTR_ERR(conn); + if (!conn) { + _leave(" = -ENOMEM"); + return -ENOMEM; } conn->trans = trans; @@ -508,9 +508,9 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, /* not yet present - create a candidate for a new connection and then * redo the check */ candidate = rxrpc_alloc_connection(gfp); - if (IS_ERR(candidate)) { - _leave(" = %ld", PTR_ERR(candidate)); - return PTR_ERR(candidate); + if (!candidate) { + _leave(" = -ENOMEM"); + return -ENOMEM; } candidate->trans = trans; From 703a3b8e5c01cf6fb33c6d8dc99905f889a4e992 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 21 May 2009 22:21:53 +0000 Subject: [PATCH 513/900] [CIFS] fix posix open regression Posix open code was not properly adding the file to the list of open files. Fix allocating cifsFileInfo more than once, and adding twice to flist and tlist. Also fix mode setting to be done in one place in these paths. 
Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar Tested-by: Jeff Layton Tested-by: Luca Tettamanti --- fs/cifs/dir.c | 14 +++++------ fs/cifs/file.c | 66 ++++++++++++++++++++++++++++---------------------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 11431ed72a7..f49d684edd9 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -225,6 +225,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (!(oflags & FMODE_READ)) write_only = true; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, pnetfid, presp_data, &oplock, full_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & @@ -310,7 +311,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -336,7 +336,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, else /* success, no need to query */ goto cifs_create_set_dentry; } else if ((rc != -EIO) && (rc != -EREMOTE) && - (rc != -EOPNOTSUPP)) /* path not found or net err */ + (rc != -EOPNOTSUPP) && (rc != -EINVAL)) goto cifs_create_out; /* else fallthrough to retry, using older open call, this is case where server does not support this SMB level, and @@ -609,7 +609,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ int oplock = 0; - int mode; __u16 fileHandle = 0; bool posix_open = false; struct cifs_sb_info *cifs_sb; @@ -660,13 +659,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, if (pTcon->unix_ext) { if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && - (nd->flags & LOOKUP_OPEN)) { + (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open) { if (!((nd->intent.open.flags & O_CREAT) && (nd->intent.open.flags & O_EXCL))) { - mode = nd->intent.open.create_mode & - ~current_umask(); rc = 
cifs_posix_open(full_path, &newInode, - parent_dir_inode->i_sb, mode, + parent_dir_inode->i_sb, + nd->intent.open.create_mode, nd->intent.open.flags, &oplock, &fileHandle, xid); /* @@ -681,6 +679,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, */ if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) posix_open = true; + else + pTcon->broken_posix_open = true; } } if (!posix_open) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 38c06f82657..302ea15f02e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -130,10 +130,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode, struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) { - file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - if (file->private_data == NULL) - return -ENOMEM; - pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); write_lock(&GlobalSMBSeslock); pCifsInode = CIFS_I(file->f_path.dentry->d_inode); @@ -184,6 +180,38 @@ psx_client_can_cache: return 0; } +static struct cifsFileInfo * +cifs_fill_filedata(struct file *file) +{ + struct list_head *tmp; + struct cifsFileInfo *pCifsFile = NULL; + struct cifsInodeInfo *pCifsInode = NULL; + + /* search inode for this file and fill in file->private_data */ + pCifsInode = CIFS_I(file->f_path.dentry->d_inode); + read_lock(&GlobalSMBSeslock); + list_for_each(tmp, &pCifsInode->openFileList) { + pCifsFile = list_entry(tmp, struct cifsFileInfo, flist); + if ((pCifsFile->pfile == NULL) && + (pCifsFile->pid == current->tgid)) { + /* mode set in cifs_create */ + + /* needed for writepage */ + pCifsFile->pfile = file; + file->private_data = pCifsFile; + break; + } + } + read_unlock(&GlobalSMBSeslock); + + if (file->private_data != NULL) { + return pCifsFile; + } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) + cERROR(1, ("could not find file instance for " + "new file %p", file)); + return NULL; +} + /* all arguments to this function must be checked for validity in caller */ 
static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, @@ -258,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file) struct cifsTconInfo *tcon; struct cifsFileInfo *pCifsFile; struct cifsInodeInfo *pCifsInode; - struct list_head *tmp; char *full_path = NULL; int desiredAccess; int disposition; @@ -270,32 +297,12 @@ int cifs_open(struct inode *inode, struct file *file) cifs_sb = CIFS_SB(inode->i_sb); tcon = cifs_sb->tcon; - /* search inode for this file and fill in file->private_data */ pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp, struct cifsFileInfo, - flist); - if ((pCifsFile->pfile == NULL) && - (pCifsFile->pid == current->tgid)) { - /* mode set in cifs_create */ - - /* needed for writepage */ - pCifsFile->pfile = file; - - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - - if (file->private_data != NULL) { - rc = 0; + pCifsFile = cifs_fill_filedata(file); + if (pCifsFile) { FreeXid(xid); - return rc; - } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) - cERROR(1, ("could not find file instance for " - "new file %p", file)); + return 0; + } full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -325,6 +332,7 @@ int cifs_open(struct inode *inode, struct file *file) /* no need for special case handling of setting mode on read only files needed here */ + pCifsFile = cifs_fill_filedata(file); cifs_posix_open_inode_helper(inode, file, pCifsInode, pCifsFile, oplock, netfid); goto out; From 6eb0ac03899a1363ba176abe0830a9e6698c0503 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 21 May 2009 19:10:23 +0000 Subject: [PATCH 514/900] powerpc/maple: Add a quirk to disable MSI for IPR on Bimini Something in the HW or FW setup is busted and MSIs aren't working with IPR 
on Bimini, so until we figure out exactly what's up, we quirk them out Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/maple/pci.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index 301855263b8..04296ffff8b 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -592,3 +592,17 @@ int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) } return irq; } + +static void __devinit quirk_ipr_msi(struct pci_dev *dev) +{ + /* Something prevents MSIs from the IPR from working on Bimini, + * and the driver has no smarts to recover. So disable MSI + * on it for now. */ + + if (machine_is(maple)) { + dev->no_msi = 1; + dev_info(&dev->dev, "Quirk disabled MSI\n"); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_OBSIDIAN, + quirk_ipr_msi); From 87488957a68293357a94c8142de7d0ae17914912 Mon Sep 17 00:00:00 2001 From: Adam Williamson Date: Thu, 21 May 2009 18:32:59 -0400 Subject: [PATCH 515/900] ALSA: hda - fix audio on HP TX25xx series notebooks Fixes https://bugtrack.alsa-project.org/alsa-bug/view.php?id=4121 Taken from https://bugzilla.redhat.com/show_bug.cgi?id=498060 Signed-off-by: Adam Williamson Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index b8a0d3e7927..bcbb736f94f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -12058,6 +12058,7 @@ static struct snd_pci_quirk alc268_cfg_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0253, "Dell OEM", ALC268_DELL), SND_PCI_QUIRK(0x1028, 0x02b0, "Dell Inspiron Mini9", ALC268_DELL), SND_PCI_QUIRK(0x103c, 0x30cc, "TOSHIBA", ALC268_TOSHIBA), + SND_PCI_QUIRK(0x103c, 0x30f1, "HP TX25xx series", ALC268_TOSHIBA), SND_PCI_QUIRK(0x1043, 0x1205, "ASUS W7J", ALC268_3ST), SND_PCI_QUIRK(0x1179,
0xff10, "TOSHIBA A205", ALC268_TOSHIBA), SND_PCI_QUIRK(0x1179, 0xff50, "TOSHIBA A305", ALC268_TOSHIBA), From 88dff4936c0a5fa53080cca68dc963a8a2a674b0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 22 May 2009 11:35:50 +0800 Subject: [PATCH 516/900] x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot, see: http://bugzilla.kernel.org/show_bug.cgi?id=12901 [ Impact: fix hung reboot on certain systems ] Signed-off-by: Zhang Rui Cc: Len Brown LKML-Reference: <1242963350.32574.53.camel@rzhang-dt> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1340dad417f..667188e0b5a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -232,6 +232,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), }, }, + { /* Handle problems with rebooting on Sony VGN-Z540N */ + .callback = set_bios_reboot, + .ident = "Sony VGN-Z540N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), + }, + }, { } }; From e069c0cf7c169ae5a8bfdc8d083a5d66fbef73d8 Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Fri, 8 May 2009 15:51:44 -0700 Subject: [PATCH 517/900] wimax/i2400m: usb: fix device reset on autosuspend while not yet idle When the i2400m is connected to a network, the host interface (USB) cannot be suspended. For that to happen, the device has to have negotiated with the basestation to put the link on IDLE state. If the host tries to put the device in standby while it is connected but not idle, the device resets, as the driver should not do that. To avoid triggering that, when the USB subsystem requires the driver to autosuspend the device, the driver checks if the device is not yet idle.
If it is not, the request is rejected (will be retried again later on after the autosuspend timeout). At some point the device will enter idle and the request will succeed (unless of course, there is network traffic, but at that point, there is no idle neither in the link or the host interface). Signed-off-by: Inaky Perez-Gonzalez --- drivers/net/wimax/i2400m/usb.c | 35 +++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c index ca4151a9e22..17851321b7f 100644 --- a/drivers/net/wimax/i2400m/usb.c +++ b/drivers/net/wimax/i2400m/usb.c @@ -505,27 +505,52 @@ int i2400mu_suspend(struct usb_interface *iface, pm_message_t pm_msg) #ifdef CONFIG_PM struct usb_device *usb_dev = i2400mu->usb_dev; #endif + unsigned is_autosuspend = 0; struct i2400m *i2400m = &i2400mu->i2400m; +#ifdef CONFIG_PM + if (usb_dev->auto_pm > 0) + is_autosuspend = 1; +#endif + d_fnstart(3, dev, "(iface %p pm_msg %u)\n", iface, pm_msg.event); if (i2400m->updown == 0) goto no_firmware; - d_printf(1, dev, "fw up, requesting standby\n"); + if (i2400m->state == I2400M_SS_DATA_PATH_CONNECTED && is_autosuspend) { + /* ugh -- the device is connected and this suspend + * request is an autosuspend one (not a system standby + * / hibernate). + * + * The only way the device can go to standby is if the + * link with the base station is in IDLE mode; that + * were the case, we'd be in status + * I2400M_SS_CONNECTED_IDLE. But we are not. 
+ * + * If we *tell* him to go power save now, it'll reset + * as a precautionary measure, so if this is an + * autosuspend thing, say no and it'll come back + * later, when the link is IDLE + */ + result = -EBADF; + d_printf(1, dev, "fw up, link up, not-idle, autosuspend: " + "not entering powersave\n"); + goto error_not_now; + } + d_printf(1, dev, "fw up: entering powersave\n"); atomic_dec(&i2400mu->do_autopm); result = i2400m_cmd_enter_powersave(i2400m); atomic_inc(&i2400mu->do_autopm); -#ifdef CONFIG_PM - if (result < 0 && usb_dev->auto_pm == 0) { + if (result < 0 && !is_autosuspend) { /* System suspend, can't fail */ dev_err(dev, "failed to suspend, will reset on resume\n"); result = 0; } -#endif if (result < 0) goto error_enter_powersave; i2400mu_notification_release(i2400mu); - d_printf(1, dev, "fw up, got standby\n"); + d_printf(1, dev, "powersave requested\n"); error_enter_powersave: +error_not_now: no_firmware: d_fnend(3, dev, "(iface %p pm_msg %u) = %d\n", iface, pm_msg.event, result); From 0899d6349c60e4021224b51c8c97f49b829dfefd Mon Sep 17 00:00:00 2001 From: Li Yang Date: Fri, 22 May 2009 16:39:59 +0800 Subject: [PATCH 518/900] fsldma: update mailling list address in MAINTAINERS linuxppc-embedded has been merged into linuxppc-dev. Signed-off-by: Li Yang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2b349ba4add..cac3e3b71d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2241,7 +2241,7 @@ P: Li Yang M: leoli@freescale.com P: Zhang Wei M: zw@zh-kernel.org -L: linuxppc-embedded@ozlabs.org +L: linuxppc-dev@ozlabs.org L: linux-kernel@vger.kernel.org S: Maintained F: drivers/dma/fsldma.* From f47edc6dab11801c2e97088ba7bbce042ded867c Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Fri, 22 May 2009 16:46:52 +0800 Subject: [PATCH 519/900] fsldma: fix check on potential fdev->chan[] overflow Fix the check of potential array overflow when using corrupted channel device tree nodes. 
Signed-off-by: Roel Kluin Signed-off-by: Li Yang --- drivers/dma/fsldma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index da8a8ed9e41..391b1bd7098 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -830,7 +830,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1); new_fsl_chan->id = ((new_fsl_chan->reg.start - 0x100) & 0xfff) >> 7; - if (new_fsl_chan->id > FSL_DMA_MAX_CHANS_PER_DEVICE) { + if (new_fsl_chan->id >= FSL_DMA_MAX_CHANS_PER_DEVICE) { dev_err(fdev->dev, "There is no %d channel!\n", new_fsl_chan->id); err = -EINVAL; From 138ef0185177a6d221d24b6aa8f12d867fbbef90 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Tue, 19 May 2009 15:42:13 -0700 Subject: [PATCH 520/900] fsldma: fix "DMA halt timeout!" errors When using the DMA controller from multiple threads at the same time, it is possible to get lots of "DMA halt timeout!" errors printed to the kernel log. This occurs due to a race between fsl_dma_memcpy_issue_pending() and the interrupt handler, fsl_dma_chan_do_interrupt(). Both call the fsl_chan_xfer_ld_queue() function, which does not protect against concurrent accesses to dma_halt() and dma_start(). The existing spinlock is moved to cover the dma_halt() and dma_start() functions. Testing shows that the "DMA halt timeout!" errors disappear. Signed-off-by: Ira W. 
Snyder Signed-off-by: Li Yang --- drivers/dma/fsldma.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 391b1bd7098..a4151c3bb78 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -598,15 +598,16 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan) dma_addr_t next_dest_addr; unsigned long flags; + spin_lock_irqsave(&fsl_chan->desc_lock, flags); + if (!dma_is_idle(fsl_chan)) - return; + goto out_unlock; dma_halt(fsl_chan); /* If there are some link descriptors * not transfered in queue. We need to start it. */ - spin_lock_irqsave(&fsl_chan->desc_lock, flags); /* Find the first un-transfer desciptor */ for (ld_node = fsl_chan->ld_queue.next; @@ -617,8 +618,6 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan) fsl_chan->common.cookie) == DMA_SUCCESS); ld_node = ld_node->next); - spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); - if (ld_node != &fsl_chan->ld_queue) { /* Get the ld start address from ld_queue */ next_dest_addr = to_fsl_desc(ld_node)->async_tx.phys; @@ -630,6 +629,9 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan) set_cdar(fsl_chan, 0); set_ndar(fsl_chan, 0); } + +out_unlock: + spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); } /** From bcfb7465c03a8c62c89da374677df56f6b894d44 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Fri, 15 May 2009 14:27:16 -0700 Subject: [PATCH 521/900] fsldma: fix infinite loop on multi-descriptor DMA chain completion When creating a DMA transaction with multiple descriptors, the async_tx cookie is set to 0 for each descriptor in the chain, excluding the last descriptor, whose cookie is set to -EBUSY. When fsl_dma_tx_submit() is run, it only assigns a cookie to the first descriptor. All of the remaining descriptors keep their original value, including the last descriptor, which is set to -EBUSY. 
After the DMA completes, the driver will update the last completed cookie to be -EBUSY, which is an error code instead of a valid cookie. This causes dma_async_is_complete() to always return DMA_IN_PROGRESS. This causes the fsldma driver to never cleanup the queue of link descriptors, and the driver will re-run the DMA transaction on the hardware each time it receives the End-of-Chain interrupt. This causes an infinite loop. With this patch, fsl_dma_tx_submit() is changed to assign a cookie to every descriptor in the chain. The rest of the code then works without problems. Signed-off-by: Ira W. Snyder Signed-off-by: Li Yang --- drivers/dma/fsldma.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index a4151c3bb78..7313a1ae5f8 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -313,8 +313,8 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable) static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx) { - struct fsl_desc_sw *desc = tx_to_fsl_desc(tx); struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan); + struct fsl_desc_sw *desc; unsigned long flags; dma_cookie_t cookie; @@ -322,14 +322,17 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx) spin_lock_irqsave(&fsl_chan->desc_lock, flags); cookie = fsl_chan->common.cookie; - cookie++; - if (cookie < 0) - cookie = 1; - desc->async_tx.cookie = cookie; - fsl_chan->common.cookie = desc->async_tx.cookie; + list_for_each_entry(desc, &tx->tx_list, node) { + cookie++; + if (cookie < 0) + cookie = 1; - append_ld_queue(fsl_chan, desc); - list_splice_init(&desc->async_tx.tx_list, fsl_chan->ld_queue.prev); + desc->async_tx.cookie = cookie; + } + + fsl_chan->common.cookie = cookie; + append_ld_queue(fsl_chan, tx_to_fsl_desc(tx)); + list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev); spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); From 
776c8943f2766f2819fafd88fdfbaf418ecd6e41 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Fri, 15 May 2009 11:33:20 -0700 Subject: [PATCH 522/900] fsldma: snooping is not enabled for last entry in descriptor chain On the 83xx controller, snooping is necessary for the DMA controller to ensure cache coherence with the CPU when transferring to/from RAM. The last descriptor in a chain will always have the End-of-Chain interrupt bit set, so we can set the snoop bit while adding the End-of-Chain interrupt bit. Signed-off-by: Ira W. Snyder Signed-off-by: Li Yang --- drivers/dma/fsldma.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 7313a1ae5f8..ff9194d7ebb 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -179,9 +179,14 @@ static void dma_halt(struct fsl_dma_chan *fsl_chan) static void set_ld_eol(struct fsl_dma_chan *fsl_chan, struct fsl_desc_sw *desc) { + u64 snoop_bits; + + snoop_bits = ((fsl_chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_83XX) + ? FSL_DMA_SNEN : 0; + desc->hw.next_ln_addr = CPU_TO_DMA(fsl_chan, - DMA_TO_CPU(fsl_chan, desc->hw.next_ln_addr, 64) | FSL_DMA_EOL, - 64); + DMA_TO_CPU(fsl_chan, desc->hw.next_ln_addr, 64) | FSL_DMA_EOL + | snoop_bits, 64); } static void append_ld_queue(struct fsl_dma_chan *fsl_chan, From 2e077f8e8337e52eef3c39c24c31e103b11a0326 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Fri, 15 May 2009 09:59:46 -0700 Subject: [PATCH 523/900] fsldma: fix memory leak on error path in fsl_dma_prep_memcpy() When preparing a memcpy operation, if the kernel fails to allocate memory for a link descriptor after the first link descriptor has already been allocated, then some memory will never be released. Fix the problem by walking the list of allocated descriptors backwards, and freeing the allocated descriptors back into the DMA pool. Signed-off-by: Ira W. 
Snyder Signed-off-by: Li Yang --- drivers/dma/fsldma.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index ff9194d7ebb..15783102bf1 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -462,8 +462,8 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy( { struct fsl_dma_chan *fsl_chan; struct fsl_desc_sw *first = NULL, *prev = NULL, *new; + struct list_head *list; size_t copy; - LIST_HEAD(link_chain); if (!chan) return NULL; @@ -480,7 +480,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy( if (!new) { dev_err(fsl_chan->dev, "No free memory for link descriptor\n"); - return NULL; + goto fail; } #ifdef FSL_DMA_LD_DEBUG dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new); @@ -515,7 +515,19 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy( /* Set End-of-link to the last link descriptor of new list*/ set_ld_eol(fsl_chan, new); - return first ? &first->async_tx : NULL; + return &first->async_tx; + +fail: + if (!first) + return NULL; + + list = &first->async_tx.tx_list; + list_for_each_entry_safe_reverse(new, prev, list, node) { + list_del(&new->node); + dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys); + } + + return NULL; } /** From 0e1b74df992c1ef92213ab26f952befda2087f59 Mon Sep 17 00:00:00 2001 From: Mingwei Wang Date: Wed, 20 May 2009 16:49:57 +0800 Subject: [PATCH 524/900] [ARM] pxa: fix the incorrectly defined drive strength macros for pxa{168,910} Signed-off-by: Mingwei Wang Signed-off-by: Eric Miao --- arch/arm/mach-mmp/include/mach/mfp-pxa168.h | 5 +++++ arch/arm/mach-mmp/include/mach/mfp-pxa910.h | 5 +++++ arch/arm/mach-mmp/include/mach/mfp.h | 9 +++------ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-mmp/include/mach/mfp-pxa168.h b/arch/arm/mach-mmp/include/mach/mfp-pxa168.h index d0bdb6e3682..2e914649b9e 100644 --- a/arch/arm/mach-mmp/include/mach/mfp-pxa168.h +++ 
b/arch/arm/mach-mmp/include/mach/mfp-pxa168.h @@ -3,6 +3,11 @@ #include +#define MFP_DRIVE_VERY_SLOW (0x0 << 13) +#define MFP_DRIVE_SLOW (0x1 << 13) +#define MFP_DRIVE_MEDIUM (0x2 << 13) +#define MFP_DRIVE_FAST (0x3 << 13) + /* GPIO */ #define GPIO0_GPIO MFP_CFG(GPIO0, AF5) #define GPIO1_GPIO MFP_CFG(GPIO1, AF5) diff --git a/arch/arm/mach-mmp/include/mach/mfp-pxa910.h b/arch/arm/mach-mmp/include/mach/mfp-pxa910.h index 48a1cbc7c56..d97de36c50a 100644 --- a/arch/arm/mach-mmp/include/mach/mfp-pxa910.h +++ b/arch/arm/mach-mmp/include/mach/mfp-pxa910.h @@ -3,6 +3,11 @@ #include +#define MFP_DRIVE_VERY_SLOW (0x0 << 13) +#define MFP_DRIVE_SLOW (0x2 << 13) +#define MFP_DRIVE_MEDIUM (0x4 << 13) +#define MFP_DRIVE_FAST (0x8 << 13) + /* UART2 */ #define GPIO47_UART2_RXD MFP_CFG(GPIO47, AF6) #define GPIO48_UART2_TXD MFP_CFG(GPIO48, AF6) diff --git a/arch/arm/mach-mmp/include/mach/mfp.h b/arch/arm/mach-mmp/include/mach/mfp.h index 277ea4cd0f9..62e510e80a5 100644 --- a/arch/arm/mach-mmp/include/mach/mfp.h +++ b/arch/arm/mach-mmp/include/mach/mfp.h @@ -12,16 +12,13 @@ * possible, we make the following compromise: * * 1. SLEEP_OE_N will always be programmed to '1' (by MFP_LPM_FLOAT) - * 2. DRIVE strength definitions redefined to include the reserved bit10 + * 2. DRIVE strength definitions redefined to include the reserved bit + * - the reserved bit differs between pxa168 and pxa910, and the + * MFP_DRIVE_* macros are individually defined in mfp-pxa{168,910}.h * 3. Override MFP_CFG() and MFP_CFG_DRV() * 4. 
Drop the use of MFP_CFG_LPM() and MFP_CFG_X() */ -#define MFP_DRIVE_VERY_SLOW (0x0 << 13) -#define MFP_DRIVE_SLOW (0x2 << 13) -#define MFP_DRIVE_MEDIUM (0x4 << 13) -#define MFP_DRIVE_FAST (0x8 << 13) - #undef MFP_CFG #undef MFP_CFG_DRV #undef MFP_CFG_LPM From f5c81a327015844eb91087dd102648b5d984f33c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 23 Apr 2009 03:04:45 +0800 Subject: [PATCH 525/900] [ARM] pxa: add parameter to clksrc_read() for pxa168/910 This patch modifies parameter of clksrc_read() from 'void' to 'struct clocksource *cs', which fixes compile warning for incompatible parameter type. Signed-off-by: Coly Li Cc: Thomas Gleixner Signed-off-by: Eric Miao --- arch/arm/mach-mmp/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-mmp/time.c b/arch/arm/mach-mmp/time.c index b03a6eda741..a8400bb891e 100644 --- a/arch/arm/mach-mmp/time.c +++ b/arch/arm/mach-mmp/time.c @@ -136,7 +136,7 @@ static struct clock_event_device ckevt = { .set_mode = timer_set_mode, }; -static cycle_t clksrc_read(void) +static cycle_t clksrc_read(struct clocksource *cs) { return timer_read(); } From 6ec04f434d29aed33608e0ca4d8b100190e71e96 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 22 May 2009 01:39:10 +0200 Subject: [PATCH 526/900] [ARM] pxa/palm: fix PalmLD/T5/TX AC97 MFP Signed-off-by: Marek Vasut Signed-off-by: Eric Miao --- arch/arm/mach-pxa/palmld.c | 2 ++ arch/arm/mach-pxa/palmt5.c | 1 + arch/arm/mach-pxa/palmtx.c | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/arm/mach-pxa/palmld.c b/arch/arm/mach-pxa/palmld.c index 1cec1806f00..471a853e548 100644 --- a/arch/arm/mach-pxa/palmld.c +++ b/arch/arm/mach-pxa/palmld.c @@ -62,6 +62,8 @@ static unsigned long palmld_pin_config[] __initdata = { GPIO29_AC97_SDATA_IN_0, GPIO30_AC97_SDATA_OUT, GPIO31_AC97_SYNC, + GPIO89_AC97_SYSCLK, + GPIO95_AC97_nRESET, /* IrDA */ GPIO108_GPIO, /* ir disable */ diff --git a/arch/arm/mach-pxa/palmt5.c b/arch/arm/mach-pxa/palmt5.c index 
30662363907..05bf979b78a 100644 --- a/arch/arm/mach-pxa/palmt5.c +++ b/arch/arm/mach-pxa/palmt5.c @@ -64,6 +64,7 @@ static unsigned long palmt5_pin_config[] __initdata = { GPIO29_AC97_SDATA_IN_0, GPIO30_AC97_SDATA_OUT, GPIO31_AC97_SYNC, + GPIO89_AC97_SYSCLK, GPIO95_AC97_nRESET, /* IrDA */ diff --git a/arch/arm/mach-pxa/palmtx.c b/arch/arm/mach-pxa/palmtx.c index e2d44b1a8a9..e99a893c58a 100644 --- a/arch/arm/mach-pxa/palmtx.c +++ b/arch/arm/mach-pxa/palmtx.c @@ -65,6 +65,7 @@ static unsigned long palmtx_pin_config[] __initdata = { GPIO29_AC97_SDATA_IN_0, GPIO30_AC97_SDATA_OUT, GPIO31_AC97_SYNC, + GPIO89_AC97_SYSCLK, GPIO95_AC97_nRESET, /* IrDA */ From d5046853634a8d73f28bad3cf68d182c4a99035d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 22 May 2009 20:36:21 +0900 Subject: [PATCH 527/900] nilfs2: fix memory leak in nilfs_ioctl_clean_segments This fixes a new memory leak problem in garbage collection. The problem was brought by the bugfix patch ("nilfs2: fix lock order reversal in nilfs_clean_segments ioctl"). Thanks to Kentaro Suzuki for finding this problem. Reported-by: Kentaro Suzuki Signed-off-by: Ryusuke Konishi --- fs/nilfs2/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 50ff3f2cdf2..d6759b92006 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -576,7 +576,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); out_free: - while (--n > 0) + while (--n >= 0) vfree(kbufs[n]); kfree(kbufs[4]); return ret; From 63d3892379f93b73ef905fb3449f4e4438a53b40 Mon Sep 17 00:00:00 2001 From: Wu Zhangjin Date: Thu, 21 May 2009 05:50:01 +0800 Subject: [PATCH 528/900] MIPS: Fix sparse warning in incompatiable argument type of clear_user. The type of the second argument of access_ok should be (void __user *). 
The unnecessary conversion of the clear_user address argument was causing sparse to emit warnings on the __chk_user_ptr check. Signed-off-by: Wu Zhangjin Signed-off-by: Ralf Baechle --- arch/mips/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 8de858f5449..c2d53c18fd3 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -956,7 +956,7 @@ __clear_user(void __user *addr, __kernel_size_t size) void __user * __cl_addr = (addr); \ unsigned long __cl_size = (n); \ if (__cl_size && access_ok(VERIFY_WRITE, \ - ((unsigned long)(__cl_addr)), __cl_size)) \ + __cl_addr, __cl_size)) \ __cl_size = __clear_user(__cl_addr, __cl_size); \ __cl_size; \ }) From 63c901c7e6fb878805cd2f8f14fa3eee8c03ee84 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 22 May 2009 10:48:17 +0100 Subject: [PATCH 529/900] MIPS: IP32: Fix build error due to uninitialized variable. CC arch/mips/sgi-ip32/ip32-reset.o cc1: warnings being treated as errors arch/mips/sgi-ip32/ip32-reset.c: In function 'debounce': arch/mips/sgi-ip32/ip32-reset.c:97: error: 'reg_a' is used uninitialized in this function The issues is old but due to the volatile keyword gcc older than 4.4 did not warn about this obvious bug. 
Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip32/ip32-reset.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c index 667da932b7b..cc549a9a99e 100644 --- a/arch/mips/sgi-ip32/ip32-reset.c +++ b/arch/mips/sgi-ip32/ip32-reset.c @@ -94,6 +94,7 @@ static void debounce(unsigned long data) volatile unsigned char reg_a, reg_c, xctrl_a; reg_c = CMOS_READ(RTC_INTR_FLAGS); + reg_a = CMOS_READ(RTC_REG_A); CMOS_WRITE(reg_a | DS_REGA_DV0, RTC_REG_A); wbflush(); xctrl_a = CMOS_READ(DS_B1_XCTRL4A); From d2f82c2f70d56ba4623de25edb383fec01f43b89 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 22 May 2009 10:58:43 +0100 Subject: [PATCH 530/900] MIPS: IP32: Remove unnecessary if not even harmful volatile keywords. They are unneeded and as the issue fixed in lmo commit 63f7ec59053e3f850ab67a9938e631bcba64c6ce shows even harmful. Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip32/ip32-reset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c index cc549a9a99e..9b95d80ebc6 100644 --- a/arch/mips/sgi-ip32/ip32-reset.c +++ b/arch/mips/sgi-ip32/ip32-reset.c @@ -53,7 +53,7 @@ static inline void ip32_machine_halt(void) static void ip32_machine_power_off(void) { - volatile unsigned char reg_a, xctrl_a, xctrl_b; + unsigned char reg_a, xctrl_a, xctrl_b; disable_irq(MACEISA_RTC_IRQ); reg_a = CMOS_READ(RTC_REG_A); @@ -91,7 +91,7 @@ static void blink_timeout(unsigned long data) static void debounce(unsigned long data) { - volatile unsigned char reg_a, reg_c, xctrl_a; + unsigned char reg_a, reg_c, xctrl_a; reg_c = CMOS_READ(RTC_INTR_FLAGS); reg_a = CMOS_READ(RTC_REG_A); @@ -138,7 +138,7 @@ static inline void ip32_power_button(void) static irqreturn_t ip32_rtc_int(int irq, void *dev_id) { - volatile unsigned char reg_c; + unsigned char reg_c; reg_c = CMOS_READ(RTC_INTR_FLAGS); if (!(reg_c & RTC_IRQF)) { From 
8369d5fa63260cc54464b4687aa6a0f78402d98e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 22 May 2009 16:23:36 +0200 Subject: [PATCH 531/900] ide: fix 40-wire cable detection for TSST SH-S202* ATAPI devices (v2) Since 2.6.26 we support UDMA66 on ATAPI devices requiring IVB quirk: commit 8588a2b732928b343233af9b1855705b8286bed4 ("ide: add SH-S202J to ivb_list[]") We also later added support for more such devices in: commit e97564f362a93f8c248246c19828895950341252 ("ide: More TSST drives with broken cable detection") and in: commit 3ced5c49bd2d1f2c7f769e3a54385883de63a652 ("ide: add TSSTcorp CDDVDW SH-S202H to ivb_list[]") It turns out that such devices lack cable detection altogether (which in turn results in incorrect detection of 40-wire cables by our current cable detection strategy) so always handle them by trusting host-side cable detection only. v2: Model detection fixup from Martin. Reported-and-tested-by: Martin Lottermoser Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-iops.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index c19a221b1e1..06fe002116e 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -206,8 +206,6 @@ EXPORT_SYMBOL_GPL(ide_in_drive_list); /* * Early UDMA66 devices don't set bit14 to 1, only bit13 is valid. - * We list them here and depend on the device side cable detection for them. - * * Some optical devices with the buggy firmwares have the same problem. 
*/ static const struct drive_list_entry ivb_list[] = { @@ -251,10 +249,25 @@ u8 eighty_ninty_three(ide_drive_t *drive) * - force bit13 (80c cable present) check also for !ivb devices * (unless the slave device is pre-ATA3) */ - if ((id[ATA_ID_HW_CONFIG] & 0x4000) || - (ivb && (id[ATA_ID_HW_CONFIG] & 0x2000))) + if (id[ATA_ID_HW_CONFIG] & 0x4000) return 1; + if (ivb) { + const char *model = (char *)&id[ATA_ID_PROD]; + + if (strstr(model, "TSSTcorp CDDVDW SH-S202")) { + /* + * These ATAPI devices always report 80c cable + * so we have to depend on the host in this case. + */ + if (hwif->cbl == ATA_CBL_PATA80) + return 1; + } else { + /* Depend on the device side cable detection. */ + if (id[ATA_ID_HW_CONFIG] & 0x2000) + return 1; + } + } no_80w: if (drive->dev_flags & IDE_DFLAG_UDMA33_WARNED) return 0; From e3b29f05124b07303088795396ff858811d2acb8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 22 May 2009 16:23:37 +0200 Subject: [PATCH 532/900] ide: fix OOPS during ide-cd error recovery On Tuesday 19 May 2009 20:29:28 Martin Lottermoser wrote: > hdc: cdrom_decode_status: error=0x40 <3>{ LastFailedSense=0x04 } > ide: failed opcode was: unknown > hdc: DMA disabled > ------------[ cut here ]------------ > kernel BUG at drivers/ide/ide-io.c:872! It is possible for ide-cd to ignore ide_error()'s return value under some circumstances. Work around it in ide_intr() and ide_timer_expiry() by checking whether a device/port reset is currently pending.
Fixes bug #13345: http://bugzilla.kernel.org/show_bug.cgi?id=13345 Reported-by: Martin Lottermoser Reported-and-tested-by: Modestas Vainius Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 35dc38d3b2c..6415a2e2ba8 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -696,7 +696,7 @@ void ide_timer_expiry (unsigned long data) } spin_lock_irq(&hwif->lock); enable_irq(hwif->irq); - if (startstop == ide_stopped) { + if (startstop == ide_stopped && hwif->polling == 0) { ide_unlock_port(hwif); plug_device = 1; } @@ -868,7 +868,7 @@ irqreturn_t ide_intr (int irq, void *dev_id) * same irq as is currently being serviced here, and Linux * won't allow another of the same (on any CPU) until we return. */ - if (startstop == ide_stopped) { + if (startstop == ide_stopped && hwif->polling == 0) { BUG_ON(hwif->handler); ide_unlock_port(hwif); plug_device = 1; From 26bfcf21e25fa090f099fa0ccf201424989cbd7b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 22 May 2009 16:23:37 +0200 Subject: [PATCH 533/900] ide: fix printk() levels in ide_dump_ata[pi]_error() Fixes "<3>" in error messages like this one: hdc: cdrom_decode_status: error=0x40 <3>{ LastFailedSense=0x04 } Reported-by: Martin Lottermoser Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 56ff8c46c7d..85b9bae111f 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -91,7 +91,7 @@ static void ide_dump_sector(ide_drive_t *drive) static void ide_dump_ata_error(ide_drive_t *drive, u8 err) { - printk(KERN_ERR "{ "); + printk(KERN_CONT "{ "); if (err & ATA_ABORTED) printk(KERN_CONT "DriveStatusError "); if (err & ATA_ICRC) @@ -121,7 +121,7 @@ static void ide_dump_ata_error(ide_drive_t *drive, u8 err) 
static void ide_dump_atapi_error(ide_drive_t *drive, u8 err) { - printk(KERN_ERR "{ "); + printk(KERN_CONT "{ "); if (err & ATAPI_ILI) printk(KERN_CONT "IllegalLengthIndication "); if (err & ATAPI_EOM) From cc30137a221372c67a943ad9ea68121a2bd57a6e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 22 May 2009 16:23:38 +0200 Subject: [PATCH 534/900] ide: improve failed opcode reporting Nowadays we (almost) always store the currently executing command in hwif->cmd so we can use it for the failed opcode reporting. Cc: Martin Lottermoser Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-lib.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 85b9bae111f..2148df836ce 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -31,24 +31,6 @@ void ide_toggle_bounce(ide_drive_t *drive, int on) blk_queue_bounce_limit(drive->queue, addr); } -static void ide_dump_opcode(ide_drive_t *drive) -{ - struct request *rq = drive->hwif->rq; - struct ide_cmd *cmd = NULL; - - if (!rq) - return; - - if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) - cmd = rq->special; - - printk(KERN_ERR "ide: failed opcode was: "); - if (cmd == NULL) - printk(KERN_CONT "unknown\n"); - else - printk(KERN_CONT "0x%02x\n", cmd->tf.command); -} - u64 ide_get_lba_addr(struct ide_cmd *cmd, int lba48) { struct ide_taskfile *tf = &cmd->tf; @@ -179,7 +161,10 @@ u8 ide_dump_status(ide_drive_t *drive, const char *msg, u8 stat) else ide_dump_atapi_error(drive, err); } - ide_dump_opcode(drive); + + printk(KERN_ERR "%s: possibly failed opcode: 0x%02x\n", + drive->name, drive->hwif->cmd.tf.command); + return err; } EXPORT_SYMBOL(ide_dump_status); From 28ee9bc5cc42776e0364399b401a64906ac1ac8e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 22 May 2009 16:23:38 +0200 Subject: [PATCH 535/900] ide: report timeouts in ide_busy_sleep() * change 'hwif' argument to 'drive' * report an 
error on timeout Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 9 ++++++--- include/linux/ide.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 7f264ed1141..c895ed52b2e 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -295,7 +295,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; - if (ide_busy_sleep(hwif, timeout, use_altstatus)) + if (ide_busy_sleep(drive, timeout, use_altstatus)) return 1; /* wait for IRQ and ATA_DRQ */ @@ -316,8 +316,9 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) return rc; } -int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus) +int ide_busy_sleep(ide_drive_t *drive, unsigned long timeout, int altstatus) { + ide_hwif_t *hwif = drive->hwif; u8 stat; timeout += jiffies; @@ -330,6 +331,8 @@ int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus) return 0; } while (time_before(jiffies, timeout)); + printk(KERN_ERR "%s: timeout in %s\n", drive->name, __func__); + return 1; /* drive timed-out */ } @@ -420,7 +423,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) tp_ops->dev_select(drive); msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); - (void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0); + (void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0); rc = ide_dev_read_id(drive, cmd, id); } diff --git a/include/linux/ide.h b/include/linux/ide.h index ff65fffb078..9fed365a598 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1109,7 +1109,7 @@ void ide_fix_driveid(u16 *); extern void ide_fixstring(u8 *, const int, const int); -int ide_busy_sleep(ide_hwif_t *, unsigned long, int); +int ide_busy_sleep(ide_drive_t *, unsigned long, int); int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); From 5993856e53fbc4b4f28e2d481deaebeb715b1267 Mon Sep 17 00:00:00 2001 From: 
Harald Welte Date: Fri, 22 May 2009 16:23:39 +0200 Subject: [PATCH 536/900] via82cxxx: Add VIA VX855 PCI Device ID This patch adds the PCI Device ID 0xc409 to the PCI ID table of via82cxxx.c, as well as the 0x8409 south bridge ID. This is required to make the IDE driver work on the VX855/VX875 integrated chipset. Signed-off-by: Harald Welte Cc: Joseph Chan Cc: Bruce Chang Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/via82cxxx.c | 2 ++ include/linux/pci_ids.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index 3ff7231e485..028de26a25f 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -67,6 +67,7 @@ static struct via_isa_bridge { u8 udma_mask; u8 flags; } via_isa_bridges[] = { + { "vx855", PCI_DEVICE_ID_VIA_VX855, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "vx800", PCI_DEVICE_ID_VIA_VX800, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, @@ -474,6 +475,7 @@ static const struct pci_device_id via_pci_tbl[] = { { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C576_1), 0 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C586_1), 0 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_CX700_IDE), 0 }, + { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_VX855_IDE), 0 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_6410), 1 }, { PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_SATA_EIDE), 1 }, { 0, }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 06ba90c211a..0f71812d67d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1406,7 +1406,7 @@ #define PCI_DEVICE_ID_VIA_82C598_1 0x8598 #define PCI_DEVICE_ID_VIA_838X_1 0xB188 #define PCI_DEVICE_ID_VIA_83_87XX_1 0xB198 -#define PCI_DEVICE_ID_VIA_C409_IDE 0XC409 +#define PCI_DEVICE_ID_VIA_VX855_IDE 0xC409 #define PCI_DEVICE_ID_VIA_ANON 0xFFFF #define PCI_VENDOR_ID_SIEMENS 0x110A From 
9a2845c453d170e4e9b1437fa671dbf39b0e7bd8 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 May 2009 13:36:17 -0500 Subject: [PATCH 537/900] ipmi: fix ipmi_si modprobe hang Instead of queuing IPMB messages before channel initialization, just throw them away. Nobody will be listening for them at this point, anyway, and they will clog up the queue and nothing will be delivered if we queue them. Also set the current channel to the number of channels, as this value is used to tell if the channel information has been initialized. Signed-off-by: Corey Minyard Cc: Ferenc Wagner Cc: Dan Frazier Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_msghandler.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index aa83a0865ec..09050797c76 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -2856,6 +2856,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, /* Assume a single IPMB channel at zero. */ intf->channels[0].medium = IPMI_CHANNEL_MEDIUM_IPMB; intf->channels[0].protocol = IPMI_CHANNEL_PROTOCOL_IPMB; + intf->curr_channel = IPMI_MAX_CHANNELS; } if (rv == 0) @@ -3648,13 +3649,13 @@ static int handle_new_recv_msg(ipmi_smi_t intf, } /* - ** We need to make sure the channels have been initialized. - ** The channel_handler routine will set the "curr_channel" - ** equal to or greater than IPMI_MAX_CHANNELS when all the - ** channels for this interface have been initialized. - */ + * We need to make sure the channels have been initialized. + * The channel_handler routine will set the "curr_channel" + * equal to or greater than IPMI_MAX_CHANNELS when all the + * channels for this interface have been initialized. 
+ */ if (intf->curr_channel < IPMI_MAX_CHANNELS) { - requeue = 1; /* Just put the message back for now */ + requeue = 0; /* Throw the message away */ goto out; } From afe6d7e3c4a9aba020637f4ae15527a89ba31f21 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Fri, 22 May 2009 17:48:58 +0200 Subject: [PATCH 538/900] ALSA: Kill truncate warning by shortening Sigmatel-specific AC97 control name ALSA sound/core/control.c:232: Control name 'Sigmatel Surround Phase Inversion Playback Switch' truncated to 'Sigmatel Surround Phase Inversion Playback ' bootup message by omitting weird Sigmatel prefix in this case; also fix up the related ca0106 mixer control removal part by using identical naming there. Signed-off-by: Andreas Mohr Signed-off-by: Takashi Iwai --- sound/pci/ac97/ac97_patch.c | 7 +++++-- sound/pci/ca0106/ca0106_mixer.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/pci/ac97/ac97_patch.c b/sound/pci/ac97/ac97_patch.c index 81bc93e5f1e..7337abdbe4e 100644 --- a/sound/pci/ac97/ac97_patch.c +++ b/sound/pci/ac97/ac97_patch.c @@ -958,10 +958,13 @@ static int patch_sigmatel_stac9708_3d(struct snd_ac97 * ac97) } static const struct snd_kcontrol_new snd_ac97_sigmatel_4speaker = -AC97_SINGLE("Sigmatel 4-Speaker Stereo Playback Switch", AC97_SIGMATEL_DAC2INVERT, 2, 1, 0); +AC97_SINGLE("Sigmatel 4-Speaker Stereo Playback Switch", + AC97_SIGMATEL_DAC2INVERT, 2, 1, 0); +/* "Sigmatel " removed due to excessive name length: */ static const struct snd_kcontrol_new snd_ac97_sigmatel_phaseinvert = -AC97_SINGLE("Sigmatel Surround Phase Inversion Playback Switch", AC97_SIGMATEL_DAC2INVERT, 3, 1, 0); +AC97_SINGLE("Surround Phase Inversion Playback Switch", + AC97_SIGMATEL_DAC2INVERT, 3, 1, 0); static const struct snd_kcontrol_new snd_ac97_sigmatel_controls[] = { AC97_SINGLE("Sigmatel DAC 6dB Attenuate", AC97_SIGMATEL_ANALOG, 1, 1, 0), diff --git a/sound/pci/ca0106/ca0106_mixer.c b/sound/pci/ca0106/ca0106_mixer.c index ad2888705d2..c111efe61c3 100644 --- 
a/sound/pci/ca0106/ca0106_mixer.c +++ b/sound/pci/ca0106/ca0106_mixer.c @@ -800,7 +800,7 @@ int __devinit snd_ca0106_mixer(struct snd_ca0106 *emu) "Capture Volume", "External Amplifier", "Sigmatel 4-Speaker Stereo Playback Switch", - "Sigmatel Surround Phase Inversion Playback ", + "Surround Phase Inversion Playback Switch", NULL }; static char *ca0106_rename_ctls[] = { From bca23dba760d6705c013f89113c46570378fb626 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 May 2009 11:46:16 -0700 Subject: [PATCH 539/900] x86, setup: revert ACPI 3 E820 extended attributes support Remove ACPI 3 E820 extended memory attributes support. At least one vendor actively set all the flags to zero, but left ECX on return at 24. This bug may be present in other BIOSes. The breakage functionally means the ACPI 3 flags are probably completely useless, and that no OS any time soon is going to rely on their existence. Therefore, drop support completely. We may want to revisit this question in the future, if we find ourselves actually needing the flags. This reverts all or part of the following checkins: cd670599b7b00d9263f6f11a05c0edeb9cbedaf3 c549e71d073a6e9a4847497344db28a784061455 However, retain the part from the latter commit that copies e820 into a temporary buffer; that is an unrelated BIOS workaround. Put in a comment to explain that part. See https://bugzilla.redhat.com/show_bug.cgi?id=499396 for some additional information. [ Impact: detect all memory on affected machines ] Reported-by: Thomas J. Baker Signed-off-by: H. 
Peter Anvin Acked-by: Len Brown Cc: Chuck Ebbert Cc: Kyle McMartin Cc: Matt Domsch --- arch/x86/boot/memory.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 5054c2ddd1a..74b3d2ba84e 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -17,11 +17,6 @@ #define SMAP 0x534d4150 /* ASCII "SMAP" */ -struct e820_ext_entry { - struct e820entry std; - u32 ext_flags; -} __attribute__((packed)); - static int detect_memory_e820(void) { int count = 0; @@ -29,13 +24,21 @@ static int detect_memory_e820(void) u32 size, id, edi; u8 err; struct e820entry *desc = boot_params.e820_map; - static struct e820_ext_entry buf; /* static so it is zeroed */ + static struct e820entry buf; /* static so it is zeroed */ /* - * Set this here so that if the BIOS doesn't change this field - * but still doesn't change %ecx, we're still okay... + * Note: at least one BIOS is known which assumes that the + * buffer pointed to by one e820 call is the same one as + * the previous call, and only changes modified fields. Therefore, + * we use a temporary buffer and copy the results entry by entry. + * + * This routine deliberately does not try to account for + * ACPI 3+ extended attributes. This is because there are + * BIOSes in the field which report zero for the valid bit for + * all ranges, and we don't currently make any use of the + * other attribute bits. Revisit this if we see the extended + * attribute bits deployed in a meaningful way in the future. */ - buf.ext_flags = 1; do { size = sizeof buf; @@ -66,13 +69,7 @@ static int detect_memory_e820(void) break; } - /* ACPI 3.0 added the extended flags support. If bit 0 - in the extended flags is zero, we're supposed to simply - ignore the entry -- a backwards incompatible change! 
*/ - if (size > 20 && !(buf.ext_flags & 1)) - continue; - - *desc++ = buf.std; + *desc++ = buf; count++; } while (next && count < ARRAY_SIZE(boot_params.e820_map)); From 14b60391587ab9b2207c4fb6281763a93ae85e0f Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Wed, 20 May 2009 16:47:08 -0400 Subject: [PATCH 540/900] i915: support 8xx desktop cursors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some reason we never added 8xx desktop cursor support to the kernel. This patch fixes that. [krh: Also set the size on pre-i915 hw.] Tested-by: Kristian Høgsberg Signed-off-by: Jesse Barnes Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/i915_gem.c | 4 ++-- drivers/gpu/drm/i915/i915_reg.h | 17 +++++++++++++++++ drivers/gpu/drm/i915/intel_display.c | 26 ++++++++++++++++++++------ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4a24c90fb94..717b6a854bc 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1145,7 +1145,7 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; } - list_add(&obj_priv->list, &dev_priv->mm.inactive_list); + list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list); } /* Need a new fence register? 
*/ @@ -1375,7 +1375,7 @@ i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data, mutex_unlock(&dev->struct_mutex); return ret; } - list_add(&obj_priv->list, &dev_priv->mm.inactive_list); + list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list); } drm_gem_object_unreference(obj); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 15da44cf21b..9668cc0d7f4 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -1410,9 +1410,25 @@ /* Cursor A & B regs */ #define CURACNTR 0x70080 +/* Old style CUR*CNTR flags (desktop 8xx) */ +#define CURSOR_ENABLE 0x80000000 +#define CURSOR_GAMMA_ENABLE 0x40000000 +#define CURSOR_STRIDE_MASK 0x30000000 +#define CURSOR_FORMAT_SHIFT 24 +#define CURSOR_FORMAT_MASK (0x07 << CURSOR_FORMAT_SHIFT) +#define CURSOR_FORMAT_2C (0x00 << CURSOR_FORMAT_SHIFT) +#define CURSOR_FORMAT_3C (0x01 << CURSOR_FORMAT_SHIFT) +#define CURSOR_FORMAT_4C (0x02 << CURSOR_FORMAT_SHIFT) +#define CURSOR_FORMAT_ARGB (0x04 << CURSOR_FORMAT_SHIFT) +#define CURSOR_FORMAT_XRGB (0x05 << CURSOR_FORMAT_SHIFT) +/* New style CUR*CNTR flags */ +#define CURSOR_MODE 0x27 #define CURSOR_MODE_DISABLE 0x00 #define CURSOR_MODE_64_32B_AX 0x07 #define CURSOR_MODE_64_ARGB_AX ((1 << 5) | CURSOR_MODE_64_32B_AX) +#define MCURSOR_PIPE_SELECT (1 << 28) +#define MCURSOR_PIPE_A 0x00 +#define MCURSOR_PIPE_B (1 << 28) #define MCURSOR_GAMMA_ENABLE (1 << 26) #define CURABASE 0x70084 #define CURAPOS 0x70088 @@ -1420,6 +1436,7 @@ #define CURSOR_POS_SIGN 0x8000 #define CURSOR_X_SHIFT 0 #define CURSOR_Y_SHIFT 16 +#define CURSIZE 0x700a0 #define CURBCNTR 0x700c0 #define CURBBASE 0x700c4 #define CURBPOS 0x700c8 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 3387cf32f38..c9d6f10ba92 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -1357,7 +1357,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc, int pipe = intel_crtc->pipe; 
uint32_t control = (pipe == 0) ? CURACNTR : CURBCNTR; uint32_t base = (pipe == 0) ? CURABASE : CURBBASE; - uint32_t temp; + uint32_t temp = I915_READ(control); size_t addr; int ret; @@ -1366,7 +1366,12 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc, /* if we want to turn off the cursor ignore width and height */ if (!handle) { DRM_DEBUG("cursor off\n"); - temp = CURSOR_MODE_DISABLE; + if (IS_MOBILE(dev) || IS_I9XX(dev)) { + temp &= ~(CURSOR_MODE | MCURSOR_GAMMA_ENABLE); + temp |= CURSOR_MODE_DISABLE; + } else { + temp &= ~(CURSOR_ENABLE | CURSOR_GAMMA_ENABLE); + } addr = 0; bo = NULL; mutex_lock(&dev->struct_mutex); @@ -1409,10 +1414,19 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc, addr = obj_priv->phys_obj->handle->busaddr; } - temp = 0; - /* set the pipe for the cursor */ - temp |= (pipe << 28); - temp |= CURSOR_MODE_64_ARGB_AX | MCURSOR_GAMMA_ENABLE; + if (!IS_I9XX(dev)) + I915_WRITE(CURSIZE, (height << 12) | width); + + /* Hooray for CUR*CNTR differences */ + if (IS_MOBILE(dev) || IS_I9XX(dev)) { + temp &= ~(CURSOR_MODE | MCURSOR_PIPE_SELECT); + temp |= CURSOR_MODE_64_ARGB_AX | MCURSOR_GAMMA_ENABLE; + temp |= (pipe << 28); /* Connect to correct pipe */ + } else { + temp &= ~(CURSOR_FORMAT_MASK); + temp |= CURSOR_ENABLE; + temp |= CURSOR_FORMAT_ARGB | CURSOR_GAMMA_ENABLE; + } finish: I915_WRITE(control, temp); From 8863170628da4b0b461eb96bf797df1dca0bd03e Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Wed, 13 May 2009 11:19:55 +0800 Subject: [PATCH 541/900] drm/i915: Fetch SDVO LVDS mode lines from VBT, then reserve them Signed-off-by: Ma Ling Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/i915_drv.h | 3 +- drivers/gpu/drm/i915/intel_bios.c | 104 +++++++++++++++++++++--------- drivers/gpu/drm/i915/intel_bios.h | 17 +++++ drivers/gpu/drm/i915/intel_lvds.c | 4 +- 4 files changed, 94 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9b149fe824c..c431fa54bbb 100644 --- 
a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -180,7 +180,8 @@ typedef struct drm_i915_private { int backlight_duty_cycle; /* restore backlight to this value */ bool panel_wants_dither; struct drm_display_mode *panel_fixed_mode; - struct drm_display_mode *vbt_mode; /* if any */ + struct drm_display_mode *lfp_lvds_vbt_mode; /* if any */ + struct drm_display_mode *sdvo_lvds_vbt_mode; /* if any */ /* Feature bits from the VBIOS */ unsigned int int_tv_support:1; diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index fc28e2bbd54..9d78cff33b2 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -57,9 +57,43 @@ find_section(struct bdb_header *bdb, int section_id) return NULL; } -/* Try to find panel data */ static void -parse_panel_data(struct drm_i915_private *dev_priv, struct bdb_header *bdb) +fill_detail_timing_data(struct drm_display_mode *panel_fixed_mode, + struct lvds_dvo_timing *dvo_timing) +{ + panel_fixed_mode->hdisplay = (dvo_timing->hactive_hi << 8) | + dvo_timing->hactive_lo; + panel_fixed_mode->hsync_start = panel_fixed_mode->hdisplay + + ((dvo_timing->hsync_off_hi << 8) | dvo_timing->hsync_off_lo); + panel_fixed_mode->hsync_end = panel_fixed_mode->hsync_start + + dvo_timing->hsync_pulse_width; + panel_fixed_mode->htotal = panel_fixed_mode->hdisplay + + ((dvo_timing->hblank_hi << 8) | dvo_timing->hblank_lo); + + panel_fixed_mode->vdisplay = (dvo_timing->vactive_hi << 8) | + dvo_timing->vactive_lo; + panel_fixed_mode->vsync_start = panel_fixed_mode->vdisplay + + dvo_timing->vsync_off; + panel_fixed_mode->vsync_end = panel_fixed_mode->vsync_start + + dvo_timing->vsync_pulse_width; + panel_fixed_mode->vtotal = panel_fixed_mode->vdisplay + + ((dvo_timing->vblank_hi << 8) | dvo_timing->vblank_lo); + panel_fixed_mode->clock = dvo_timing->clock * 10; + panel_fixed_mode->type = DRM_MODE_TYPE_PREFERRED; + + /* Some VBTs have bogus h/vtotal values */ + if 
(panel_fixed_mode->hsync_end > panel_fixed_mode->htotal) + panel_fixed_mode->htotal = panel_fixed_mode->hsync_end + 1; + if (panel_fixed_mode->vsync_end > panel_fixed_mode->vtotal) + panel_fixed_mode->vtotal = panel_fixed_mode->vsync_end + 1; + + drm_mode_set_name(panel_fixed_mode); +} + +/* Try to find integrated panel data */ +static void +parse_lfp_panel_data(struct drm_i915_private *dev_priv, + struct bdb_header *bdb) { struct bdb_lvds_options *lvds_options; struct bdb_lvds_lfp_data *lvds_lfp_data; @@ -91,35 +125,9 @@ parse_panel_data(struct drm_i915_private *dev_priv, struct bdb_header *bdb) panel_fixed_mode = drm_calloc(1, sizeof(*panel_fixed_mode), DRM_MEM_DRIVER); - panel_fixed_mode->hdisplay = (dvo_timing->hactive_hi << 8) | - dvo_timing->hactive_lo; - panel_fixed_mode->hsync_start = panel_fixed_mode->hdisplay + - ((dvo_timing->hsync_off_hi << 8) | dvo_timing->hsync_off_lo); - panel_fixed_mode->hsync_end = panel_fixed_mode->hsync_start + - dvo_timing->hsync_pulse_width; - panel_fixed_mode->htotal = panel_fixed_mode->hdisplay + - ((dvo_timing->hblank_hi << 8) | dvo_timing->hblank_lo); + fill_detail_timing_data(panel_fixed_mode, dvo_timing); - panel_fixed_mode->vdisplay = (dvo_timing->vactive_hi << 8) | - dvo_timing->vactive_lo; - panel_fixed_mode->vsync_start = panel_fixed_mode->vdisplay + - dvo_timing->vsync_off; - panel_fixed_mode->vsync_end = panel_fixed_mode->vsync_start + - dvo_timing->vsync_pulse_width; - panel_fixed_mode->vtotal = panel_fixed_mode->vdisplay + - ((dvo_timing->vblank_hi << 8) | dvo_timing->vblank_lo); - panel_fixed_mode->clock = dvo_timing->clock * 10; - panel_fixed_mode->type = DRM_MODE_TYPE_PREFERRED; - - /* Some VBTs have bogus h/vtotal values */ - if (panel_fixed_mode->hsync_end > panel_fixed_mode->htotal) - panel_fixed_mode->htotal = panel_fixed_mode->hsync_end + 1; - if (panel_fixed_mode->vsync_end > panel_fixed_mode->vtotal) - panel_fixed_mode->vtotal = panel_fixed_mode->vsync_end + 1; - - drm_mode_set_name(panel_fixed_mode); - 
- dev_priv->vbt_mode = panel_fixed_mode; + dev_priv->lfp_lvds_vbt_mode = panel_fixed_mode; DRM_DEBUG("Found panel mode in BIOS VBT tables:\n"); drm_mode_debug_printmodeline(panel_fixed_mode); @@ -127,6 +135,39 @@ parse_panel_data(struct drm_i915_private *dev_priv, struct bdb_header *bdb) return; } +/* Try to find sdvo panel data */ +static void +parse_sdvo_panel_data(struct drm_i915_private *dev_priv, + struct bdb_header *bdb) +{ + struct bdb_sdvo_lvds_options *sdvo_lvds_options; + struct lvds_dvo_timing *dvo_timing; + struct drm_display_mode *panel_fixed_mode; + + dev_priv->sdvo_lvds_vbt_mode = NULL; + + sdvo_lvds_options = find_section(bdb, BDB_SDVO_LVDS_OPTIONS); + if (!sdvo_lvds_options) + return; + + dvo_timing = find_section(bdb, BDB_SDVO_PANEL_DTDS); + if (!dvo_timing) + return; + + panel_fixed_mode = drm_calloc(1, sizeof(*panel_fixed_mode), + DRM_MEM_DRIVER); + + if (!panel_fixed_mode) + return; + + fill_detail_timing_data(panel_fixed_mode, + dvo_timing + sdvo_lvds_options->panel_type); + + dev_priv->sdvo_lvds_vbt_mode = panel_fixed_mode; + + return; +} + static void parse_general_features(struct drm_i915_private *dev_priv, struct bdb_header *bdb) @@ -199,7 +240,8 @@ intel_init_bios(struct drm_device *dev) /* Grab useful general definitions */ parse_general_features(dev_priv, bdb); - parse_panel_data(dev_priv, bdb); + parse_lfp_panel_data(dev_priv, bdb); + parse_sdvo_panel_data(dev_priv, bdb); pci_unmap_rom(pdev, bios); diff --git a/drivers/gpu/drm/i915/intel_bios.h b/drivers/gpu/drm/i915/intel_bios.h index de621aad85b..8ca2cde1580 100644 --- a/drivers/gpu/drm/i915/intel_bios.h +++ b/drivers/gpu/drm/i915/intel_bios.h @@ -279,6 +279,23 @@ struct vch_bdb_22 { struct vch_panel_data panels[16]; } __attribute__((packed)); +struct bdb_sdvo_lvds_options { + u8 panel_backlight; + u8 h40_set_panel_type; + u8 panel_type; + u8 ssc_clk_freq; + u16 als_low_trip; + u16 als_high_trip; + u8 sclalarcoeff_tab_row_num; + u8 sclalarcoeff_tab_row_size; + u8 coefficient[8]; + u8 
panel_misc_bits_1; + u8 panel_misc_bits_2; + u8 panel_misc_bits_3; + u8 panel_misc_bits_4; +} __attribute__((packed)); + + bool intel_init_bios(struct drm_device *dev); /* diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 439a8651499..53731f0ffcb 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -511,10 +511,10 @@ void intel_lvds_init(struct drm_device *dev) } /* Failed to get EDID, what about VBT? */ - if (dev_priv->vbt_mode) { + if (dev_priv->lfp_lvds_vbt_mode) { mutex_lock(&dev->mode_config.mutex); dev_priv->panel_fixed_mode = - drm_mode_duplicate(dev, dev_priv->vbt_mode); + drm_mode_duplicate(dev, dev_priv->lfp_lvds_vbt_mode); mutex_unlock(&dev->mode_config.mutex); if (dev_priv->panel_fixed_mode) { dev_priv->panel_fixed_mode->type |= From 7086c87fb1446ceb37918ffa0941359a7c2ec6cf Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Wed, 13 May 2009 11:20:06 +0800 Subject: [PATCH 542/900] drm/i915: Return SDVO LVDS VBT mode if no EDID modes are detected. Some new SDVO LVDS hardware doesn't have DDC available, and this should fix the display on it. Signed-off-by: Ma Ling Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/intel_sdvo.c | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index 9913651c1e1..f79ebf4de63 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -69,6 +69,10 @@ struct intel_sdvo_priv { * This is set if we treat the device as HDMI, instead of DVI. */ bool is_hdmi; + /** + * This is set if we detect output of sdvo device as LVDS. 
+ */ + bool is_lvds; /** * Returned SDTV resolutions allowed for the current format, if the @@ -1543,6 +1547,37 @@ static void intel_sdvo_get_tv_modes(struct drm_connector *connector) } } +static void intel_sdvo_get_lvds_modes(struct drm_connector *connector) +{ + struct intel_output *intel_output = to_intel_output(connector); + struct intel_sdvo_priv *sdvo_priv = intel_output->dev_priv; + struct drm_i915_private *dev_priv = connector->dev->dev_private; + + /* + * Attempt to get the mode list from DDC. + * Assume that the preferred modes are + * arranged in priority order. + */ + /* set the bus switch and get the modes */ + intel_sdvo_set_control_bus_switch(intel_output, sdvo_priv->ddc_bus); + intel_ddc_get_modes(intel_output); + if (list_empty(&connector->probed_modes) == false) + return; + + /* Fetch modes from VBT */ + if (dev_priv->sdvo_lvds_vbt_mode != NULL) { + struct drm_display_mode *newmode; + newmode = drm_mode_duplicate(connector->dev, + dev_priv->sdvo_lvds_vbt_mode); + if (newmode != NULL) { + /* Guarantee the mode is preferred */ + newmode->type = (DRM_MODE_TYPE_PREFERRED | + DRM_MODE_TYPE_DRIVER); + drm_mode_probed_add(connector, newmode); + } + } +} + static int intel_sdvo_get_modes(struct drm_connector *connector) { struct intel_output *output = to_intel_output(connector); @@ -1550,6 +1585,8 @@ static int intel_sdvo_get_modes(struct drm_connector *connector) if (sdvo_priv->is_tv) intel_sdvo_get_tv_modes(connector); + else if (sdvo_priv->is_lvds == true) + intel_sdvo_get_lvds_modes(connector); else intel_sdvo_get_ddc_modes(connector); @@ -1720,6 +1757,8 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) } } + /* In defaut case sdvo lvds is false */ + sdvo_priv->is_lvds = false; intel_sdvo_get_capabilities(intel_output, &sdvo_priv->caps); if (sdvo_priv->caps.output_flags & @@ -1773,6 +1812,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = 
DRM_MODE_ENCODER_LVDS; connector_type = DRM_MODE_CONNECTOR_LVDS; + sdvo_priv->is_lvds = true; } else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_LVDS1) { @@ -1780,6 +1820,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_LVDS; connector_type = DRM_MODE_CONNECTOR_LVDS; + sdvo_priv->is_lvds = true; } else { From ad5b2a6db3eddc41358d8a73f5cfe1c38e7e3a19 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Fri, 15 May 2009 09:10:41 +0200 Subject: [PATCH 543/900] drm/i915: Determine type before initialising connector drm_connector_init sets both the connector type and the connector type_id on the newly initialised connector. As the connector type_id is coupled to the connector type, the connector type cannot simply be modified on an initialised connector. This patch changes the order of operations on intel_sdvo_init so that the type is determined before the connector is initialised. This fixes a bug whereby the name card0-VGA-1 would be allocated to both a CRT and an SDVO connector since the SDVO connector would be initialised with type 'unknown' and hence have its type_id assigned from the wrong pool. 
Signed-off-by: Jonas Bonn Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/intel_sdvo.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index f79ebf4de63..ded122c1ae2 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -1713,17 +1713,9 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) return false; } - connector = &intel_output->base; - - drm_connector_init(dev, connector, &intel_sdvo_connector_funcs, - DRM_MODE_CONNECTOR_Unknown); - drm_connector_helper_add(connector, &intel_sdvo_connector_helper_funcs); sdvo_priv = (struct intel_sdvo_priv *)(intel_output + 1); intel_output->type = INTEL_OUTPUT_SDVO; - connector->interlace_allowed = 0; - connector->doublescan_allowed = 0; - /* setup the DDC bus. */ if (output_device == SDVOB) i2cbus = intel_i2c_create(dev, GPIOE, "SDVOCTRL_E for SDVOB"); @@ -1731,7 +1723,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) i2cbus = intel_i2c_create(dev, GPIOE, "SDVOCTRL_E for SDVOC"); if (!i2cbus) - goto err_connector; + goto err_inteloutput; sdvo_priv->i2c_bus = i2cbus; @@ -1747,7 +1739,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) intel_output->i2c_bus = i2cbus; intel_output->dev_priv = sdvo_priv; - /* Read the regs to test if we can talk to the device */ for (i = 0; i < 0x40; i++) { if (!intel_sdvo_read_byte(intel_output, i, &ch[i])) { @@ -1768,7 +1759,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) else sdvo_priv->controlled_output = SDVO_OUTPUT_TMDS1; - connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_TMDS; connector_type = DRM_MODE_CONNECTOR_DVID; @@ -1786,7 +1776,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_SVID0) { sdvo_priv->controlled_output = SDVO_OUTPUT_SVID0; - 
connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_TVDAC; connector_type = DRM_MODE_CONNECTOR_SVIDEO; sdvo_priv->is_tv = true; @@ -1795,21 +1784,18 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_RGB0) { sdvo_priv->controlled_output = SDVO_OUTPUT_RGB0; - connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_DAC; connector_type = DRM_MODE_CONNECTOR_VGA; } else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_RGB1) { sdvo_priv->controlled_output = SDVO_OUTPUT_RGB1; - connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_DAC; connector_type = DRM_MODE_CONNECTOR_VGA; } else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_LVDS0) { sdvo_priv->controlled_output = SDVO_OUTPUT_LVDS0; - connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_LVDS; connector_type = DRM_MODE_CONNECTOR_LVDS; sdvo_priv->is_lvds = true; @@ -1817,7 +1803,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_LVDS1) { sdvo_priv->controlled_output = SDVO_OUTPUT_LVDS1; - connector->display_info.subpixel_order = SubPixelHorizontalRGB; encoder_type = DRM_MODE_ENCODER_LVDS; connector_type = DRM_MODE_CONNECTOR_LVDS; sdvo_priv->is_lvds = true; @@ -1836,9 +1821,16 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) goto err_i2c; } + connector = &intel_output->base; + drm_connector_init(dev, connector, &intel_sdvo_connector_funcs, + connector_type); + drm_connector_helper_add(connector, &intel_sdvo_connector_helper_funcs); + connector->interlace_allowed = 0; + connector->doublescan_allowed = 0; + connector->display_info.subpixel_order = SubPixelHorizontalRGB; + drm_encoder_init(dev, &intel_output->enc, &intel_sdvo_enc_funcs, encoder_type); drm_encoder_helper_add(&intel_output->enc, 
&intel_sdvo_helper_funcs); - connector->connector_type = connector_type; drm_mode_connector_attach_encoder(&intel_output->base, &intel_output->enc); drm_sysfs_connector_add(connector); @@ -1876,8 +1868,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) err_i2c: intel_i2c_destroy(intel_output->i2c_bus); -err_connector: - drm_connector_cleanup(connector); +err_inteloutput: kfree(intel_output); return false; From 0c752a93353d9b17dbe148312d732fbe06d235e1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 22 May 2009 12:17:45 -0700 Subject: [PATCH 544/900] x86: introduce noxsave boot parameter Introduce "noxsave" boot parameter which will disable the cpu's xsave/xrstor capabilities. Useful for debugging and working around xsave related issues. [ Impact: make it possible to debug problems in the field ] Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/kernel/cpu/common.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..fd5cac01303 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1535,6 +1535,10 @@ and is between 256 and 4096 characters. It is defined in the file register save and restore. The kernel will only save legacy floating-point registers on task switch. + noxsave [BUGS=X86] Disables x86 extended register state save + and restore using xsave. The kernel will fallback to + enabling legacy floating-point and sse state. + nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or wfi(ARM) instruction doesn't work correctly and not to use it. This is also useful when using JTAG debugger. 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1caefc82e6..77848d9fca6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -114,6 +114,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +static int __init x86_xsave_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + return 1; +} +__setup("noxsave", x86_xsave_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; From 619ac3b75a1e9b2df66857f6a0fb466f1da5fa9e Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Mon, 18 May 2009 16:12:46 +0800 Subject: [PATCH 545/900] drm/i915: Use an I2C algo to do the flip to SDVO DDC bus. Previously, we would set the control bus switch before calls were made to request EDID information over DDC. But recently the DDC code started doing multiple I2C transfers to get the EDID extensions as well. This tripped up SDVO, because the control bus switch is only in effect until the next STOP after a START. By doing our own algo, we can wrap each i2c transaction on the DDC I2C bus with the control bus switch it requires. 
freedesktop.org bug #21042 Signed-off-by: Ma Ling [anholt: Hand application for conflict, fixed error path] Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/intel_sdvo.c | 67 +++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index ded122c1ae2..f3ef6bfd8ff 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -1402,10 +1402,8 @@ static enum drm_connector_status intel_sdvo_detect(struct drm_connector *connect static void intel_sdvo_get_ddc_modes(struct drm_connector *connector) { struct intel_output *intel_output = to_intel_output(connector); - struct intel_sdvo_priv *sdvo_priv = intel_output->dev_priv; /* set the bus switch and get the modes */ - intel_sdvo_set_control_bus_switch(intel_output, sdvo_priv->ddc_bus); intel_ddc_get_modes(intel_output); #if 0 @@ -1601,6 +1599,9 @@ static void intel_sdvo_destroy(struct drm_connector *connector) if (intel_output->i2c_bus) intel_i2c_destroy(intel_output->i2c_bus); + if (intel_output->ddc_bus) + intel_i2c_destroy(intel_output->ddc_bus); + drm_sysfs_connector_remove(connector); drm_connector_cleanup(connector); kfree(intel_output); @@ -1697,12 +1698,56 @@ intel_sdvo_get_digital_encoding_mode(struct intel_output *output) return true; } +static struct intel_output * +intel_sdvo_chan_to_intel_output(struct intel_i2c_chan *chan) +{ + struct drm_device *dev = chan->drm_dev; + struct drm_connector *connector; + struct intel_output *intel_output = NULL; + + list_for_each_entry(connector, + &dev->mode_config.connector_list, head) { + if (to_intel_output(connector)->ddc_bus == chan) { + intel_output = to_intel_output(connector); + break; + } + } + return intel_output; +} + +static int intel_sdvo_master_xfer(struct i2c_adapter *i2c_adap, + struct i2c_msg msgs[], int num) +{ + struct intel_output *intel_output; + struct intel_sdvo_priv *sdvo_priv; + struct i2c_algo_bit_data *algo_data; 
+ struct i2c_algorithm *algo; + + algo_data = (struct i2c_algo_bit_data *)i2c_adap->algo_data; + intel_output = + intel_sdvo_chan_to_intel_output( + (struct intel_i2c_chan *)(algo_data->data)); + if (intel_output == NULL) + return -EINVAL; + + sdvo_priv = intel_output->dev_priv; + algo = (struct i2c_algorithm *)intel_output->i2c_bus->adapter.algo; + + intel_sdvo_set_control_bus_switch(intel_output, sdvo_priv->ddc_bus); + return algo->master_xfer(i2c_adap, msgs, num); +} + +static struct i2c_algorithm intel_sdvo_i2c_bit_algo = { + .master_xfer = intel_sdvo_master_xfer, +}; + bool intel_sdvo_init(struct drm_device *dev, int output_device) { struct drm_connector *connector; struct intel_output *intel_output; struct intel_sdvo_priv *sdvo_priv; struct intel_i2c_chan *i2cbus = NULL; + struct intel_i2c_chan *ddcbus = NULL; int connector_type; u8 ch[0x40]; int i; @@ -1748,6 +1793,20 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) } } + /* setup the DDC bus. */ + if (output_device == SDVOB) + ddcbus = intel_i2c_create(dev, GPIOE, "SDVOB DDC BUS"); + else + ddcbus = intel_i2c_create(dev, GPIOE, "SDVOC DDC BUS"); + + if (ddcbus == NULL) + goto err_i2c; + + intel_sdvo_i2c_bit_algo.functionality = + intel_output->i2c_bus->adapter.algo->functionality; + ddcbus->adapter.algo = &intel_sdvo_i2c_bit_algo; + intel_output->ddc_bus = ddcbus; + /* In defaut case sdvo lvds is false */ sdvo_priv->is_lvds = false; intel_sdvo_get_capabilities(intel_output, &sdvo_priv->caps); @@ -1862,11 +1921,11 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device) sdvo_priv->caps.output_flags & (SDVO_OUTPUT_TMDS1 | SDVO_OUTPUT_RGB1) ? 
'Y' : 'N'); - intel_output->ddc_bus = i2cbus; - return true; err_i2c: + if (ddcbus != NULL) + intel_i2c_destroy(intel_output->ddc_bus); intel_i2c_destroy(intel_output->i2c_bus); err_inteloutput: kfree(intel_output); From 0b827537e339c084ac9384df588969d400be9e0d Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 22 May 2009 13:23:37 -0700 Subject: [PATCH 546/900] x86: bugfix wbinvd() model check instead of family check wbinvd is supported on all CPUs 486 or later. But, pageattr.c is checking x86_model >= 4 before wbinvd(), which looks like an oversight bug. It was first introduced at one place by changeset d7c8f21a8cad0228c7c5ce2bb6dbd95d1ee49d13 and got copied over to second place in the same file later. [ Impact: fix missing cache flush on early-model CPUs, potential data corruption ] Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 797f9f107cb..2cc019a3f71 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -153,7 +153,7 @@ static void __cpa_flush_all(void *arg) */ __flush_tlb_all(); - if (cache && boot_cpu_data.x86_model >= 4) + if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } @@ -218,7 +218,7 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, /* 4M threshold */ if (numpages >= 1024) { - if (boot_cpu_data.x86_model >= 4) + if (boot_cpu_data.x86 >= 4) wbinvd(); return; } From 0af48f42df15b97080b450d24219dd95db7b929a Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 22 May 2009 13:23:38 -0700 Subject: [PATCH 547/900] x86: cpa_flush_array wbinvd should be done on all CPUs cpa_flush_array seems to prefer wbinvd() over clflush at 4M threshold. clflush needs to be done on only one CPU as per instruction definition. wbinvd() however, should be done on all CPUs. 
[ Impact: fix missing flush which could cause data corruption ] Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2cc019a3f71..0f9052bcec4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -204,6 +204,11 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } +static void wbinvd_local(void *unused) +{ + wbinvd(); +} + static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { @@ -219,7 +224,8 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, /* 4M threshold */ if (numpages >= 1024) { if (boot_cpu_data.x86 >= 4) - wbinvd(); + on_each_cpu(wbinvd_local, NULL, 1); + return; } /* From a49a018a6ea6d73742a81d673fe5ec4a7d2137b3 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 22 May 2009 16:53:40 -0400 Subject: [PATCH 548/900] [ARM] add coherent DMA mask for mv643xx_eth Since commit eb0519b5a1cf, mv643xx_eth is non functional on ARM because the platform device declaration does not include any coherent DMA mask and coherent memory allocations fail. 
Signed-off-by: Nicolas Pitre --- arch/arm/mach-kirkwood/common.c | 6 ++++++ arch/arm/mach-loki/common.c | 6 ++++++ arch/arm/mach-mv78xx0/common.c | 12 ++++++++++++ arch/arm/mach-orion5x/common.c | 3 +++ 4 files changed, 27 insertions(+) diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index 3fab82a4c8f..be1ca28fed3 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -144,6 +144,9 @@ static struct platform_device kirkwood_ge00 = { .id = 0, .num_resources = 1, .resource = kirkwood_ge00_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init kirkwood_ge00_init(struct mv643xx_eth_platform_data *eth_data) @@ -202,6 +205,9 @@ static struct platform_device kirkwood_ge01 = { .id = 1, .num_resources = 1, .resource = kirkwood_ge01_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init kirkwood_ge01_init(struct mv643xx_eth_platform_data *eth_data) diff --git a/arch/arm/mach-loki/common.c b/arch/arm/mach-loki/common.c index c0d2d9d12e7..818f19d7ab1 100644 --- a/arch/arm/mach-loki/common.c +++ b/arch/arm/mach-loki/common.c @@ -82,6 +82,9 @@ static struct platform_device loki_ge0 = { .id = 0, .num_resources = 1, .resource = loki_ge0_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init loki_ge0_init(struct mv643xx_eth_platform_data *eth_data) @@ -136,6 +139,9 @@ static struct platform_device loki_ge1 = { .id = 1, .num_resources = 1, .resource = loki_ge1_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init loki_ge1_init(struct mv643xx_eth_platform_data *eth_data) diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index 0d88eea6a09..1b22e4af879 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -321,6 +321,9 @@ static struct platform_device mv78xx0_ge00 = { .id = 0, .num_resources = 1, .resource = mv78xx0_ge00_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, 
}; void __init mv78xx0_ge00_init(struct mv643xx_eth_platform_data *eth_data) @@ -375,6 +378,9 @@ static struct platform_device mv78xx0_ge01 = { .id = 1, .num_resources = 1, .resource = mv78xx0_ge01_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init mv78xx0_ge01_init(struct mv643xx_eth_platform_data *eth_data) @@ -429,6 +435,9 @@ static struct platform_device mv78xx0_ge10 = { .id = 2, .num_resources = 1, .resource = mv78xx0_ge10_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init mv78xx0_ge10_init(struct mv643xx_eth_platform_data *eth_data) @@ -496,6 +505,9 @@ static struct platform_device mv78xx0_ge11 = { .id = 3, .num_resources = 1, .resource = mv78xx0_ge11_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init mv78xx0_ge11_init(struct mv643xx_eth_platform_data *eth_data) diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index a51fb9dd65a..b1c7778d9f9 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -188,6 +188,9 @@ static struct platform_device orion5x_eth = { .id = 0, .num_resources = 1, .resource = orion5x_eth_resources, + .dev = { + .coherent_dma_mask = 0xffffffff, + }, }; void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data) From 95caa0a9bdaf93607bd0cc8932f53112496f2f22 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 22 May 2009 21:30:39 -0300 Subject: [PATCH 549/900] icom: fix rmmod crash Actually the icom driver is crashing when is being removed because the driver is kfreeing the adapter structure before calling pci_release_regions(), which result in the following error: Unable to handle kernel paging request for data at address 0x6b6b6b6b6b6b6d33 Faulting instruction address: 0xc000000000246b80 Oops: Kernel access of bad area, sig: 11 [#1] .... 
[c000000012d436a0] [c0000000001002d0] .kfree+0x120/0x34c (unreliable) [c000000012d43730] [c000000000246d60] .pci_release_selected_regions+0x3c/0x68 [c000000012d437c0] [d000000002d54700] .icom_kref_release+0xf4/0x118 [icom] [c000000012d43850] [c000000000232e50] .kref_put+0x74/0x94 [c000000012d438d0] [d000000002d56c58] .icom_remove+0x40/0xa4 [icom] [c000000012d43960] [c000000000249e48] .pci_device_remove+0x50/0x90 [c000000012d439e0] [c0000000002d68d8] .__device_release_driver+0x94/0xd4 [c000000012d43a70] [c0000000002d7104] .driver_detach+0xf8/0x12c [c000000012d43b00] [c0000000002d549c] .bus_remove_driver+0xbc/0x11c [c000000012d43b90] [c0000000002d71dc] .driver_unregister+0x60/0x80 [c000000012d43c20] [c00000000024a07c] .pci_unregister_driver+0x44/0xe8 [c000000012d43cb0] [d000000002d56bf4] .icom_exit+0x1c/0x40 [icom] [c000000012d43d30] [c000000000095fa8] .SyS_delete_module+0x214/0x2a8 [c000000012d43e30] [c00000000000852c] syscall_exit+0x0/0x40 Signed-off-by: Breno Leitao Cc: stable@kernel.org Cc: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/icom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c index 6579e2be1dd..a461b3b2c72 100644 --- a/drivers/serial/icom.c +++ b/drivers/serial/icom.c @@ -1472,8 +1472,8 @@ static void icom_remove_adapter(struct icom_adapter *icom_adapter) free_irq(icom_adapter->pci_dev->irq, (void *) icom_adapter); iounmap(icom_adapter->base_addr); - icom_free_adapter(icom_adapter); pci_release_regions(icom_adapter->pci_dev); + icom_free_adapter(icom_adapter); } static void icom_kref_release(struct kref *kref) From 14f0aa359365e8a93a77b71e3b840274b9b4dcb1 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 23 May 2009 11:36:20 +0100 Subject: [PATCH 550/900] [ARM] disable NX support for OABI-supporting kernels Our signal syscall restart handling for these kernels still uses the userspace stack to build code for restarting the syscall. 
Unfortunately, fixing this is non-trivial, and so for the time being, we resolve the problem by disabling NX support. Signed-off-by: Russell King --- arch/arm/kernel/elf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm/kernel/elf.c b/arch/arm/kernel/elf.c index d4a0da1e48f..950391f194c 100644 --- a/arch/arm/kernel/elf.c +++ b/arch/arm/kernel/elf.c @@ -78,6 +78,15 @@ int arm_elf_read_implies_exec(const struct elf32_hdr *x, int executable_stack) return 1; if (cpu_architecture() < CPU_ARCH_ARMv6) return 1; +#if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT) + /* + * If we have support for OABI programs, we can never allow NX + * support - our signal syscall restart mechanism relies upon + * being able to execute code placed on the user stack. + */ + return 1; +#else return 0; +#endif } EXPORT_SYMBOL(arm_elf_read_implies_exec); From 948cd52906baf1f92aeea2f9b5c515db1b2e592a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 10:40:09 +0900 Subject: [PATCH 551/900] sparseirq: Allow early irq_desc allocation Presently non-legacy IRQs have their irq_desc allocated with kzalloc_node(). This assumes that all callers of irq_to_desc_node_alloc() will be sufficiently late in the boot process that kmalloc is available. While porting sparseirq support to sh this blew up immediately, as at the time that we register the CPU's interrupt vector map only bootmem is available. Check slab_is_available() to work out which path to use. 
[ Impact: fix SH early boot crash with sparseirq enabled ] Signed-off-by: Paul Mundt Acked-by: Yinghai Lu Cc: Andrew Morton Cc: Mel Gorman LKML-Reference: <20090522014008.GA2806@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a3c671e0f16..18041a254d3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -81,11 +82,16 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int node, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { void *ptr; - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + if (slab_is_available()) + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + else + ptr = alloc_bootmem_node(NODE_DATA(node), + nr * sizeof(*desc->kstat_irqs)); /* * don't overwite if can not get new one @@ -186,7 +192,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; @@ -208,7 +214,11 @@ struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) goto out_unlock; - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + if (slab_is_available()) + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + else + desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); From df391e0eda1e678add56a8e34226edf05d89af6a Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Sat, 23 May 2009 09:51:20 -0700 Subject: [PATCH 552/900] Input: multitouch - add 
tracking ID to the protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a few multi-touch devices that support finger tracking well in hardware, Stantum being the prime example. By exposing the tracking ID in the MT protocol, evdev bandwidth and cpu usage in user space can be reduced. This patch adds the ABS_MT_TRACKING_ID to the MT protocol. Signed-off-by: Henrik Rydberg Tested-by: Stéphane Chatty Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 1 + include/linux/input.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/input/input.c b/drivers/input/input.c index e54e002665b..5d445f48789 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -42,6 +42,7 @@ static unsigned int input_abs_bypass_init_data[] __initdata = { ABS_MT_POSITION_Y, ABS_MT_TOOL_TYPE, ABS_MT_BLOB_ID, + ABS_MT_TRACKING_ID, 0 }; static unsigned long input_abs_bypass[BITS_TO_LONGS(ABS_CNT)]; diff --git a/include/linux/input.h b/include/linux/input.h index 0e6ff5de358..6fed4f6a9c9 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -656,6 +656,7 @@ struct input_absinfo { #define ABS_MT_POSITION_Y 0x36 /* Center Y ellipse position */ #define ABS_MT_TOOL_TYPE 0x37 /* Type of touching device */ #define ABS_MT_BLOB_ID 0x38 /* Group a set of packets as a blob */ +#define ABS_MT_TRACKING_ID 0x39 /* Unique ID of initiated contact */ #define ABS_MAX 0x3f #define ABS_CNT (ABS_MAX+1) From f9fcfc3b4627a1ec9b50411060f1b384926d6610 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Sat, 23 May 2009 09:51:21 -0700 Subject: [PATCH 553/900] Input: multitouch - augment event semantics documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Through the collaboration to adapt the N-trig and Stantum HID drivers to the MT protocol, some semantic clarifications to the protocol have been made. This patch adds them to the MT documentation. 
Signed-off-by: Henrik Rydberg Tested-by: Stéphane Chatty Signed-off-by: Dmitry Torokhov --- Documentation/input/multi-touch-protocol.txt | 103 ++++++++++++++----- 1 file changed, 79 insertions(+), 24 deletions(-) diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt index 9f09557aea3..a12ea3b586e 100644 --- a/Documentation/input/multi-touch-protocol.txt +++ b/Documentation/input/multi-touch-protocol.txt @@ -18,8 +18,12 @@ Usage Anonymous finger details are sent sequentially as separate packets of ABS events. Only the ABS_MT events are recognized as part of a finger packet. The end of a packet is marked by calling the input_mt_sync() -function, which generates a SYN_MT_REPORT event. The end of multi-touch -transfer is marked by calling the usual input_sync() function. +function, which generates a SYN_MT_REPORT event. This instructs the +receiver to accept the data for the current finger and prepare to receive +another. The end of a multi-touch transfer is marked by calling the usual +input_sync() function. This instructs the receiver to act upon events +accumulated since last EV_SYN/SYN_REPORT and prepare to receive a new +set of events/packets. A set of ABS_MT events with the desired properties is defined. The events are divided into categories, to allow for partial implementation. The @@ -27,11 +31,26 @@ minimum set consists of ABS_MT_TOUCH_MAJOR, ABS_MT_POSITION_X and ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked. If the device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size of the approaching finger. Anisotropy and direction may be specified with -ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. Devices with -more granular information may specify general shapes as blobs, i.e., as a -sequence of rectangular shapes grouped together by an -ABS_MT_BLOB_ID. 
Finally, the ABS_MT_TOOL_TYPE may be used to specify -whether the touching tool is a finger or a pen or something else. +ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. The +ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a +finger or a pen or something else. Devices with more granular information +may specify general shapes as blobs, i.e., as a sequence of rectangular +shapes grouped together by an ABS_MT_BLOB_ID. Finally, for the few devices +that currently support it, the ABS_MT_TRACKING_ID event may be used to +report finger tracking from hardware [5]. + +Here is what a minimal event sequence for a two-finger touch would look +like: + + ABS_MT_TOUCH_MAJOR + ABS_MT_POSITION_X + ABS_MT_POSITION_Y + SYN_MT_REPORT + ABS_MT_TOUCH_MAJOR + ABS_MT_POSITION_X + ABS_MT_POSITION_Y + SYN_MT_REPORT + SYN_REPORT Event Semantics @@ -44,24 +63,24 @@ ABS_MT_TOUCH_MAJOR The length of the major axis of the contact. The length should be given in surface units. If the surface has an X times Y resolution, the largest -possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal. +possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal [4]. ABS_MT_TOUCH_MINOR The length, in surface units, of the minor axis of the contact. If the -contact is circular, this event can be omitted. +contact is circular, this event can be omitted [4]. ABS_MT_WIDTH_MAJOR The length, in surface units, of the major axis of the approaching tool. This should be understood as the size of the tool itself. The orientation of the contact and the approaching tool are assumed to be the -same. +same [4]. ABS_MT_WIDTH_MINOR The length, in surface units, of the minor axis of the approaching -tool. Omit if circular. +tool. Omit if circular [4]. The above four values can be used to derive additional information about the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates @@ -70,14 +89,17 @@ different characteristic widths [1]. 
ABS_MT_ORIENTATION -The orientation of the ellipse. The value should describe half a revolution -clockwise around the touch center. The scale of the value is arbitrary, but -zero should be returned for an ellipse aligned along the Y axis of the -surface. As an example, an index finger placed straight onto the axis could -return zero orientation, something negative when twisted to the left, and -something positive when twisted to the right. This value can be omitted if -the touching object is circular, or if the information is not available in -the kernel driver. +The orientation of the ellipse. The value should describe a signed quarter +of a revolution clockwise around the touch center. The signed value range +is arbitrary, but zero should be returned for a finger aligned along the Y +axis of the surface, a negative value when finger is turned to the left, and +a positive value when finger turned to the right. When completely aligned with +the X axis, the range max should be returned. Orientation can be omitted +if the touching object is circular, or if the information is not available +in the kernel driver. Partial orientation support is possible if the device +can distinguish between the two axis, but not (uniquely) any values in +between. In such cases, the range of ABS_MT_ORIENTATION should be [0, 1] +[4]. ABS_MT_POSITION_X @@ -98,8 +120,35 @@ ABS_MT_BLOB_ID The BLOB_ID groups several packets together into one arbitrarily shaped contact. This is a low-level anonymous grouping, and should not be confused -with the high-level contactID, explained below. Most kernel drivers will -not have this capability, and can safely omit the event. +with the high-level trackingID [5]. Most kernel drivers will not have blob +capability, and can safely omit the event. + +ABS_MT_TRACKING_ID + +The TRACKING_ID identifies an initiated contact throughout its life cycle +[5]. There are currently only a few devices that support it, so this event +should normally be omitted. 
+ + +Event Computation +----------------- + +The flora of different hardware unavoidably leads to some devices fitting +better to the MT protocol than others. To simplify and unify the mapping, +this section gives recipes for how to compute certain events. + +For devices reporting contacts as rectangular shapes, signed orientation +cannot be obtained. Assuming X and Y are the lengths of the sides of the +touching rectangle, here is a simple formula that retains the most +information possible: + + ABS_MT_TOUCH_MAJOR := max(X, Y) + ABS_MT_TOUCH_MINOR := min(X, Y) + ABS_MT_ORIENTATION := bool(X > Y) + +The range of ABS_MT_ORIENTATION should be set to [0, 1], to indicate that +the device can distinguish between a finger along the Y axis (0) and a +finger along the X axis (1). Finger Tracking @@ -109,14 +158,18 @@ The kernel driver should generate an arbitrary enumeration of the set of anonymous contacts currently on the surface. The order in which the packets appear in the event stream is not important. -The process of finger tracking, i.e., to assign a unique contactID to each +The process of finger tracking, i.e., to assign a unique trackingID to each initiated contact on the surface, is left to user space; preferably the -multi-touch X driver [3]. In that driver, the contactID stays the same and +multi-touch X driver [3]. In that driver, the trackingID stays the same and unique until the contact vanishes (when the finger leaves the surface). The problem of assigning a set of anonymous fingers to a set of identified fingers is a euclidian bipartite matching problem at each event update, and relies on a sufficiently rapid update rate. +There are a few devices that support trackingID in hardware. User space can +make use of these native identifiers to reduce bandwidth and cpu usage. + + Notes ----- @@ -136,5 +189,7 @@ could be used to derive tilt. 
time of writing (April 2009), the MT protocol is not yet merged, and the prototype implements finger matching, basic mouse support and two-finger scrolling. The project aims at improving the quality of current multi-touch -functionality available in the synaptics X driver, and in addition +functionality available in the Synaptics X driver, and in addition implement more advanced gestures. +[4] See the section on event computation. +[5] See the section on finger tracking. From 8db14ca12569fe885694bd3d5ff84c2d973d3cb0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 23 May 2009 18:57:25 +0000 Subject: [PATCH 554/900] [CIFS] Avoid open on possible directories since Samba now rejects them Small change (mostly formatting) to limit lookup based open calls to file create only. After discussion yesterday on samba-technical about the posix lookup regression, and looking at a problem with cifs posix open to one particular Samba version, Jeff and JRA realized that Samba server's behavior changed in this area (posix open behavior on files vs. directories). To make this behavior consistent, JRA just made a fix to Samba server to alter how it handles open of directories (now returning the equivalent of EISDIR instead of success). Since we don't know at lookup time whether the inode is a directory or file (and thus whether posix open will succeed with most current Samba server), this change avoids the posix open code on lookup open (just issues posix open on creates). This gets the semantic benefits we want (atomicity, posix byte range locks, improved write semantics on newly created files) and file create still is fast, and we avoid the problem that Jeff noticed yesterday with "openat" (and some open directory calls) of non-cached directories to one version of Samba server, and will work with future Samba versions (which include the fix jra just pushed into Samba server). I confirmed this approach with jra yesterday and with Shirish today. 
Posix open is only called (at lookup time) for file create now. For opens (rather than creates), because we do not know if it is a file or directory yet, and current Samba no longer allows us to do posix open on dirs, we could end up wasting an open call on what turns out to be a dir. For file opens, we wait to call posix open till cifs_open. It could be added here (lookup) in the future but the performance tradeoff of the extra network request when EISDIR or EACCES is returned would have to be weighed against the 50% reduction in network traffic in the other paths. Reviewed-by: Shirish Pargaonkar Tested-by: Jeff Layton CC: Jeremy Allison Signed-off-by: Steve French --- fs/cifs/dir.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f49d684edd9..3758965d73d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -657,31 +657,36 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); + /* Posix open is only called (at lookup time) for file create now. + * For opens (rather than creates), because we do not know if it + * is a file or directory yet, and current Samba no longer allows + * us to do posix open on dirs, we could end up wasting an open call + * on what turns out to be a dir. For file opens, we wait to call posix + * open till cifs_open. It could be added here (lookup) in the future + * but the performance tradeoff of the extra network request when EISDIR + * or EACCES is returned would have to be weighed against the 50% + * reduction in network traffic in the other paths. 
+ */ if (pTcon->unix_ext) { if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && - (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open) { - if (!((nd->intent.open.flags & O_CREAT) && - (nd->intent.open.flags & O_EXCL))) { - rc = cifs_posix_open(full_path, &newInode, + (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && + (nd->intent.open.flags & O_CREAT)) { + rc = cifs_posix_open(full_path, &newInode, parent_dir_inode->i_sb, nd->intent.open.create_mode, nd->intent.open.flags, &oplock, &fileHandle, xid); - /* - * This code works around a bug in - * samba posix open in samba versions 3.3.1 - * and earlier where create works - * but open fails with invalid parameter. - * If either of these error codes are - * returned, follow the normal lookup. - * Otherwise, the error during posix open - * is handled. - */ - if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) - posix_open = true; - else - pTcon->broken_posix_open = true; - } + /* + * The check below works around a bug in POSIX + * open in samba versions 3.3.1 and earlier where + * open could incorrectly fail with invalid parameter. + * If either that or op not supported returned, follow + * the normal lookup. 
+ */ + if ((rc == 0) || (rc == -ENOENT)) + posix_open = true; + else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP)) + pTcon->broken_posix_open = true; } if (!posix_open) rc = cifs_get_inode_info_unix(&newInode, full_path, From 59a3759d0fe8d969888c741bb33f4946e4d3750d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 23 May 2009 14:47:00 -0700 Subject: [PATCH 555/900] Linux 2.6.30-rc7 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b57e1f539e8..739fd34a72a 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 30 -EXTRAVERSION = -rc6 -NAME = Vindictive Armadillo +EXTRAVERSION = -rc7 +NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* # To see a list of typical targets execute "make help" From 93574844bc3906941b89d6b6f72e01e87413f3c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ozan=20=C3=87a=C4=9Flayan?= Date: Sat, 23 May 2009 15:00:04 +0300 Subject: [PATCH 556/900] ALSA: hda - Add forced codec-slots for ASUS W5Fm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS W5Fm needs the fixed codec-slots to probe to override the BIOS problem like W5F. 
Tested-by: Alp Kılıç Signed-off-by: Ozan Çağlayan Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 21e99cfa8c4..3128e1a6bc6 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2141,6 +2141,7 @@ static struct snd_pci_quirk probe_mask_list[] __devinitdata = { /* including bogus ALC268 in slot#2 that conflicts with ALC888 */ SND_PCI_QUIRK(0x17c0, 0x4085, "Medion MD96630", 0x01), /* forced codec slots */ + SND_PCI_QUIRK(0x1043, 0x1262, "ASUS W5Fm", 0x103), SND_PCI_QUIRK(0x1046, 0x1262, "ASUS W5F", 0x103), {} }; From 679d92ed1403b6cf9a19aa42ec62b81cae1aa017 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 24 May 2009 19:00:08 +0200 Subject: [PATCH 557/900] ALSA: hda - Add 5stack-no-fp model for STAC927x The recent fix for the headphone volume control on IDT/STAC codecs resulted in the removal of invalid "Side" volume eventually. But, if the front panel doesn't exist, this setup could be regarded as a sort of regression, as reported in kernel bug #13250. Now as a workaround, a new model 5stack-no-fp is added so that the user without the front panel can choose this one explicitly. 
Reference: bko#13250 http://bugzilla.kernel.org/show_bug.cgi?id=13250 Signed-off-by: Takashi Iwai --- Documentation/sound/alsa/HD-Audio-Models.txt | 1 + sound/pci/hda/patch_sigmatel.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 8eec05bc079..322869fc8a9 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt @@ -334,6 +334,7 @@ STAC9227/9228/9229/927x ref-no-jd Reference board without HP/Mic jack detection 3stack D965 3stack 5stack D965 5stack + SPDIF + 5stack-no-fp D965 5stack without front panel dell-3stack Dell Dimension E520 dell-bios Fixes with Dell BIOS setup auto BIOS setup (default) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 03b3646018a..d2fd8ef6aef 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -150,6 +150,7 @@ enum { STAC_D965_REF, STAC_D965_3ST, STAC_D965_5ST, + STAC_D965_5ST_NO_FP, STAC_DELL_3ST, STAC_DELL_BIOS, STAC_927X_MODELS @@ -2154,6 +2155,13 @@ static unsigned int d965_5st_pin_configs[14] = { 0x40000100, 0x40000100 }; +static unsigned int d965_5st_no_fp_pin_configs[14] = { + 0x40000100, 0x40000100, 0x0181304e, 0x01014010, + 0x01a19040, 0x01011012, 0x01016011, 0x40000100, + 0x40000100, 0x40000100, 0x40000100, 0x01442070, + 0x40000100, 0x40000100 +}; + static unsigned int dell_3st_pin_configs[14] = { 0x02211230, 0x02a11220, 0x01a19040, 0x01114210, 0x01111212, 0x01116211, 0x01813050, 0x01112214, @@ -2166,6 +2174,7 @@ static unsigned int *stac927x_brd_tbl[STAC_927X_MODELS] = { [STAC_D965_REF] = ref927x_pin_configs, [STAC_D965_3ST] = d965_3st_pin_configs, [STAC_D965_5ST] = d965_5st_pin_configs, + [STAC_D965_5ST_NO_FP] = d965_5st_no_fp_pin_configs, [STAC_DELL_3ST] = dell_3st_pin_configs, [STAC_DELL_BIOS] = NULL, }; @@ -2176,6 +2185,7 @@ static const char *stac927x_models[STAC_927X_MODELS] = { [STAC_D965_REF] = 
"ref", [STAC_D965_3ST] = "3stack", [STAC_D965_5ST] = "5stack", + [STAC_D965_5ST_NO_FP] = "5stack-no-fp", [STAC_DELL_3ST] = "dell-3stack", [STAC_DELL_BIOS] = "dell-bios", }; From 657cafa6b0f5296424d6f43f6f6eeb4a3222117e Mon Sep 17 00:00:00 2001 From: Alex Riesen Date: Sun, 24 May 2009 15:30:48 +0200 Subject: [PATCH 558/900] Use a format for linux_banner There is no format specifiers left in the linux_banner, and gcc-4.3 complains seeing the printk. Signed-off-by: Alex Riesen Signed-off-by: Linus Torvalds --- init/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 3bbf93be744..d721dad05dd 100644 --- a/init/main.c +++ b/init/main.c @@ -566,8 +566,7 @@ asmlinkage void __init start_kernel(void) tick_init(); boot_cpu_init(); page_address_init(); - printk(KERN_NOTICE); - printk(linux_banner); + printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); setup_command_line(command_line); From 32bdfac5462d777f35b00838893c4f87baf23efe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 24 May 2009 21:15:07 +0200 Subject: [PATCH 559/900] PM: Do not hold dpm_list_mtx while disabling/enabling nonboot CPUs We shouldn't hold dpm_list_mtx while executing [disable|enable]_nonboot_cpus(), because theoretically this may lead to a deadlock as shown by the following example (provided by Johannes Berg): CPU 3 CPU 2 CPU 1 suspend/hibernate something: rtnl_lock() device_pm_lock() -> mutex_lock(&dpm_list_mtx) mutex_lock(&dpm_list_mtx) linkwatch_work -> rtnl_lock() disable_nonboot_cpus() -> flush CPU 3 workqueue Fortunately, device drivers are supposed to stop any activities that might lead to the registration of new device objects way before disable_nonboot_cpus() is called, so it shouldn't be necessary to hold dpm_list_mtx over the entire late part of device suspend and early part of device resume. 
Thus, during the late suspend and the early resume of devices acquire dpm_list_mtx only when dpm_list is going to be traversed and release it right after that. This patch is reported to fix the regressions tracked as http://bugzilla.kernel.org/show_bug.cgi?id=13245. Signed-off-by: Rafael J. Wysocki Acked-by: Alan Stern Reported-by: Miles Lane Tested-by: Ming Lei --- drivers/base/power/main.c | 4 ++++ kernel/kexec.c | 2 -- kernel/power/disk.c | 21 +++------------------ kernel/power/main.c | 7 +------ 4 files changed, 8 insertions(+), 26 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 69b4ddb7de3..3e4bc699bc0 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -357,6 +357,7 @@ static void dpm_power_up(pm_message_t state) { struct device *dev; + mutex_lock(&dpm_list_mtx); list_for_each_entry(dev, &dpm_list, power.entry) if (dev->power.status > DPM_OFF) { int error; @@ -366,6 +367,7 @@ static void dpm_power_up(pm_message_t state) if (error) pm_dev_err(dev, state, " early", error); } + mutex_unlock(&dpm_list_mtx); } /** @@ -614,6 +616,7 @@ int device_power_down(pm_message_t state) int error = 0; suspend_device_irqs(); + mutex_lock(&dpm_list_mtx); list_for_each_entry_reverse(dev, &dpm_list, power.entry) { error = suspend_device_noirq(dev, state); if (error) { @@ -622,6 +625,7 @@ int device_power_down(pm_message_t state) } dev->power.status = DPM_OFF_IRQ; } + mutex_unlock(&dpm_list_mtx); if (error) device_power_up(resume_event(state)); return error; diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a758c6e495..e4983770913 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1451,7 +1451,6 @@ int kernel_kexec(void) error = device_suspend(PMSG_FREEZE); if (error) goto Resume_console; - device_pm_lock(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* * device_power_down() now. 
Otherwise, drivers for @@ -1489,7 +1488,6 @@ int kernel_kexec(void) enable_nonboot_cpus(); device_power_up(PMSG_RESTORE); Resume_devices: - device_pm_unlock(); device_resume(PMSG_RESTORE); Resume_console: resume_console(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index b0dc9e7a0d1..5cb080e7eeb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,8 +215,6 @@ static int create_image(int platform_mode) if (error) return error; - device_pm_lock(); - /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) @@ -227,7 +225,7 @@ static int create_image(int platform_mode) if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Unlock; + return error; } error = platform_pre_snapshot(platform_mode); @@ -280,9 +278,6 @@ static int create_image(int platform_mode) device_power_up(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Unlock: - device_pm_unlock(); - return error; } @@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode) { int error; - device_pm_lock(); - error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); - goto Unlock; + return error; } error = platform_pre_restore(platform_mode); @@ -403,9 +396,6 @@ static int resume_target_kernel(bool platform_mode) device_power_up(PMSG_RECOVER); - Unlock: - device_pm_unlock(); - return error; } @@ -464,11 +454,9 @@ int hibernation_platform_enter(void) goto Resume_devices; } - device_pm_lock(); - error = device_power_down(PMSG_HIBERNATE); if (error) - goto Unlock; + goto Resume_devices; error = hibernation_ops->prepare(); if (error) @@ -493,9 +481,6 @@ int hibernation_platform_enter(void) device_power_up(PMSG_RESTORE); - Unlock: - device_pm_unlock(); - Resume_devices: entering_platform_hibernation = false; device_resume(PMSG_RESTORE); diff --git a/kernel/power/main.c b/kernel/power/main.c index f99ed6a75ea..868028280d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -289,12 +289,10 @@ static int suspend_enter(suspend_state_t state) { int error; - device_pm_lock(); - if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - goto Done; + return error; } error = device_power_down(PMSG_SUSPEND); @@ -343,9 +341,6 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); - Done: - device_pm_unlock(); - return error; } From d5a877e8dd409d8c702986d06485c374b705d340 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 24 May 2009 13:03:43 -0700 Subject: [PATCH 560/900] async: make sure independent async domains can't accidentally entangle The problem occurs when async_synchronize_full_domain() is called when the async_pending list is not empty. 
This will cause lowest_running() to return the cookie of the first entry on the async_pending list, which might be nothing at all to do with the domain being asked for and thus cause the domain synchronization to wait for an unrelated domain. This can cause a deadlock if domain synchronization is used from one domain to wait for another. Fix by running over the async_pending list to see if any pending items actually belong to our domain (and return their cookies if they do). Signed-off-by: James Bottomley Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/async.c b/kernel/async.c index 968ef9457d4..50540301ed0 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,19 +92,23 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; + async_cookie_t ret = next_cookie; /* begin with "infinity" value */ + if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); - return entry->cookie; - } else if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, - struct async_entry, list); - return entry->cookie; - } else { - /* nothing in progress... next_cookie is "infinity" */ - return next_cookie; + ret = entry->cookie; } + if (!list_empty(&async_pending)) { + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) { + ret = entry->cookie; + break; + } + } + + return ret; } static async_cookie_t lowest_in_progress(struct list_head *running) From 71c9d8b68b299bef614afc7907393564a9f1476f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 May 2009 12:01:59 +0900 Subject: [PATCH 561/900] x86: Remove remap percpu allocator for the time being Remap percpu allocator has subtle bug when combined with page attribute changing. 
Remap percpu allocator aliases PMD pages for the first chunk and as pageattr doesn't know about the alias it ends up updating page attributes of the original mapping thus leaving the aliases in an inconsistent state which might lead to subtle data corruption. Please read the following threads for more information: http://thread.gmane.org/gmane.linux.kernel/835783 The following is the proposed fix which teaches pageattr about percpu aliases. http://thread.gmane.org/gmane.linux.kernel/837157 However, the above changes are deemed too pervasive for upstream inclusion for 2.6.30 release, so this patch essentially disables the remap allocator for the time being. Signed-off-by: Tejun Heo LKML-Reference: <4A1A0A27.4050301@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4cf187..8f0e13be36b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) /* * If large page isn't supported, there's no benefit in doing * this. Also, on non-NUMA, embedding is better. + * + * NOTE: disabled for now. */ - if (!cpu_has_pse || !pcpu_need_numa()) + if (true || !cpu_has_pse || !pcpu_need_numa()) return -EINVAL; /* From 461c6c3a0a23a8fac1a4b636e93ff5dfe599a241 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 25 May 2009 08:06:02 +0200 Subject: [PATCH 562/900] ALSA: hda - Add missing check of pin vref 50 and others in Realtek codecs Some Realtek codecs like ALC861 seem to support only VREF50 while the current driver assumes it's only VREF80. Check other VREF bits to set the correct value. 
Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bcbb736f94f..0fd258eba3a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -776,6 +776,12 @@ static void alc_set_input_pin(struct hda_codec *codec, hda_nid_t nid, pincap = (pincap & AC_PINCAP_VREF) >> AC_PINCAP_VREF_SHIFT; if (pincap & AC_PINCAP_VREF_80) val = PIN_VREF80; + else if (pincap & AC_PINCAP_VREF_50) + val = PIN_VREF50; + else if (pincap & AC_PINCAP_VREF_100) + val = PIN_VREF100; + else if (pincap & AC_PINCAP_VREF_GRD) + val = PIN_VREFGRD; } snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, val); } From 4e2fd555199977c5994d1a4d2d3b8761b20ca4c7 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 25 May 2009 00:42:34 -0700 Subject: [PATCH 563/900] gianfar: fix BUG under load after introduction of skb recycling Since commit 0fd56bb5be6455d0d42241e65aed057244665e5e ("gianfar: Add support for skb recycling"), gianfar puts skbuffs that are in the rx ring back onto the recycle list as-is in case there was a receive error, but this breaks the following invariant: that all skbuffs on the recycle list have skb->data = skb->head + NET_SKB_PAD. The RXBUF_ALIGNMENT realignment done in gfar_new_skb() will be done twice on skbuffs recycled in this way, causing there not to be enough room in the skb anymore to receive a full packet, eventually leading to an skb_over_panic from gfar_clean_rx_ring() -> skb_put(). Resetting the skb->data pointer to skb->head + NET_SKB_PAD before putting the skb back onto the recycle list restores the mentioned invariant, and should fix this issue. Reported-by: Michael Guntsche Tested-by: Michael Guntsche Signed-off-by: Lennert Buytenhek Signed-off-by: David S. 
Miller --- drivers/net/gianfar.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index b2c49679bba..a0519184e54 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -1885,8 +1885,17 @@ int gfar_clean_rx_ring(struct net_device *dev, int rx_work_limit) if (unlikely(!newskb)) newskb = skb; - else if (skb) + else if (skb) { + /* + * We need to reset ->data to what it + * was before gfar_new_skb() re-aligned + * it to an RXBUF_ALIGNMENT boundary + * before we put the skb back on the + * recycle list. + */ + skb->data = skb->head + NET_SKB_PAD; __skb_queue_head(&priv->rx_recycle, skb); + } } else { /* Increment the number of packets */ dev->stats.rx_packets++; From bfcaa50270e18f35220a11d46e98fc6232c24606 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 25 May 2009 17:23:15 +0200 Subject: [PATCH 564/900] netfilter: nf_ct_tcp: fix accepting invalid RST segments Robert L Mathews discovered that some clients send evil TCP RST segments, which are accepted by netfilter conntrack but discarded by the destination. Thus the conntrack entry is destroyed but the destination retransmits data until timeout. The same technique, i.e. sending properly crafted RST segments, can easily be used to bypass connlimit/connbytes based restrictions (the sample script written by Robert can be found in the netfilter mailing list archives). The patch below adds a new flag and new field to struct ip_ct_tcp_state so that checking RST segments can be made more strict and thus TCP conntrack can catch the invalid ones: the RST segment is accepted only if its sequence number is higher than or equal to the highest ack we have seen from the other direction. (The last_ack field cannot be reused because it is used to catch resent packets.) 
Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_tcp.h | 4 ++++ net/netfilter/nf_conntrack_proto_tcp.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h index 3066789b972..b2f384d4261 100644 --- a/include/linux/netfilter/nf_conntrack_tcp.h +++ b/include/linux/netfilter/nf_conntrack_tcp.h @@ -35,6 +35,9 @@ enum tcp_conntrack { /* Has unacknowledged data */ #define IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED 0x10 +/* The field td_maxack has been set */ +#define IP_CT_TCP_FLAG_MAXACK_SET 0x20 + struct nf_ct_tcp_flags { __u8 flags; __u8 mask; @@ -46,6 +49,7 @@ struct ip_ct_tcp_state { u_int32_t td_end; /* max of seq + len */ u_int32_t td_maxend; /* max of ack + max(win, 1) */ u_int32_t td_maxwin; /* max(win) */ + u_int32_t td_maxack; /* max of ack */ u_int8_t td_scale; /* window scale factor */ u_int8_t flags; /* per direction options */ }; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b5ccf2b4b2e..97a6e93d742 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -634,6 +634,14 @@ static bool tcp_in_window(const struct nf_conn *ct, sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + /* * Update receiver data. 
*/ @@ -918,6 +926,16 @@ static int tcp_packet(struct nf_conn *ct, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid RST "); + return -NF_ACCEPT; + } if (index == TCP_RST_SET && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) From b38b1f616867c832301f24eaf259889494d495b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 May 2009 17:29:43 +0200 Subject: [PATCH 565/900] netfilter: nf_ct_dccp: add missing DCCP protocol changes in event cache This patch adds the missing protocol state-change event reporting for DCCP. $ sudo conntrack -E [NEW] dccp 33 240 src=192.168.0.2 dst=192.168.1.2 sport=57040 dport=5001 [UNREPLIED] src=192.168.1.2 dst=192.168.1.100 sport=5001 dport=57040 With this patch: $ sudo conntrack -E [NEW] dccp 33 240 REQUEST src=192.168.0.2 dst=192.168.1.2 sport=57040 dport=5001 [UNREPLIED] src=192.168.1.2 dst=192.168.1.100 sport=5001 dport=57040 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_dccp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 8e757dd5339..aee0d6bea30 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include static DEFINE_RWLOCK(dccp_lock); @@ -553,6 +554,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.state = new_state; write_unlock_bh(&dccp_lock); + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + dn = dccp_pernet(net); 
nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]); From a8cd0244e9cebcf9b358d24c7e7410062f3665cb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 May 2009 22:15:25 +0300 Subject: [PATCH 566/900] KVM: Make paravirt tlb flush also reload the PAE PDPTRs The paravirt tlb flush may be used not only to flush TLBs, but also to reload the four page-directory-pointer-table entries, as it is used as a replacement for reloading CR3. Change the code to do the entire CR3 reloading dance instead of simply flushing the TLB. Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b6caf1329b1..32cf11e5728 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2897,8 +2897,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_x86_ops->tlb_flush(vcpu); - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } From a2edf57f510cce6a389cc14e58c6ad0a4296d6f9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 May 2009 22:19:00 +0300 Subject: [PATCH 567/900] KVM: Fix PDPTR reloading on CR4 writes The processor is documented to reload the PDPTRs while in PAE mode if any of the CR4 bits PSE, PGE, or PAE change. Linux relies on this behaviour when zapping the low mappings of PAE kernels during boot. The code already handled changes to CR4.PAE; augment it to also notice changes to PSE and PGE. This triggered while booting an F11 PAE kernel; the futex initialization code runs before any CR3 reloads and writes to a NULL pointer; the futex subsystem ended up uninitialized, killing PI futexes and pulseaudio which uses them. 
Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49079a46687..3944e917e79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -338,6 +338,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { + unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); @@ -351,7 +354,8 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_inject_gp(vcpu, 0); return; } - } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) + && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 18 May 2009 19:35:34 +0800 Subject: [PATCH 568/900] tracing: add trace_event_read_lock() I found that there is nothing to protect event_hash in ftrace_find_event(). Rcu protects the event hashlist but not the event itself while we use it after its extraction through ftrace_find_event(). This lack of a proper locking in this spot opens a race window between any event dereferencing and module removal. Eg: --Task A-- print_trace_line(trace) { event = find_ftrace_event(trace) --Task B-- trace_module_remove_events(mod) { list_trace_events_module(ev, mod) { unregister_ftrace_event(ev->event) { hlist_del(ev->event->node) list_del(....) } } } |--> module removed, the event has been dropped --Task A-- event->print(trace); // Dereferencing freed memory If the event retrieved belongs to a module and this module is concurrently removed, we may end up dereferencing a data from a freed module. 
RCU could solve this, but it would add latency to the kernel and forbid tracers output callbacks to call any sleepable code. So this fix converts 'trace_event_mutex' to a read/write semaphore, and adds trace_event_read_lock() to protect ftrace_find_event(). [ Impact: fix possible freed memory dereference in ftrace ] Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt LKML-Reference: <4A114806.7090302@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_output.c | 25 ++++++++++++++++++------- kernel/trace/trace_output.h | 2 ++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd40d232034..02d32baa23a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) p = s_next(m, p, &l); } + trace_event_read_lock(); return p; } static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); + trace_event_read_unlock(); } static void print_lat_help_header(struct seq_file *m) @@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +/* Called with trace_event_read_lock() held. */ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -3008,6 +3011,7 @@ waitagain: offsetof(struct trace_iterator, seq)); iter->pos = -1; + trace_event_read_lock(); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3024,6 +3028,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, goto out_err; } + trace_event_read_lock(); + /* Fill as many pages as possible. 
*/ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 489c0e8ada0..7136420603a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DEFINE_MUTEX(trace_event_mutex); +static DECLARE_RWSEM(trace_event_mutex); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -466,6 +466,7 @@ static int task_state_char(unsigned long state) * @type: the type of event to look for * * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. */ struct trace_event *ftrace_find_event(int type) { @@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type) key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, n, &event_hash[key], node) { if (event->type == type) return event; } @@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list) return last + 1; } +void trace_event_read_lock(void) +{ + down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_mutex); +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); if (WARN_ON(!event)) goto out; @@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event) key = event->type & (EVENT_HASHSIZE - 1); - hlist_add_head_rcu(&event->node, &event_hash[key]); + hlist_add_head(&event->node, 
&event_hash[key]); ret = event->type; out: - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return ret; } @@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); */ int unregister_ftrace_event(struct trace_event *event) { - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); hlist_del(&event->node); list_del(&event->list); - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return 0; } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 6e220a8e570..ac240e76eb0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, From b0aae68cc5508f3c2fbf728988c954db4c8b8a53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 21 May 2009 13:59:18 +0800 Subject: [PATCH 569/900] tracing/events: change the type of __str_loc_item to unsigned short When defining a dynamic size string, we add __str_loc_##item to the trace entry, and it stores the location of the actual string in entry->_str_data[] 'unsigned short' should be sufficient to store this information, thus we save 2 bytes per dyn-size string in the ring buffer. 
[ Impact: reduce memory occupied by dyn-size strings in ring buffer ] Signed-off-by: Li Zefan Cc: Steven Rostedt LKML-Reference: <4A14EDB6.2050507@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index edb02bc9f8f..b5ff2e8229e 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -25,7 +25,7 @@ #define __field(type, item) type item; #undef __string -#define __string(item, src) int __str_loc_##item; +#define __string(item, src) unsigned short __str_loc_##item; #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args From 62e1e389f87a8839ad83b08c44691d1df8320846 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:40:59 +1000 Subject: [PATCH 570/900] md: always update level / chunk_size / layout when writing v1.x metadata. We previously didn't update these fields when writing the metadata because they could never change. They can now, so we better write them. v0.90 metadata always updated these fields. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index fccc8343a25..aa79d55875f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1375,6 +1375,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); From 2b69c83924396ad1eda36fdd267c9d2f360f5555 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:41:17 +1000 Subject: [PATCH 571/900] md: improve errno return when setting array_size Instead of always returns EINVAL if anything goes wrong when setting the array size, add the option of E2BIG if the size requested is too large. This makes it easier for user-space to be sure what went wrong. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index aa79d55875f..58e0b02a74c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3683,7 +3683,7 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) if (strict_blocks_to_sectors(buf, &sectors) < 0) return -EINVAL; if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -EINVAL; + return -E2BIG; mddev->external_size = 1; } From be512691036cc989c11d0f418187efbbf14468e6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:41:17 +1000 Subject: [PATCH 572/900] md: bitmap: improve bitmap maintenance code. The code for checking which bits in the bitmap can be cleared has 2 problems: 1/ it repeatedly takes and drops a spinlock, where it would make more sense to just hold on to it most of the time.
2/ it doesn't make use of some opportunities to skip large sections of the bitmap This patch fixes those. It will only affect CPU consumption, not correctness. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 47c68bc75a1..56df1cee8fb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1097,14 +1097,12 @@ void bitmap_daemon_work(struct bitmap *bitmap) } bitmap->allclean = 1; + spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { + if (!bitmap->filemap) /* error or shutdown */ - spin_unlock_irqrestore(&bitmap->lock, flags); break; - } page = filemap_get_page(bitmap, j); @@ -1121,6 +1119,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) write_page(bitmap, page, 0); bitmap->allclean = 0; } + spin_lock_irqsave(&bitmap->lock, flags); + j |= (PAGE_BITS - 1); continue; } @@ -1181,9 +1181,10 @@ void bitmap_daemon_work(struct bitmap *bitmap) ext2_clear_bit(file_page_offset(j), paddr); kunmap_atomic(paddr, KM_USER0); } - } - spin_unlock_irqrestore(&bitmap->lock, flags); + } else + j |= PAGE_COUNTER_MASK; } + spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ if (lastpage != NULL) { From b6a9ce688f613e2ee5f15e6720e0bb8520efc36e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:41:17 +1000 Subject: [PATCH 573/900] md: export 'frozen' resync state through sysfs The md resync engine has a 'frozen' state which ensures that no resync/recovery. This is used to avoid races. Export this state through the 'sync_action' sysfs attribute so that user-space can benefit and also avoid some races. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 58e0b02a74c..384e4f0904c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3306,7 +3306,9 @@ static ssize_t action_show(mddev_t *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) type = "reshape"; @@ -3329,7 +3331,12 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "idle")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); From 29fcefba8a2f0fea11e2b721fe174a1832801284 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 24 May 2009 11:13:17 +0300 Subject: [PATCH 574/900] kmemtrace: fix kernel parameter documentation The kmemtrace.enable kernel parameter no longer works. To enable kmemtrace at boot-time, you must pass "ftrace=kmemtrace" instead. 
[ Impact: remove obsolete kernel parameter documentation ] Cc: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg LKML-Reference: Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..9243dd84f4d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,7 +56,6 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. - KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1054,15 +1053,6 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. - kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } - Controls whether kmemtrace is enabled - at boot-time. - - kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of - subbufs kmemtrace's relay channel has. Set this - higher than default (KMEMTRACE_N_SUBBUFS in code) if - you experience buffer overruns. - kgdboc= [HW] kgdb over consoles. Requires a tty driver that supports console polling. (only serial suported for now) From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:11:59 +0800 Subject: [PATCH 575/900] ftrace: Add task_comm support for trace_event If we enable a trace event alone without any tracer running (such as function tracer, sched switch tracer, etc...) it can't output enough task command information. We need to use the tracing_{start/stop}_cmdline_record() helpers which are designed to keep track of cmdlines for any tasks that were scheduled during the tracing. 
Before this patch: # echo 1 > debugfs/tracing/events/sched/sched_switch/enable # cat debugfs/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-2289 [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120] <...>-2289 [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140] -0 [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115] <...>-5 [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... After this patch: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | bash-2269 [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120] bash-2269 [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140] -0 [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115] events/0-5 [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... Changelog: v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in ENABLE_EVENT_TRACING v2->v3: v2 can solve problem that was caused by config EVENT_TRACING alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is selected by other config, compile fail happened again. This version solves it. 
[ Impact: fix incomplete output of event tracing ] Signed-off-by: Zhao Lei Cc: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 9 +++++++-- kernel/trace/trace_events.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f61be301578..a508b9d2adb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER default y config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config CONTEXT_SWITCH_TRACER + select MARKERS bool config TRACING @@ -176,10 +181,10 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config CONTEXT_SWITCH_TRACER +config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" select TRACING - select MARKERS + select CONTEXT_SWITCH_TRACER help This tracer gets called from the context switch and records all switching of tasks. 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9e91c4ad7c8..9b246eb01d5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -85,6 +85,7 @@ static void ftrace_clear_events(void) if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } } @@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } break; case 1: if (!call->enabled) { call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); } break; @@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod) found = true; if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } if (call->event) @@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void) } call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); event_test_stuff(); call->unregfunc(); + tracing_stop_cmdline_record(); call->enabled = 0; pr_cont("OK\n"); From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:13:59 +0800 Subject: [PATCH 576/900] ftrace: clean up of using ftrace_event_enable_disable() Always use ftrace_event_enable_disable() to enable/disable an event so that we can factorize out the event toggling code. 
[ Impact: factorize and cleanup event tracing code ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 44 ++++++++++++------------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9b246eb01d5..6c81f9c2142 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call) #endif /* CONFIG_MODULES */ -static void ftrace_clear_events(void) -{ - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } - } - mutex_unlock(&event_mutex); -} - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { - switch (enable) { case 0: if (call->enabled) { @@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + ftrace_event_enable_disable(call, 0); + } + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ @@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } + ftrace_event_enable_disable(call, 0); if (call->event) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); @@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void) continue; } - call->enabled = 1; - tracing_start_cmdline_record(); - call->regfunc(); - + ftrace_event_enable_disable(call, 1); event_test_stuff(); - - call->unregfunc(); - tracing_stop_cmdline_record(); - call->enabled = 0; + ftrace_event_enable_disable(call, 0); pr_cont("OK\n"); } From 848b3182365fdf5a05bcd5ed949071cac2c894b3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 12:41:08 +1000 Subject: [PATCH 577/900] md: raid5: avoid sector values going negative when testing reshape progress. As sector_t in unsigned, we cannot afford to let 'safepos' etc go negative. So replace a -= b; by a -= min(b,a); Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4616bc3a6e7..3c3626d2a1f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3811,13 +3811,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= reshape_sectors; + writepos -= min(reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; - readpos -= reshape_sectors; - safepos -= reshape_sectors; + readpos -= min(reshape_sectors, readpos); + safepos -= min(reshape_sectors, safepos); } /* 'writepos' is the most advanced device address we might write. 
From 7a91ee1f628ef6bfe3f13067c0ddf9db520cb86b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 12:57:21 +1000 Subject: [PATCH 578/900] md: don't update curr_resync_completed without also updating reshape_position. In order for the metadata to always be consistent, we mustn't update curr_resync_completed without also updating reshape_position. The reshape code updates both at the same time. However since commit 97e4f42d62badb0f9fbc27c013e89bc1336a03bc the common md_do_sync will sometimes update curr_resync_completed but is not in a position to update reshape_position. So if MD_RECOVERY_RESHAPE is set (indicating that a reshape is happening, so reshape_position might change), don't update curr_resync_completed in md_do_sync, leave it to the per-personality reshape code. Signed-off-by: NeilBrown --- drivers/md/md.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 384e4f0904c..954456532ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6362,12 +6362,13 @@ void md_do_sync(mddev_t *mddev) skipped = 0; - if ((mddev->curr_resync > mddev->curr_resync_completed && - (mddev->curr_resync - mddev->curr_resync_completed) - > (max_sectors >> 4)) || - (j - mddev->curr_resync_completed)*2 - >= mddev->resync_max - mddev->curr_resync_completed - ) { + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed + )) { /* time to update curr_resync_completed */ blk_unplug(mddev->queue); wait_event(mddev->recovery_wait, From b492b852cd8c99505708152c29a5e09a787af9de Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 12:57:36 +1000 Subject: [PATCH 579/900] md: don't use locked_ioctl. md has no need for the BKL - it does its own locking.
So md_ioctl doesn't need to be a locked_ioctl. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 954456532ac..641b211fe3f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5567,7 +5567,7 @@ static struct block_device_operations md_fops = .owner = THIS_MODULE, .open = md_open, .release = md_release, - .locked_ioctl = md_ioctl, + .ioctl = md_ioctl, .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, From 8e35961b57da14cb64cb0e4e1b7e3aabda6396fe Mon Sep 17 00:00:00 2001 From: Hideo Saito Date: Sun, 24 May 2009 15:33:34 +0000 Subject: [PATCH 580/900] powerpc/mm: Fix broken MMU PID stealing on !SMP The recent rework of the MMU PID handling for non-hash CPUs has a subtle bug in the !SMP "optimized" variant of the PID stealing function. It clears the PID in the mm context before it calls local_flush_tlb_mm(). However, the later will not flush anything if the PID in the context is clear... 
Signed-off-by: Hideo Saito Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_context_nohash.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index a70e311bd45..030d0005b4d 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -127,12 +127,12 @@ static unsigned int steal_context_up(unsigned int id) pr_debug("[%d] steal context %d from mm @%p\n", cpu, id, mm); - /* Mark this mm has having no context anymore */ - mm->context.id = MMU_NO_CONTEXT; - /* Flush the TLB for that context */ local_flush_tlb_mm(mm); + /* Mark this mm has having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ __clear_bit(id, stale_map[cpu]); From 217cbfa856dc1cbc2890781626c4032d9e3ec59f Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 25 May 2009 22:43:49 -0700 Subject: [PATCH 581/900] mac8390: fix regression caused during net_device_ops conversion Changeset ca17584bf2ad1b1e37a5c0e4386728cc5fc9dabc ("mac8390: update to net_device_ops") broke mac8390 by adding 8390.o to the link. That meant that lib8390.c was included twice, once in mac8390.c and once in 8390.c, subject to different macros. This patch reverts that by avoiding the wrappers in 8390.c. They seem to be of no value since COMPAT_NET_DEV_OPS is going away soon. Tested with a Kinetics EtherPort card. Signed-off-by: Finn Thain Signed-off-by: David S. 
Miller --- drivers/net/Makefile | 2 +- drivers/net/mac8390.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 1fc4602a6ff..a1c25cb4669 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -102,7 +102,7 @@ obj-$(CONFIG_HAMACHI) += hamachi.o obj-$(CONFIG_NET) += Space.o loopback.o obj-$(CONFIG_SEEQ8005) += seeq8005.o obj-$(CONFIG_NET_SB1000) += sb1000.o -obj-$(CONFIG_MAC8390) += mac8390.o 8390.o +obj-$(CONFIG_MAC8390) += mac8390.o obj-$(CONFIG_APNE) += apne.o 8390.o obj-$(CONFIG_PCMCIA_PCNET) += 8390.o obj-$(CONFIG_HP100) += hp100.o diff --git a/drivers/net/mac8390.c b/drivers/net/mac8390.c index 8e884869a05..f26667d5eaa 100644 --- a/drivers/net/mac8390.c +++ b/drivers/net/mac8390.c @@ -304,7 +304,7 @@ struct net_device * __init mac8390_probe(int unit) if (!MACH_IS_MAC) return ERR_PTR(-ENODEV); - dev = alloc_ei_netdev(); + dev = ____alloc_ei_netdev(0); if (!dev) return ERR_PTR(-ENOMEM); @@ -481,10 +481,10 @@ void cleanup_module(void) static const struct net_device_ops mac8390_netdev_ops = { .ndo_open = mac8390_open, .ndo_stop = mac8390_close, - .ndo_start_xmit = ei_start_xmit, - .ndo_tx_timeout = ei_tx_timeout, - .ndo_get_stats = ei_get_stats, - .ndo_set_multicast_list = ei_set_multicast_list, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_multicast_list = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, From c80a5cdfc5ca6533cb893154f546370da1fdb8f0 Mon Sep 17 00:00:00 2001 From: Doug Leith Date: Mon, 25 May 2009 22:44:59 -0700 Subject: [PATCH 582/900] tcp: tcp_vegas ssthresh bugfix This patch fixes ssthresh accounting issues in tcp_vegas when cwnd decreases Signed-off-by: Doug Leith Signed-off-by: David S. 
Miller --- net/ipv4/tcp_vegas.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a453aac91bd..c6743eec9b7 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,6 +158,11 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) } EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); +static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) +{ + return min(tp->snd_ssthresh, tp->snd_cwnd-1); +} + static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); @@ -221,11 +226,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; - if (diff > gamma && tp->snd_ssthresh > 2 ) { + if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ - tp->snd_ssthresh = 2; /* Set cwnd to match the actual rate * exactly: @@ -235,6 +239,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * utilization. */ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); + tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ @@ -250,6 +255,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * we slow down. */ tp->snd_cwnd--; + tp->snd_ssthresh + = tcp_vegas_ssthresh(tp); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. From 46176b4f6bac19454b7b5c35f68594b85850a600 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 26 May 2009 14:42:40 +0900 Subject: [PATCH 583/900] x86, relocs: ignore R_386_NONE in kernel relocation entries For relocatable 32bit kernels, boot/compressed/relocs.c processes relocation entries in the kernel image and appends it to the kernel image such that boot/compressed/head_32.S can relocate the kernel. 
The kernel image is one statically linked object and only uses two relocation types - R_386_PC32 and R_386_32, of the two only the latter needs massaging during kernel relocation and thus handled by relocs. R_386_PC32 is ignored and all other relocation types are considered error. When the target of a relocation resides in a discarded section, binutils doesn't throw away the relocation record but nullifies it by changing it to R_386_NONE, which unfortunately makes relocs fail. The problem was triggered by yet out-of-tree x86 stack unwind patches but given the binutils behavior, ignoring R_386_NONE is the right thing to do. The problem has been tracked down to binutils behavior by Jan Beulich. [ Impact: fix build with certain binutils by ignoring R_386_NONE ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Ingo Molnar LKML-Reference: <4A1B8150.40702@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/relocs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index 857e492c571..bbeb0c3fbd9 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -504,8 +504,11 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) if (sym->st_shndx == SHN_ABS) { continue; } - if (r_type == R_386_PC32) { - /* PC relative relocations don't need to be adjusted */ + if (r_type == R_386_NONE || r_type == R_386_PC32) { + /* + * NONE can be ignored and and PC relative + * relocations don't need to be adjusted. + */ } else if (r_type == R_386_32) { /* Visit relocations that need to be adjusted */ From f11a377b3f4e897d11f0e8d1fc688667e2f19708 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 22 May 2009 15:29:34 +0000 Subject: [PATCH 584/900] r8169: avoid losing MSI interrupts The 8169 chip only generates MSI interrupts when all enabled event sources are quiescent and one or more sources transition to active. 
If not all of the active events are acknowledged, or a new event becomes active while the existing ones are cleared in the handler, we will not see a new interrupt. The current interrupt handler masks off the Rx and Tx events once the NAPI handler has been scheduled, which opens a race window in which we can get another Rx or Tx event and never ACK'ing it, stopping all activity until the link is reset (ifconfig down/up). Fix this by always ACK'ing all event sources, and loop in the handler until we have all sources quiescent. Signed-off-by: David Dillow Tested-by: Michael Buesch Signed-off-by: David S. Miller --- drivers/net/r8169.c | 112 ++++++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 50 deletions(-) diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 0b6e8c89683..8247a945a1d 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -3554,54 +3554,64 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) int handled = 0; int status; + /* loop handling interrupts until we have no new ones or + * we hit a invalid/hotplug case. + */ status = RTL_R16(IntrStatus); + while (status && status != 0xffff) { + handled = 1; - /* hotplug/major error/no more work/shared irq */ - if ((status == 0xffff) || !status) - goto out; - - handled = 1; - - if (unlikely(!netif_running(dev))) { - rtl8169_asic_down(ioaddr); - goto out; - } - - status &= tp->intr_mask; - RTL_W16(IntrStatus, - (status & RxFIFOOver) ? 
(status | RxOverflow) : status); - - if (!(status & tp->intr_event)) - goto out; - - /* Work around for rx fifo overflow */ - if (unlikely(status & RxFIFOOver) && - (tp->mac_version == RTL_GIGA_MAC_VER_11)) { - netif_stop_queue(dev); - rtl8169_tx_timeout(dev); - goto out; - } - - if (unlikely(status & SYSErr)) { - rtl8169_pcierr_interrupt(dev); - goto out; - } - - if (status & LinkChg) - rtl8169_check_link_status(dev, tp, ioaddr); - - if (status & tp->napi_event) { - RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event); - tp->intr_mask = ~tp->napi_event; - - if (likely(napi_schedule_prep(&tp->napi))) - __napi_schedule(&tp->napi); - else if (netif_msg_intr(tp)) { - printk(KERN_INFO "%s: interrupt %04x in poll\n", - dev->name, status); + /* Handle all of the error cases first. These will reset + * the chip, so just exit the loop. + */ + if (unlikely(!netif_running(dev))) { + rtl8169_asic_down(ioaddr); + break; } + + /* Work around for rx fifo overflow */ + if (unlikely(status & RxFIFOOver) && + (tp->mac_version == RTL_GIGA_MAC_VER_11)) { + netif_stop_queue(dev); + rtl8169_tx_timeout(dev); + break; + } + + if (unlikely(status & SYSErr)) { + rtl8169_pcierr_interrupt(dev); + break; + } + + if (status & LinkChg) + rtl8169_check_link_status(dev, tp, ioaddr); + + /* We need to see the lastest version of tp->intr_mask to + * avoid ignoring an MSI interrupt and having to wait for + * another event which may never come. + */ + smp_rmb(); + if (status & tp->intr_mask & tp->napi_event) { + RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event); + tp->intr_mask = ~tp->napi_event; + + if (likely(napi_schedule_prep(&tp->napi))) + __napi_schedule(&tp->napi); + else if (netif_msg_intr(tp)) { + printk(KERN_INFO "%s: interrupt %04x in poll\n", + dev->name, status); + } + } + + /* We only get a new MSI interrupt when all active irq + * sources on the chip have been acknowledged. 
So, ack + * everything we've seen and check if new sources have become + * active to avoid blocking all interrupts from the chip. + */ + RTL_W16(IntrStatus, + (status & RxFIFOOver) ? (status | RxOverflow) : status); + status = RTL_R16(IntrStatus); } -out: + return IRQ_RETVAL(handled); } @@ -3617,13 +3627,15 @@ static int rtl8169_poll(struct napi_struct *napi, int budget) if (work_done < budget) { napi_complete(napi); - tp->intr_mask = 0xffff; - /* - * 20040426: the barrier is not strictly required but the - * behavior of the irq handler could be less predictable - * without it. Btw, the lack of flush for the posted pci - * write is safe - FR + + /* We need for force the visibility of tp->intr_mask + * for other CPUs, as we can loose an MSI interrupt + * and potentially wait for a retransmit timeout if we don't. + * The posted write to IntrMask is safe, as it will + * eventually make it to the chip and we won't loose anything + * until it does. */ + tp->intr_mask = 0xffff; smp_wmb(); RTL_W16(IntrMask, tp->intr_event); } From 4319503779060120fa5de9b8fde056603bb6e0fd Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 6 Mar 2009 20:24:57 +0000 Subject: [PATCH 585/900] [CPUFREQ] add atom family to p4-clockmod Some atom procs don't do freq scaling (such as the atom 330 on my own littlefalls2 board). By adding the atom family here, we at least get the benefit of passive cooling in a thermal emergency. Not sure how to see that its actually helping any, but the driver does bind and claim its functioning on my atom 330. 
Signed-off-by: Jarod Wilson Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 6ac55bd341a..86961519372 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) case 0x0E: /* Core */ case 0x0F: /* Core Duo */ case 0x16: /* Celeron Core */ + case 0x1C: /* Atom */ p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); case 0x0D: /* Pentium M (Dothan) */ From d38e73e8dad454a5916f446b0d3523c1161ae95a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 23 Apr 2009 13:36:12 -0400 Subject: [PATCH 586/900] [CPUFREQ] powernow-k7 build fix when ACPI=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kernel/cpu/cpufreq/powernow-k7.c:172: warning: 'invalidate_entry' defined but not used Reported-by: Toralf Förster Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 3c28ccd4974..a8363e5be4e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -168,10 +168,12 @@ static int check_powernow(void) return 1; } +#ifdef CONFIG_X86_POWERNOW_K7_ACPI static void invalidate_entry(unsigned int entry) { powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; } +#endif static int get_ranges(unsigned char *pst) { From 42a06f2166f2f6f7bf04f32b4e823eacdceafdc9 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sun, 17 May 2009 10:23:52 -0400 Subject: [PATCH 587/900] [CPUFREQ] remove rwsem lock from CPUFREQ_GOV_STOP call * Rafael J. 
Wysocki (rjw@sisk.pl) wrote: > This message has been generated automatically as a part of a report > of regressions introduced between 2.6.28 and 2.6.29. > > The following bug entry is on the current list of known regressions > introduced between 2.6.28 and 2.6.29. Please verify if it still should > be listed and let me know (either way). > > > Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=13186 > Subject : cpufreq timer teardown problem > Submitter : Mathieu Desnoyers > Date : 2009-04-23 14:00 (24 days old) > References : http://marc.info/?l=linux-kernel&m=124049523515036&w=4 > Handled-By : Mathieu Desnoyers > Patch : http://patchwork.kernel.org/patch/19754/ > http://patchwork.kernel.org/patch/19753/ The patches linked above depend on the following patch to remove circular locking dependency : cpufreq: remove rwsem lock from CPUFREQ_GOV_STOP call (the following issue was faced when using cancel_delayed_work_sync() in the timer teardown (which fixes a race). * KOSAKI Motohiro (kosaki.motohiro@jp.fujitsu.com) wrote: > Hi > > my box output following warnings. > it seems regression by commit 7ccc7608b836e58fbacf65ee4f8eefa288e86fac. > > A: work -> do_dbs_timer() -> cpu_policy_rwsem > B: store() -> cpu_policy_rwsem -> cpufreq_governor_dbs() -> work > > Hrm, I think it must be due to my attempt to fix the timer teardown race in ondemand governor mixed with new locking behavior in 2.6.30-rc. The rwlock seems to be taken around the whole call to cpufreq_governor_dbs(), when it should be only taken around accesses to the locked data, and especially *not* around the call to dbs_timer_exit(). Reverting my fix attempt would put the teardown race back in place (replacing the cancel_delayed_work_sync by cancel_delayed_work). Instead, a proper fix would imply modifying this critical section : cpufreq.c: __cpufreq_remove_dev() ... 
if (cpufreq_driver->target) __cpufreq_governor(data, CPUFREQ_GOV_STOP); unlock_policy_rwsem_write(cpu); To make sure the __cpufreq_governor() callback is not called with rwsem held. This would allow execution of cancel_delayed_work_sync() without being nested within the rwsem. Applies on top of the 2.6.30-rc5 tree. Required to remove circular dep in teardown of both conservative and ondemand governors so they can use cancel_delayed_work_sync(). CPUFREQ_GOV_STOP does not modify the policy, therefore this locking seemed unneeded. Signed-off-by: Mathieu Desnoyers CC: KOSAKI Motohiro Cc: Greg KH CC: Ingo Molnar CC: "Rafael J. Wysocki" CC: Ben Slusky CC: Chris Wright CC: Andrew Morton Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d270e8eb3e6..47d2ad0ae07 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1070,11 +1070,11 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) spin_unlock_irqrestore(&cpufreq_driver_lock, flags); #endif + unlock_policy_rwsem_write(cpu); + if (cpufreq_driver->target) __cpufreq_governor(data, CPUFREQ_GOV_STOP); - unlock_policy_rwsem_write(cpu); - kobject_put(&data->kobj); /* we need to make sure that the underlying kobj is actually From b253d2b2d28ead6fed012feb54694b3d0562839a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sun, 17 May 2009 10:29:33 -0400 Subject: [PATCH 588/900] [CPUFREQ] fix timer teardown in conservative governor * Rafael J. Wysocki (rjw@sisk.pl) wrote: > This message has been generated automatically as a part of a report > of regressions introduced between 2.6.28 and 2.6.29. > > The following bug entry is on the current list of known regressions > introduced between 2.6.28 and 2.6.29. Please verify if it still should > be listed and let me know (either way).
> > > Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=13186 > Subject : cpufreq timer teardown problem > Submitter : Mathieu Desnoyers > Date : 2009-04-23 14:00 (24 days old) > References : http://marc.info/?l=linux-kernel&m=124049523515036&w=4 > Handled-By : Mathieu Desnoyers > Patch : http://patchwork.kernel.org/patch/19754/ > http://patchwork.kernel.org/patch/19753/ > (re-send with updated changelog) cpufreq fix timer teardown in conservative governor The problem is that dbs_timer_exit() uses cancel_delayed_work() when it should use cancel_delayed_work_sync(). cancel_delayed_work() does not wait for the workqueue handler to exit. The ondemand governor does not seem to be affected because the "if (!dbs_info->enable)" check at the beginning of the workqueue handler returns immediately without rescheduling the work. The conservative governor in 2.6.30-rc has the same check as the ondemand governor, which makes things usually run smoothly. However, if the governor is quickly stopped and then started, this could lead to the following race : dbs_enable could be reenabled and multiple do_dbs_timer handlers would run. This is why a synchronized teardown is required. Depends on patch cpufreq: remove rwsem lock from CPUFREQ_GOV_STOP call The following patch applies to 2.6.30-rc2. Stable kernels have a similar issue which should also be fixed, but the code changed between 2.6.29 and 2.6.30, so this patch only applies to 2.6.30-rc. 
Signed-off-by: Mathieu Desnoyers CC: Andrew Morton CC: gregkh@suse.de CC: stable@kernel.org CC: cpufreq@vger.kernel.org CC: Ingo Molnar CC: rjw@sisk.pl CC: Ben Slusky Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_conservative.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 2ecd95e4ab1..7a74d175287 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -91,6 +91,9 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */ * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock * is recursive for the same process. -Venki + * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it + * would deadlock with cancel_delayed_work_sync(), which is needed for proper + * raceless workqueue teardown. */ static DEFINE_MUTEX(dbs_mutex); @@ -542,7 +545,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { dbs_info->enable = 0; - cancel_delayed_work(&dbs_info->work); + cancel_delayed_work_sync(&dbs_info->work); } static int cpufreq_governor_dbs(struct cpufreq_policy *policy, From b14893a62c73af0eca414cfed505b8c09efc613c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sun, 17 May 2009 10:30:45 -0400 Subject: [PATCH 589/900] [CPUFREQ] fix timer teardown in ondemand governor * Rafael J. Wysocki (rjw@sisk.pl) wrote: > This message has been generated automatically as a part of a report > of regressions introduced between 2.6.28 and 2.6.29. > > The following bug entry is on the current list of known regressions > introduced between 2.6.28 and 2.6.29. Please verify if it still should > be listed and let me know (either way). 
> > > Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=13186 > Subject : cpufreq timer teardown problem > Submitter : Mathieu Desnoyers > Date : 2009-04-23 14:00 (24 days old) > References : http://marc.info/?l=linux-kernel&m=124049523515036&w=4 > Handled-By : Mathieu Desnoyers > Patch : http://patchwork.kernel.org/patch/19754/ > http://patchwork.kernel.org/patch/19753/ > (updated changelog) cpufreq fix timer teardown in ondemand governor The problem is that dbs_timer_exit() uses cancel_delayed_work() when it should use cancel_delayed_work_sync(). cancel_delayed_work() does not wait for the workqueue handler to exit. The ondemand governor does not seem to be affected because the "if (!dbs_info->enable)" check at the beginning of the workqueue handler returns immediately without rescheduling the work. The conservative governor in 2.6.30-rc has the same check as the ondemand governor, which makes things usually run smoothly. However, if the governor is quickly stopped and then started, this could lead to the following race : dbs_enable could be reenabled and multiple do_dbs_timer handlers would run. This is why a synchronized teardown is required. The following patch applies to, at least, 2.6.28.x, 2.6.29.1, 2.6.30-rc2. 
Depends on patch cpufreq: remove rwsem lock from CPUFREQ_GOV_STOP call Signed-off-by: Mathieu Desnoyers CC: Andrew Morton CC: gregkh@suse.de CC: stable@kernel.org CC: cpufreq@vger.kernel.org CC: Ingo Molnar CC: rjw@sisk.pl CC: Ben Slusky Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_ondemand.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 338f428a15b..e741c339df7 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -98,6 +98,9 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */ * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock * is recursive for the same process. -Venki + * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it + * would deadlock with cancel_delayed_work_sync(), which is needed for proper + * raceless workqueue teardown. 
*/ static DEFINE_MUTEX(dbs_mutex); @@ -562,7 +565,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { dbs_info->enable = 0; - cancel_delayed_work(&dbs_info->work); + cancel_delayed_work_sync(&dbs_info->work); } static int cpufreq_governor_dbs(struct cpufreq_policy *policy, From df1829770db415dc5a5ed5ada3bd70176c6f0a01 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Wed, 22 Apr 2009 13:48:32 +0200 Subject: [PATCH 590/900] [CPUFREQ] powernow-k8 cleanup msg if BIOS does not export ACPI _PSS cpufreq data - Make the message shorter and easier to grep for - Use printk_once instead of WARN_ONCE (functionality of these was mixed) Signed-off-by: Thomas Renninger Cc: Langsdorf, Mark Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 4709ead2db5..feef10c085a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1215,13 +1215,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol) return cpufreq_frequency_table_verify(pol, data->powernow_table); } +static const char ACPI_PSS_BIOS_BUG_MSG[] = + KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" + KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + /* per CPU init entry point to the driver */ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask; int rc; - static int print_once; if (!cpu_online(pol->cpu)) return -ENODEV; @@ -1244,19 +1247,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) * an UP version, and is deprecated by AMD. 
*/ if (num_online_cpus() != 1) { - /* - * Replace this one with print_once as soon as such a - * thing gets introduced - */ - if (!print_once) { - WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS " - "does not provide ACPI _PSS objects " - "in a way that Linux understands. " - "Please report this to the Linux ACPI" - " maintainers and complain to your " - "BIOS vendor.\n"); - print_once++; - } + printk_once(ACPI_PSS_BIOS_BUG_MSG); goto err_out; } if (pol->cpu != 0) { From ca446d06351992e4f1a7c1e5e99870ab4ec5188f Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 22 Apr 2009 13:48:33 +0200 Subject: [PATCH 591/900] [CPUFREQ] powernow-k8: determine exact CPU frequency for HW Pstates Slightly modified by trenn@suse.de -> only do this on fam 10h and fam 11h. Currently powernow-k8 determines CPU frequency from ACPI PSS objects, but according to AMD family 11h BKDG this frequency is just a rounded value: "CoreFreq (MHz) = The CPU COF specified by MSRC001_00[6B:64][CpuFid] rounded to the nearest 100 Mhz." As a consequnce powernow-k8 reports wrong CPU frequency on some systems, e.g. on Turion X2 Ultra: powernow-k8: Found 1 AMD Turion(tm)X2 Ultra DualCore Mobile ZM-82 processors (2 cpu cores) (version 2.20.00) powernow-k8: 0 : pstate 0 (2200 MHz) powernow-k8: 1 : pstate 1 (1100 MHz) powernow-k8: 2 : pstate 2 (600 MHz) But this is wrong as frequency for Pstate2 is 550 MHz. x86info reports it correctly: #x86info -a |grep Pstate ... Pstate-0: fid=e, did=0, vid=24 (2200MHz) Pstate-1: fid=e, did=1, vid=30 (1100MHz) Pstate-2: fid=e, did=2, vid=3c (550MHz) (current) Solution is to determine the frequency directly from Pstate MSRs instead of using rounded values from ACPI table. 
Signed-off-by: Andreas Herrmann Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index feef10c085a..f6b32d11235 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data) data->batps); } +static u32 freq_from_fid_did(u32 fid, u32 did) +{ + u32 mhz = 0; + + if (boot_cpu_data.x86 == 0x10) + mhz = (100 * (fid + 0x10)) >> did; + else if (boot_cpu_data.x86 == 0x11) + mhz = (100 * (fid + 8)) >> did; + else + BUG(); + + return mhz * 1000; +} + static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) { @@ -923,8 +937,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, powernow_table[i].index = index; - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; + /* Frequency may be rounded for these */ + if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { + powernow_table[i].frequency = + freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); + } else + powernow_table[i].frequency = + data->acpi_data.states[i].core_frequency * 1000; } return 0; } From e4a5d54f924ea5ce2913d9d0687d034004816465 Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Tue, 26 May 2009 11:31:00 +0800 Subject: [PATCH 592/900] drm/i915: Add support for VGA load detection (pre-945). Two approaches for VGA detections: hot plug detection for 945G onwards and load pipe detection for Pre-945G. Load pipe detection will get one free pipe, set border color as red and blue, then check CRT status by swf register. This is a sync-up with the 2D driver. 
Signed-off-by: Ma Ling Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/intel_crt.c | 149 ++++++++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 19148c3df63..640f5158eff 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -198,9 +198,142 @@ static bool intel_crt_detect_ddc(struct drm_connector *connector) return intel_ddc_probe(intel_output); } +static enum drm_connector_status +intel_crt_load_detect(struct drm_crtc *crtc, struct intel_output *intel_output) +{ + struct drm_encoder *encoder = &intel_output->enc; + struct drm_device *dev = encoder->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + struct intel_crtc *intel_crtc = to_intel_crtc(crtc); + uint32_t pipe = intel_crtc->pipe; + uint32_t save_bclrpat; + uint32_t save_vtotal; + uint32_t vtotal, vactive; + uint32_t vsample; + uint32_t vblank, vblank_start, vblank_end; + uint32_t dsl; + uint32_t bclrpat_reg; + uint32_t vtotal_reg; + uint32_t vblank_reg; + uint32_t vsync_reg; + uint32_t pipeconf_reg; + uint32_t pipe_dsl_reg; + uint8_t st00; + enum drm_connector_status status; + + if (pipe == 0) { + bclrpat_reg = BCLRPAT_A; + vtotal_reg = VTOTAL_A; + vblank_reg = VBLANK_A; + vsync_reg = VSYNC_A; + pipeconf_reg = PIPEACONF; + pipe_dsl_reg = PIPEADSL; + } else { + bclrpat_reg = BCLRPAT_B; + vtotal_reg = VTOTAL_B; + vblank_reg = VBLANK_B; + vsync_reg = VSYNC_B; + pipeconf_reg = PIPEBCONF; + pipe_dsl_reg = PIPEBDSL; + } + + save_bclrpat = I915_READ(bclrpat_reg); + save_vtotal = I915_READ(vtotal_reg); + vblank = I915_READ(vblank_reg); + + vtotal = ((save_vtotal >> 16) & 0xfff) + 1; + vactive = (save_vtotal & 0x7ff) + 1; + + vblank_start = (vblank & 0xfff) + 1; + vblank_end = ((vblank >> 16) & 0xfff) + 1; + + /* Set the border color to purple. 
*/ + I915_WRITE(bclrpat_reg, 0x500050); + + if (IS_I9XX(dev)) { + uint32_t pipeconf = I915_READ(pipeconf_reg); + I915_WRITE(pipeconf_reg, pipeconf | PIPECONF_FORCE_BORDER); + /* Wait for next Vblank to substitue + * border color for Color info */ + intel_wait_for_vblank(dev); + st00 = I915_READ8(VGA_MSR_WRITE); + status = ((st00 & (1 << 4)) != 0) ? + connector_status_connected : + connector_status_disconnected; + + I915_WRITE(pipeconf_reg, pipeconf); + } else { + bool restore_vblank = false; + int count, detect; + + /* + * If there isn't any border, add some. + * Yes, this will flicker + */ + if (vblank_start <= vactive && vblank_end >= vtotal) { + uint32_t vsync = I915_READ(vsync_reg); + uint32_t vsync_start = (vsync & 0xffff) + 1; + + vblank_start = vsync_start; + I915_WRITE(vblank_reg, + (vblank_start - 1) | + ((vblank_end - 1) << 16)); + restore_vblank = true; + } + /* sample in the vertical border, selecting the larger one */ + if (vblank_start - vactive >= vtotal - vblank_end) + vsample = (vblank_start + vactive) >> 1; + else + vsample = (vtotal + vblank_end) >> 1; + + /* + * Wait for the border to be displayed + */ + while (I915_READ(pipe_dsl_reg) >= vactive) + ; + while ((dsl = I915_READ(pipe_dsl_reg)) <= vsample) + ; + /* + * Watch ST00 for an entire scanline + */ + detect = 0; + count = 0; + do { + count++; + /* Read the ST00 VGA status register */ + st00 = I915_READ8(VGA_MSR_WRITE); + if (st00 & (1 << 4)) + detect++; + } while ((I915_READ(pipe_dsl_reg) == dsl)); + + /* restore vblank if necessary */ + if (restore_vblank) + I915_WRITE(vblank_reg, vblank); + /* + * If more than 3/4 of the scanline detected a monitor, + * then it is assumed to be present. This works even on i830, + * where there isn't any way to force the border color across + * the screen + */ + status = detect * 4 > count * 3 ? 
+ connector_status_connected : + connector_status_disconnected; + } + + /* Restore previous settings */ + I915_WRITE(bclrpat_reg, save_bclrpat); + + return status; +} + static enum drm_connector_status intel_crt_detect(struct drm_connector *connector) { struct drm_device *dev = connector->dev; + struct intel_output *intel_output = to_intel_output(connector); + struct drm_encoder *encoder = &intel_output->enc; + struct drm_crtc *crtc; + int dpms_mode; + enum drm_connector_status status; if (IS_I9XX(dev) && !IS_I915G(dev) && !IS_I915GM(dev)) { if (intel_crt_detect_hotplug(connector)) @@ -212,8 +345,20 @@ static enum drm_connector_status intel_crt_detect(struct drm_connector *connecto if (intel_crt_detect_ddc(connector)) return connector_status_connected; - /* TODO use load detect */ - return connector_status_unknown; + /* for pre-945g platforms use load detect */ + if (encoder->crtc && encoder->crtc->enabled) { + status = intel_crt_load_detect(encoder->crtc, intel_output); + } else { + crtc = intel_get_load_detect_pipe(intel_output, + NULL, &dpms_mode); + if (crtc) { + status = intel_crt_load_detect(crtc, intel_output); + intel_release_load_detect_pipe(intel_output, dpms_mode); + } else + status = connector_status_unknown; + } + + return status; } static void intel_crt_destroy(struct drm_connector *connector) From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 May 2009 20:25:22 +0200 Subject: [PATCH 593/900] tracing: add __print_flags for events Developers have been asking for the ability in the ftrace event tracer to display names of bits in a flags variable. Instead of printing out c2, it would be easier to read FOO|BAR|GOO, assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7. Some examples where this would be useful are the state flags in a context switch, kmalloc flags, and even permision flags in accessing files. 
[ v2 changes include: Frederic Weisbecker's idea of using a mask instead of bits, thus we can output GFP_KERNEL instead of GFP_WAIT|GFP_IO|GFP_FS. Li Zefan's idea of allowing the caller of __print_flags to add their own delimiter (or no delimiter) where we can get for file permissions rwx instead of r|w|x. ] [ v3 changes: Christoph Hellwig's idea of using an array instead of va_args. ] [ Impact: better displaying of flags in trace output ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 13 +++++++++++- include/trace/ftrace.h | 14 +++++++++++++ kernel/trace/trace_output.c | 39 ++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bae51ddfabd..4b58cf1a11c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -3,12 +3,23 @@ #include #include - +#include struct trace_array; struct tracer; struct dentry; +DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq); + +struct trace_print_flags { + unsigned long mask; + const char *name; +}; + +const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); + /* * The trace entry - the most basic unit of tracing.
This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b5ff2e8229e..22c94719c56 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -87,6 +87,7 @@ * struct trace_seq *s = &iter->seq; * struct ftrace_raw_ *field; <-- defined in stage 1 * struct trace_entry *entry; + * struct trace_seq *p; * int ret; * * entry = iter->ent; @@ -98,7 +99,9 @@ * * field = (typeof(field))entry; * + * p = get_cpu_var(ftrace_event_seq); * ret = trace_seq_printf(s, "\n"); + * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -119,6 +122,14 @@ #undef __get_str #define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) +#undef __print_flags +#define __print_flags(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags flags[] = \ + { flag_array, { -1, NULL }}; \ + ftrace_print_flags_seq(p, delim, flag, flags); \ + }) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ @@ -127,6 +138,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##call *field; \ struct trace_entry *entry; \ + struct trace_seq *p; \ int ret; \ \ entry = iter->ent; \ @@ -138,7 +150,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ + p = &get_cpu_var(ftrace_event_seq); \ ret = trace_seq_printf(s, #call ": " print); \ + put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7136420603a..a4840c260c8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -15,6 +15,9 @@ #define EVENT_HASHSIZE 128 static DECLARE_RWSEM(trace_event_mutex); + +DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); + static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = 
__TRACE_LAST_TYPE + 1; @@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 0; } +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + int i; + + trace_seq_init(p); + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { From 937cdb9db7f59278d0cb1582e6e64e3dfd73b4fc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 May 2009 10:51:13 -0400 Subject: [PATCH 594/900] tracing: add previous task state info to sched switch event It is useful to see the state of a task that is being switched out. This patch adds the output of the state of the previous task in the context switch event. 
[ Impact: see state of switched out task in context switch ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/events/sched.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index dd4033cf5b0..24ab5bcff7b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -156,6 +156,7 @@ TRACE_EVENT(sched_switch, __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) + __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) @@ -165,13 +166,19 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; + __entry->prev_state = prev->state; memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; ), - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->prev_state ? + __print_flags(__entry->prev_state, "|", + { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, + { 16, "Z" }, { 32, "X" }, { 64, "x" }, + { 128, "W" }) : "R", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); From 62ba180e80f4194a498585ac0e4c07daa8ca08d1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 May 2009 16:16:30 -0400 Subject: [PATCH 595/900] tracing: add flag output for kmem events This patch changes the output for gfp_flags from being a simple hex value to the actual names. 
gfp_flags=GFP_ATOMIC instead of gfp_flags=00000020 And even gfp_flags=GFP_KERNEL instead of gfp_flags=000000d0 (Thanks to Frederic Weisbecker for pointing out that the first version had a bad order of GFP masks) [ Impact: more human readable output from tracer ] Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/events/kmem.h | 53 +++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index c22c42f980b..9baba50d651 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -7,6 +7,43 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kmem +/* + * The order of these masks is important. Matching masks will be seen + * first and the left over flags will end up showing by themselves. + * + * For example, if we have GFP_KERNEL before GFP_USER we will get: + * + * GFP_KERNEL|GFP_HARDWALL + * + * Thus most bits set go first. + */ +#define show_gfp_flags(flags) \ + (flags) ?
__print_flags(flags, "|", \ + {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \ + {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ + {(unsigned long)GFP_USER, "GFP_USER"}, \ + {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \ + {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ + {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ + {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ + {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ + {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \ + {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \ + {(unsigned long)__GFP_IO, "GFP_IO"}, \ + {(unsigned long)__GFP_COLD, "GFP_COLD"}, \ + {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \ + {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \ + {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \ + {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \ + {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ + {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ + {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ + {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ + {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ + {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"} \ + ) : "GFP_NOWAIT" + TRACE_EVENT(kmalloc, TP_PROTO(unsigned long call_site, @@ -33,12 +70,12 @@ TRACE_EVENT(kmalloc, __entry->gfp_flags = gfp_flags; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags) + show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(kmem_cache_alloc, @@ -67,12 +104,12 @@ TRACE_EVENT(kmem_cache_alloc, __entry->gfp_flags = gfp_flags; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags) + 
show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(kmalloc_node, @@ -104,12 +141,12 @@ TRACE_EVENT(kmalloc_node, __entry->node = node; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, + show_gfp_flags(__entry->gfp_flags), __entry->node) ); @@ -142,12 +179,12 @@ TRACE_EVENT(kmem_cache_alloc_node, __entry->node = node; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, + show_gfp_flags(__entry->gfp_flags), __entry->node) ); From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2009 19:21:47 -0400 Subject: [PATCH 596/900] tracing: add __print_symbolic to trace events This patch adds __print_symbolic which is similar to __print_flags but works for an enumeration type instead. That is, there is only a one to one mapping between the values and the symbols. When a match is made, then it is printed, otherwise the hex value is output.
[ Impact: add interface for showing symbol names in events ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 3 +++ include/trace/ftrace.h | 8 ++++++++ kernel/trace/trace_output.c | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4b58cf1a11c..bbf40f624fc 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -20,6 +20,9 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); +const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 22c94719c56..87fc227c6fb 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -130,6 +130,14 @@ ftrace_print_flags_seq(p, delim, flag, flags); \ }) +#undef __print_symbolic +#define __print_symbolic(value, symbol_array...) 
\ + ({ \ + static const struct trace_print_flags symbols[] = \ + { symbol_array, { -1, NULL }}; \ + ftrace_print_symbols_seq(p, value, symbols); \ + }) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a4840c260c8..c12d95db2f5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + + trace_seq_init(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { From c2adae0970ca1db8adb92fb56ae3bcabd916e8bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2009 19:56:19 -0400 Subject: [PATCH 597/900] tracing: convert irq events to use __print_symbolic The recording of the names at trace time is inefficient. This patch implements the softirq event recording to only record the vector and then use the __print_symbolic interface to print out the names. 
[ Impact: faster recording of softirq events ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/events/irq.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 32a9f7ef432..683fb36a994 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,6 +7,19 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq +#define softirq_name(sirq) { sirq, #sirq } +#define show_softirq_name(val) \ + __print_symbolic(val, \ + softirq_name(HI_SOFTIRQ), \ + softirq_name(TIMER_SOFTIRQ), \ + softirq_name(NET_TX_SOFTIRQ), \ + softirq_name(NET_RX_SOFTIRQ), \ + softirq_name(BLOCK_SOFTIRQ), \ + softirq_name(TASKLET_SOFTIRQ), \ + softirq_name(SCHED_SOFTIRQ), \ + softirq_name(HRTIMER_SOFTIRQ), \ + softirq_name(RCU_SOFTIRQ)) + /** * irq_handler_entry - called immediately before the irq action handler * @irq: irq number @@ -87,15 +100,14 @@ TRACE_EVENT(softirq_entry, TP_STRUCT__entry( __field( int, vec ) - __string( name, softirq_to_name[h-vec] ) ), TP_fast_assign( __entry->vec = (int)(h - vec); - __assign_str(name, softirq_to_name[h-vec]); ), - TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) + TP_printk("softirq=%d action=%s", __entry->vec, + show_softirq_name(__entry->vec)) ); /** @@ -117,15 +129,14 @@ TRACE_EVENT(softirq_exit, TP_STRUCT__entry( __field( int, vec ) - __string( name, softirq_to_name[h-vec] ) ), TP_fast_assign( __entry->vec = (int)(h - vec); - __assign_str(name, softirq_to_name[h-vec]); ), - TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) + TP_printk("softirq=%d action=%s", __entry->vec, + show_softirq_name(__entry->vec)) ); #endif /* _TRACE_IRQ_H */ From 68743082b560067e3e93eab8b2568f238e486865 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Tue, 26 May 2009 14:51:00 -0400 Subject: [PATCH 598/900] XPRTRDMA: fix client rpcrdma FRMR registration on mlx4 devices mlx4/connectX FRMR requires local write 
enable together with remote rdma write enable. This fixes NFS/RDMA operation over the ConnectX Infiniband HCA in the default memreg mode. Signed-off-by: Vu Pham Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3b21e0cc5e6..465aafc2007 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1495,7 +1495,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; frmr_wr.wr.fast_reg.access_flags = (writing ? - IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ); + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ); frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); From d0367a508af9cf97beb202935bb9ad8883d30cd1 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 26 May 2009 14:51:00 -0400 Subject: [PATCH 599/900] nfs: fix build error in nfsroot with initconst fix build error with latest kbuild adjustments to initconst. The commit a447c0932445f92ce6f4c1bd020f62c5097a7842 ("vfs: Use const for kernel parser table") changed: static match_table_t __initdata tokens = { to static match_table_t __initconst tokens = { But the missing const causes powerpc to fail with latest updates to __initconst like this: fs/nfs/nfsroot.c:400: error: __setup_str_nfs_root_setup causes a section type conflict fs/nfs/nfsroot.c:400: error: __setup_str_nfs_root_setup causes a section type conflict The bug is only present with kbuild-next. Following patch has been build tested.
Signed-off-by: Sam Ravnborg Cc: Steven Whitehouse Cc: Stephen Rothwell Acked-by: Jan Beulich Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index d9ef602fbc5..e3ed5908820 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -129,7 +129,7 @@ enum { Opt_err }; -static match_table_t __initconst tokens = { +static const match_table_t tokens __initconst = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, From 95baa25c7321eb8613246acbf61b97911cc748d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 May 2009 14:51:00 -0400 Subject: [PATCH 600/900] NFSv4: Fix the case where NFSv4 renewal fails If the asynchronous lease renewal fails (usually due to a soft timeout), then we _must_ schedule state recovery in order to ensure that we don't lose the lease unnecessarily or, if the lease is already lost, that we recover the locking state promptly... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a4d24268029..4674f8092da 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2594,12 +2594,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) unsigned long timestamp = (unsigned long)data; if (task->tk_status < 0) { - switch (task->tk_status) { - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_CB_PATH_DOWN: - nfs4_schedule_state_recovery(clp); - } + /* Unless we're shutting down, schedule state recovery! */ + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) + nfs4_schedule_state_recovery(clp); return; } spin_lock(&clp->cl_lock); From ab2b7ebaad16226c9a5e85c5f384d19fa58a7459 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 May 2009 09:11:03 +0100 Subject: [PATCH 601/900] kmod: Release sub_info on cred allocation failure. 
call_usermodehelper_setup() forgot to kfree(sub_info) when prepare_usermodehelper_creds() failed. Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/kmod.c b/kernel/kmod.c index b750675251e..7e95bedb2bf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->argv = argv; sub_info->envp = envp; sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) + if (!sub_info->cred) { + kfree(sub_info); return NULL; + } out: return sub_info; From 564346224daaa8f7222d7a92cdbb7bafde59ae6e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 May 2009 20:54:41 +0930 Subject: [PATCH 602/900] lguest: fix on Intel when KVM loaded (unhandled trap 13) When KVM is loaded, and hence VT set up, the vmcall instruction in an lguest guest causes a #GP, not #UD. Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- drivers/lguest/x86/core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 1a83910f674..eaf722fe309 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -358,6 +358,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) if (emulate_insn(cpu)) return; } + /* If KVM is active, the vmcall instruction triggers a + * General Protection Fault. Normally it triggers an + * invalid opcode fault (6): */ + case 6: + /* We need to check if ring == GUEST_PL and + * faulting instruction == vmcall. */ + if (is_hypercall(cpu)) { + rewrite_hypercall(cpu); + return; + } break; case 14: /* We've intercepted a Page Fault. */ /* The Guest accessed a virtual address that wasn't mapped. @@ -403,15 +413,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * up the pointer now to indicate a hypercall is pending. 
*/ cpu->hcall = (struct hcall_args *)cpu->regs; return; - case 6: - /* kvm hypercalls trigger an invalid opcode fault (6). - * We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ - if (is_hypercall(cpu)) { - rewrite_hypercall(cpu); - return; - } - break; } /* We didn't handle the trap, so it needs to go to the Guest. */ From 2171787be2e71ff71159857bfeb21398b61eb615 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Tue, 26 May 2009 10:33:35 -0700 Subject: [PATCH 603/900] x86: avoid back to back on_each_cpu in cpa_flush_array Cleanup cpa_flush_array() to avoid back to back on_each_cpu() calls. [ Impact: optimizes fix 0af48f42df15b97080b450d24219dd95db7b929a ] Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 0f9052bcec4..e17efed088c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -204,30 +204,19 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void wbinvd_local(void *unused) -{ - wbinvd(); -} - static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; + unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_range, NULL, 1); + on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); - if (!cache) + if (!cache || do_wbinvd) return; - /* 4M threshold */ - if (numpages >= 1024) { - if (boot_cpu_data.x86 >= 4) - on_each_cpu(wbinvd_local, NULL, 1); - - return; - } /* * We only need to flush on one CPU, * clflush is a MESI-coherent instruction that From b1338d199dda6681d9af0297928af0a7eb9cba7b Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Tue, 26 May 2009 12:15:53 +0900 Subject: [PATCH 604/900] tomoyo: add missing call to cap_bprm_set_creds 
cap_bprm_set_creds() has to be called from security_bprm_set_creds(). TOMOYO forgot to call cap_bprm_set_creds() from tomoyo_bprm_set_creds() and suid executables were not working. Make sure we call cap_bprm_set_creds() with TOMOYO, to set credentials properly inside tomoyo_bprm_set_creds(). Signed-off-by: Herton Ronaldo Krzesinski Acked-by: Tetsuo Handa Signed-off-by: James Morris --- security/tomoyo/tomoyo.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 5b481912752..e42be5c4f05 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -27,6 +27,12 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, static int tomoyo_bprm_set_creds(struct linux_binprm *bprm) { + int rc; + + rc = cap_bprm_set_creds(bprm); + if (rc) + return rc; + /* * Do only if this function is called for the first time of an execve * operation. From e76a16deb8785317a23cca7204331af053e0fb4e Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 26 May 2009 17:44:56 -0700 Subject: [PATCH 605/900] drm/i915: Fix tiling pitch handling on 8xx. The pitch field is an exponent on pre-965, so we were rejecting buffers on 8xx that we shouldn't have. 915 got lucky in that the largest legal value happened to match (8KB / 512 = 0x10), but 8xx has a smaller tile width. Additionally, we programmed that bad value into the register on 8xx, so the only pitch that would work correctly was 4096 (512-1023 pixels), while others would probably give bad rendering or hangs. Signed-off-by: Eric Anholt fd.o bug #20473.
--- drivers/gpu/drm/i915/i915_gem.c | 6 ++++-- drivers/gpu/drm/i915/i915_gem_tiling.c | 14 +++++++++++--- drivers/gpu/drm/i915/i915_reg.h | 3 ++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 717b6a854bc..e4408daf8ce 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2128,8 +2128,10 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg) return; } - pitch_val = (obj_priv->stride / 128) - 1; - WARN_ON(pitch_val & ~0x0000000f); + pitch_val = obj_priv->stride / 128; + pitch_val = ffs(pitch_val) - 1; + WARN_ON(pitch_val > I830_FENCE_MAX_PITCH_VAL); + val = obj_priv->gtt_offset; if (obj_priv->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index 52a059354e8..540dd336e6e 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -213,7 +213,8 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode) if (tiling_mode == I915_TILING_NONE) return true; - if (tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) + if (!IS_I9XX(dev) || + (tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))) tile_width = 128; else tile_width = 512; @@ -225,11 +226,18 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode) if (stride / 128 > I965_FENCE_MAX_PITCH_VAL) return false; } else if (IS_I9XX(dev)) { - if (stride / tile_width > I830_FENCE_MAX_PITCH_VAL || + uint32_t pitch_val = ffs(stride / tile_width) - 1; + + /* XXX: For Y tiling, FENCE_MAX_PITCH_VAL is actually 6 (8KB) + * instead of 4 (2KB) on 945s. 
+ */ + if (pitch_val > I915_FENCE_MAX_PITCH_VAL || size > (I830_FENCE_MAX_SIZE_VAL << 20)) return false; } else { - if (stride / 128 > I830_FENCE_MAX_PITCH_VAL || + uint32_t pitch_val = ffs(stride / tile_width) - 1; + + if (pitch_val > I830_FENCE_MAX_PITCH_VAL || size > (I830_FENCE_MAX_SIZE_VAL << 19)) return false; } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 9668cc0d7f4..375569d01d0 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -190,7 +190,8 @@ #define I830_FENCE_SIZE_BITS(size) ((ffs((size) >> 19) - 1) << 8) #define I830_FENCE_PITCH_SHIFT 4 #define I830_FENCE_REG_VALID (1<<0) -#define I830_FENCE_MAX_PITCH_VAL 0x10 +#define I915_FENCE_MAX_PITCH_VAL 0x10 +#define I830_FENCE_MAX_PITCH_VAL 6 #define I830_FENCE_MAX_SIZE_VAL (1<<8) #define I915_FENCE_START_MASK 0x0ff00000 From cfa16a0de5392c54db553ec2233a7110e4b4da7a Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 26 May 2009 18:46:16 -0700 Subject: [PATCH 606/900] drm/i915: Apply a big hammer to 865 GEM object CPU cache flushing. On the 865, but not the 855, the clflush we do appears to not actually make it out to the hardware all the time. An easy way to safely reproduce was X -retro, which would show that some of the blits involved in drawing the lovely root weave didn't make it out to the hardware. Those blits are 32 bytes each, and 1-2 would be missing at various points around the screen. Other experimentation (doing more clflush, doing more AGP chipset flush, poking at some more device registers to maybe trigger more flushing) didn't help. krh came up with the wbinvd as a way to successfully get all those blits to appear. 
Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/i915_gem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index e4408daf8ce..e2421869a40 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2423,6 +2423,16 @@ i915_gem_clflush_object(struct drm_gem_object *obj) if (obj_priv->pages == NULL) return; + /* XXX: The 865 in particular appears to be weird in how it handles + * cache flushing. We haven't figured it out, but the + * clflush+agp_chipset_flush doesn't appear to successfully get the + * data visible to the GPU, while wbinvd + agp_chipset_flush does. + */ + if (IS_I865G(obj->dev)) { + wbinvd(); + return; + } + drm_clflush_pages(obj_priv->pages, obj->size / PAGE_SIZE); } From 84532a0fc3d5811dca8e3726fe4d372ea87bd7c6 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 27 May 2009 13:33:14 +1000 Subject: [PATCH 607/900] Revert "powerpc: Rework dma-noncoherent to use generic vmalloc layer" This reverts commit 33f00dcedb0e22cdb156a23632814fc580fcfcf8. While it was a good idea to try to use the mm/vmalloc.c allocator instead of our own (in fact, ours is itself a dup of an old variant of the vmalloc one), unfortunately, the approach is terminally busted since dma_alloc_coherent() can be called at interrupt time or in atomic contexts and there's little chance we'll make the code in mm/vmalloc.c cope with that :-( Until we can get the generic code to forbid that idiocy and fix all drivers abusing it, we pretty much have no choice but revert to our custom virtual space allocator. There's also a problem with SMP safety since freeing such mapping would require an IPI which cannot be done at interrupt time. However, right now, I don't think we support any platform that is both SMP and has non-coherent DMA (don't laugh, I know such things do exist !) so we can sort that out later.
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 25 +++ arch/powerpc/lib/dma-noncoherent.c | 299 +++++++++++++++++++++++------ 2 files changed, 269 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a0d1146a057..3bb43adce44 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -868,6 +868,31 @@ config TASK_SIZE default "0x80000000" if PPC_PREP || PPC_8xx default "0xc0000000" +config CONSISTENT_START_BOOL + bool "Set custom consistent memory pool address" + depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE + help + This option allows you to set the base virtual address + of the consistent memory pool. This pool of virtual + memory is used to make consistent memory allocations. + +config CONSISTENT_START + hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL + default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx) + default "0xff100000" if NOT_COHERENT_CACHE + +config CONSISTENT_SIZE_BOOL + bool "Set custom consistent memory pool size" + depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE + help + This option allows you to set the size of the + consistent memory pool. This pool of virtual memory + is used to make consistent memory allocations. + +config CONSISTENT_SIZE + hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL + default "0x00200000" if NOT_COHERENT_CACHE + config PIN_TLB bool "Pinned Kernel TLBs (860 ONLY)" depends on ADVANCED_OPTIONS && 8xx diff --git a/arch/powerpc/lib/dma-noncoherent.c b/arch/powerpc/lib/dma-noncoherent.c index 005a28d380a..b7dc4c19f58 100644 --- a/arch/powerpc/lib/dma-noncoherent.c +++ b/arch/powerpc/lib/dma-noncoherent.c @@ -29,10 +29,120 @@ #include #include #include -#include #include +/* + * This address range defaults to a value that is safe for all + * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It + * can be further configured for specific applications under + * the "Advanced Setup" menu. 
-Matt + */ +#define CONSISTENT_BASE (CONFIG_CONSISTENT_START) +#define CONSISTENT_END (CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE) +#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) + +/* + * This is the page table (2MB) covering uncached, DMA consistent allocations + */ +static pte_t *consistent_pte; +static DEFINE_SPINLOCK(consistent_lock); + +/* + * VM region handling support. + * + * This should become something generic, handling VM region allocations for + * vmalloc and similar (ioremap, module space, etc). + * + * I envisage vmalloc()'s supporting vm_struct becoming: + * + * struct vm_struct { + * struct vm_region region; + * unsigned long flags; + * struct page **pages; + * unsigned int nr_pages; + * unsigned long phys_addr; + * }; + * + * get_vm_area() would then call vm_region_alloc with an appropriate + * struct vm_region head (eg): + * + * struct vm_region vmalloc_head = { + * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), + * .vm_start = VMALLOC_START, + * .vm_end = VMALLOC_END, + * }; + * + * However, vmalloc_head.vm_start is variable (typically, it is dependent on + * the amount of RAM found at boot time.) I would imagine that get_vm_area() + * would have to initialise this each time prior to calling vm_region_alloc(). 
+ */ +struct ppc_vm_region { + struct list_head vm_list; + unsigned long vm_start; + unsigned long vm_end; +}; + +static struct ppc_vm_region consistent_head = { + .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), + .vm_start = CONSISTENT_BASE, + .vm_end = CONSISTENT_END, +}; + +static struct ppc_vm_region * +ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) +{ + unsigned long addr = head->vm_start, end = head->vm_end - size; + unsigned long flags; + struct ppc_vm_region *c, *new; + + new = kmalloc(sizeof(struct ppc_vm_region), gfp); + if (!new) + goto out; + + spin_lock_irqsave(&consistent_lock, flags); + + list_for_each_entry(c, &head->vm_list, vm_list) { + if ((addr + size) < addr) + goto nospc; + if ((addr + size) <= c->vm_start) + goto found; + addr = c->vm_end; + if (addr > end) + goto nospc; + } + + found: + /* + * Insert this entry _before_ the one we found. + */ + list_add_tail(&new->vm_list, &c->vm_list); + new->vm_start = addr; + new->vm_end = addr + size; + + spin_unlock_irqrestore(&consistent_lock, flags); + return new; + + nospc: + spin_unlock_irqrestore(&consistent_lock, flags); + kfree(new); + out: + return NULL; +} + +static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) +{ + struct ppc_vm_region *c; + + list_for_each_entry(c, &head->vm_list, vm_list) { + if (c->vm_start == addr) + goto out; + } + c = NULL; + out: + return c; +} + /* * Allocate DMA-coherent memory space and return both the kernel remapped * virtual and bus address for that space. 
@@ -41,21 +151,21 @@ void * __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) { struct page *page; + struct ppc_vm_region *c; unsigned long order; - int i; - unsigned int nr_pages = PAGE_ALIGN(size)>>PAGE_SHIFT; - unsigned int array_size = nr_pages * sizeof(struct page *); - struct page **pages; - struct page *end; u64 mask = 0x00ffffff, limit; /* ISA default */ - struct vm_struct *area; - BUG_ON(!mem_init_done); + if (!consistent_pte) { + printk(KERN_ERR "%s: not initialised\n", __func__); + dump_stack(); + return NULL; + } + size = PAGE_ALIGN(size); limit = (mask + 1) & ~mask; - if (limit && size >= limit) { - printk(KERN_WARNING "coherent allocation too big (requested " - "%#x mask %#Lx)\n", size, mask); + if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) { + printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", + size, mask); return NULL; } @@ -68,8 +178,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) if (!page) goto no_page; - end = page + (1 << order); - /* * Invalidate any data that might be lurking in the * kernel direct-mapped region for device DMA. @@ -80,59 +188,48 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) flush_dcache_range(kaddr, kaddr + size); } - split_page(page, order); - /* - * Set the "dma handle" + * Allocate a virtual address in the consistent mapping region. 
*/ - *handle = page_to_phys(page); + c = ppc_vm_region_alloc(&consistent_head, size, + gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); + if (c) { + unsigned long vaddr = c->vm_start; + pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); + struct page *end = page + (1 << order); - area = get_vm_area_caller(size, VM_IOREMAP, - __builtin_return_address(1)); - if (!area) - goto out_free_pages; + split_page(page, order); - if (array_size > PAGE_SIZE) { - pages = vmalloc(array_size); - area->flags |= VM_VPAGES; - } else { - pages = kmalloc(array_size, GFP_KERNEL); - } - if (!pages) - goto out_free_area; + /* + * Set the "dma handle" + */ + *handle = page_to_phys(page); - area->pages = pages; - area->nr_pages = nr_pages; + do { + BUG_ON(!pte_none(*pte)); - for (i = 0; i < nr_pages; i++) - pages[i] = page + i; + SetPageReserved(page); + set_pte_at(&init_mm, vaddr, + pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); + page++; + pte++; + vaddr += PAGE_SIZE; + } while (size -= PAGE_SIZE); - if (map_vm_area(area, pgprot_noncached(PAGE_KERNEL), &pages)) - goto out_unmap; + /* + * Free the otherwise unused pages. + */ + while (page < end) { + __free_page(page); + page++; + } - /* - * Free the otherwise unused pages. 
- */ - page += nr_pages; - while (page < end) { - __free_page(page); - page++; + return (void *)c->vm_start; } - return area->addr; -out_unmap: - vunmap(area->addr); - if (array_size > PAGE_SIZE) - vfree(pages); - else - kfree(pages); - goto out_free_pages; -out_free_area: - free_vm_area(area); -out_free_pages: if (page) __free_pages(page, order); -no_page: + no_page: return NULL; } EXPORT_SYMBOL(__dma_alloc_coherent); @@ -142,11 +239,103 @@ EXPORT_SYMBOL(__dma_alloc_coherent); */ void __dma_free_coherent(size_t size, void *vaddr) { - vfree(vaddr); + struct ppc_vm_region *c; + unsigned long flags, addr; + pte_t *ptep; + size = PAGE_ALIGN(size); + + spin_lock_irqsave(&consistent_lock, flags); + + c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); + if (!c) + goto no_area; + + if ((c->vm_end - c->vm_start) != size) { + printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", + __func__, c->vm_end - c->vm_start, size); + dump_stack(); + size = c->vm_end - c->vm_start; + } + + ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start); + addr = c->vm_start; + do { + pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); + unsigned long pfn; + + ptep++; + addr += PAGE_SIZE; + + if (!pte_none(pte) && pte_present(pte)) { + pfn = pte_pfn(pte); + + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + ClearPageReserved(page); + + __free_page(page); + continue; + } + } + + printk(KERN_CRIT "%s: bad page in kernel page table\n", + __func__); + } while (size -= PAGE_SIZE); + + flush_tlb_kernel_range(c->vm_start, c->vm_end); + + list_del(&c->vm_list); + + spin_unlock_irqrestore(&consistent_lock, flags); + + kfree(c); + return; + + no_area: + spin_unlock_irqrestore(&consistent_lock, flags); + printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", + __func__, vaddr); + dump_stack(); } EXPORT_SYMBOL(__dma_free_coherent); +/* + * Initialise the consistent memory allocation. 
+ */ +static int __init dma_alloc_init(void) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret = 0; + + do { + pgd = pgd_offset(&init_mm, CONSISTENT_BASE); + pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE); + pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE); + if (!pmd) { + printk(KERN_ERR "%s: no pmd tables\n", __func__); + ret = -ENOMEM; + break; + } + + pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); + if (!pte) { + printk(KERN_ERR "%s: no pte tables\n", __func__); + ret = -ENOMEM; + break; + } + + consistent_pte = pte; + } while (0); + + return ret; +} + +core_initcall(dma_alloc_init); + /* * make an area consistent. */ From 87ad57bacb25c3f24c54f142ef445f68277705f0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 19 May 2009 16:09:42 +0800 Subject: [PATCH 608/900] cpuidle: makes AMD C1E work in acpi_idle When AMD C1E is enabled, local APIC timer will stop even in C1. This patch uses broadcast IPI to replace local APIC timer in C1. http://bugzilla.kernel.org/show_bug.cgi?id=13233 [ impact: avoid boot hang in AMD CPU with C1E enabled ] Tested-by: Dmitry Lyzhyn Signed-off-by: Shaohua Li Signed-off-by: Len Brown --- drivers/acpi/processor_idle.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 72069ba5f1e..6b7bcc7e3e1 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -148,6 +148,9 @@ static void acpi_timer_check_state(int state, struct acpi_processor *pr, if (cpu_has(&cpu_data(pr->id), X86_FEATURE_ARAT)) return; + if (boot_cpu_has(X86_FEATURE_AMDC1E)) + type = ACPI_STATE_C1; + /* * Check, if one of the previous states already marked the lapic * unstable @@ -611,6 +614,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) switch (cx->type) { case ACPI_STATE_C1: cx->valid = 1; + acpi_timer_check_state(i, pr, cx); break; case ACPI_STATE_C2: @@ -835,6 +839,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, return 0; 
} + acpi_state_timer_broadcast(pr, cx, 1); kt1 = ktime_get_real(); acpi_idle_do_entry(cx); kt2 = ktime_get_real(); @@ -842,6 +847,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, local_irq_enable(); cx->usage++; + acpi_state_timer_broadcast(pr, cx, 0); return idle_time; } From 7d60e8ab0d5507229dfbdf456501cc378610fa01 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 19 May 2009 16:09:54 +0800 Subject: [PATCH 609/900] cpuidle: fix AMD C1E suspend hang When AMD C1E is enabled, local APIC timer will stop even in C1. To avoid suspend/resume hang, this patch removes C1 and replaces it with a cpu_relax() in the suspend/resume path. This has no impact on the runtime path. http://bugzilla.kernel.org/show_bug.cgi?id=13233 [ impact: avoid suspend/resume hang in AMD CPU with C1E enabled ] Tested-by: Dmitry Lyzhyn Signed-off-by: Shaohua Li Signed-off-by: Len Brown --- drivers/acpi/processor_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 6b7bcc7e3e1..10a2d913635 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -834,8 +834,8 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, /* Do not access any ACPI IO ports in suspend path */ if (acpi_idle_suspend) { - acpi_safe_halt(); local_irq_enable(); + cpu_relax(); return 0; } From e65fcfd63a9a62baa5708484ff8edbe56eb3e7ec Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Tue, 26 May 2009 20:47:02 -0700 Subject: [PATCH 610/900] cls_cgroup: read classid atomically in classifier Avoid reading the unsynchronized value cs->classid multiple times, since it could change concurrently from non-zero to zero; this would result in the classifier returning a positive result with a bogus (zero) classid. Signed-off-by: Paul Menage Reviewed-by: Li Zefan Signed-off-by: David S. 
Miller --- net/sched/cls_cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 91a3db4a76f..cc29b44b150 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -104,8 +104,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { struct cls_cgroup_head *head = tp->root; - struct cgroup_cls_state *cs; - int ret = 0; + u32 classid; /* * Due to the nature of the classifier it is required to ignore all @@ -121,17 +120,18 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp, return -1; rcu_read_lock(); - cs = task_cls_state(current); - if (cs->classid && tcf_em_tree_match(skb, &head->ematches, NULL)) { - res->classid = cs->classid; - res->class = 0; - ret = tcf_exts_exec(skb, &head->exts, res); - } else - ret = -1; - + classid = task_cls_state(current)->classid; rcu_read_unlock(); - return ret; + if (!classid) + return -1; + + if (!tcf_em_tree_match(skb, &head->ematches, NULL)) + return -1; + + res->classid = classid; + res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); } static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle) From 18a36c1a398055e87a3646738abf3c1b0e98e7e2 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 26 May 2009 20:48:04 -0700 Subject: [PATCH 611/900] gianfar: fix babbling rx error event bug Gianfar interrupt handler uses IEVENT_ERR_MASK to check and handle errors. Babbling RX error (IEVENT_BABR) should be included in IEVENT_ERR_MASK. Otherwise if BABR is raised, it never gets handled nor cleared, and an interrupt storm results. This has been observed to happen on sending a burst of ethernet frames to a gianfar based board. Signed-off-by: Xiaotian Feng Signed-off-by: David S. 
Miller --- drivers/net/gianfar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h index 0642d52aef5..cf352961ae9 100644 --- a/drivers/net/gianfar.h +++ b/drivers/net/gianfar.h @@ -259,7 +259,7 @@ extern const char gfar_driver_version[]; (IEVENT_RXC | IEVENT_BSY | IEVENT_EBERR | IEVENT_MSRO | \ IEVENT_BABT | IEVENT_TXC | IEVENT_TXE | IEVENT_LC \ | IEVENT_CRL | IEVENT_XFUN | IEVENT_DPE | IEVENT_PERR \ - | IEVENT_MAG) + | IEVENT_MAG | IEVENT_BABR) #define IMASK_INIT_CLEAR 0x00000000 #define IMASK_BABR 0x80000000 From 5ad18900355743757e7f415bcb850ea8406a13e2 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Tue, 26 May 2009 20:50:12 -0700 Subject: [PATCH 612/900] atlx: move modinfo data from atlx.h to atl1.c Both atl1.c and atl2.c include atlx.h, which defines some modinfo stuff. But atl2.c seems like it doesn't want the modinfo data from atlx.h, as it defines its own. Running modinfo on atl2.ko, we get conflicting information: $ /sbin/modinfo drivers/net/atlx/atl2.ko | egrep "version|description|author" version: 2.2.3 description: Atheros Fast Ethernet Network Driver author: Atheros Corporation , Chris Snook version: 2.1.3 author: Xiong Huang , Chris Snook , Jay Cliburn Move the modinfo data out of atlx.h and into atl1.c to eliminate the confusion: $ /sbin/modinfo drivers/net/atlx/atl1.ko | egrep "version|description|author" version: 2.1.3 author: Xiong Huang , Chris Snook , Jay Cliburn description: Atheros L1 Gigabit Ethernet Driver $ /sbin/modinfo drivers/net/atlx/atl2.ko | egrep "version|description|author" version: 2.2.3 description: Atheros Fast Ethernet Network Driver author: Atheros Corporation , Chris Snook Reported-by: Scott Scriven Signed-off-by: Alex Chiang Acked-by: Jay Cliburn Signed-off-by: David S. 
Miller --- drivers/net/atlx/atl1.c | 6 ++++++ drivers/net/atlx/atlx.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c index 0ab22540bf5..4e817126e28 100644 --- a/drivers/net/atlx/atl1.c +++ b/drivers/net/atlx/atl1.c @@ -82,6 +82,12 @@ #include "atl1.h" +#define ATLX_DRIVER_VERSION "2.1.3" +MODULE_AUTHOR("Xiong Huang , \ + Chris Snook , Jay Cliburn "); +MODULE_LICENSE("GPL"); +MODULE_VERSION(ATLX_DRIVER_VERSION); + /* Temporary hack for merging atl1 and atl2 */ #include "atlx.c" diff --git a/drivers/net/atlx/atlx.h b/drivers/net/atlx/atlx.h index 297a03da6b7..14054b75aa6 100644 --- a/drivers/net/atlx/atlx.h +++ b/drivers/net/atlx/atlx.h @@ -29,12 +29,6 @@ #include #include -#define ATLX_DRIVER_VERSION "2.1.3" -MODULE_AUTHOR("Xiong Huang , \ - Chris Snook , Jay Cliburn "); -MODULE_LICENSE("GPL"); -MODULE_VERSION(ATLX_DRIVER_VERSION); - #define ATLX_ERR_PHY 2 #define ATLX_ERR_PHY_SPEED 7 #define ATLX_ERR_PHY_RES 8 From b63dc8fef7ca5c51d163295d824e78c770d48ccf Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 May 2009 20:55:33 -0700 Subject: [PATCH 613/900] bfin_mac: fix build error due to net_device_ops convert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit "convert to net_device_ops" broke the Blackfin MAC driver as it declared the new structure before the function it used: CC drivers/net/bfin_mac.o drivers/net/bfin_mac.c:984: error: ‘bfin_mac_close’ undeclared here (not in a function) make[1]: *** [drivers/net/bfin_mac.o] Error 1 Signed-off-by: Mike Frysinger Signed-off-by: David S. 
Miller --- drivers/net/bfin_mac.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c index 9f971ed6b58..b4da1821332 100644 --- a/drivers/net/bfin_mac.c +++ b/drivers/net/bfin_mac.c @@ -979,22 +979,7 @@ static int bfin_mac_open(struct net_device *dev) return 0; } -static const struct net_device_ops bfin_mac_netdev_ops = { - .ndo_open = bfin_mac_open, - .ndo_stop = bfin_mac_close, - .ndo_start_xmit = bfin_mac_hard_start_xmit, - .ndo_set_mac_address = bfin_mac_set_mac_address, - .ndo_tx_timeout = bfin_mac_timeout, - .ndo_set_multicast_list = bfin_mac_set_multicast_list, - .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = bfin_mac_poll, -#endif -}; - /* - * * this makes the board clean up everything that it can * and not talk to the outside world. Caused by * an 'ifconfig ethX down' @@ -1019,6 +1004,20 @@ static int bfin_mac_close(struct net_device *dev) return 0; } +static const struct net_device_ops bfin_mac_netdev_ops = { + .ndo_open = bfin_mac_open, + .ndo_stop = bfin_mac_close, + .ndo_start_xmit = bfin_mac_hard_start_xmit, + .ndo_set_mac_address = bfin_mac_set_mac_address, + .ndo_tx_timeout = bfin_mac_timeout, + .ndo_set_multicast_list = bfin_mac_set_multicast_list, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = eth_change_mtu, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = bfin_mac_poll, +#endif +}; + static int __devinit bfin_mac_probe(struct platform_device *pdev) { struct net_device *ndev; From 7a1450fdf4c69961f3926352fd8bc4ea19676756 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 May 2009 04:55:38 -0400 Subject: [PATCH 614/900] Blackfin: hook up preadv/pwritev syscalls Signed-off-by: Mike Frysinger --- arch/blackfin/include/asm/unistd.h | 4 +++- arch/blackfin/mach-common/entry.S | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff 
--git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index 1e57b636e0b..cf5066d3efd 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -378,8 +378,10 @@ #define __NR_dup3 363 #define __NR_pipe2 364 #define __NR_inotify_init1 365 +#define __NR_preadv 366 +#define __NR_pwritev 367 -#define __NR_syscall 366 +#define __NR_syscall 368 #define NR_syscalls __NR_syscall /* Old optional stuff no one actually uses */ diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 21e65a339a2..a063a434f7e 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1581,6 +1581,8 @@ ENTRY(_sys_call_table) .long _sys_dup3 .long _sys_pipe2 .long _sys_inotify_init1 /* 365 */ + .long _sys_preadv + .long _sys_pwritev .rept NR_syscalls-(.-_sys_call_table)/4 .long _sys_ni_syscall From 6c83429a1c32c914dfb81939cc2ddece97e48294 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sun, 24 May 2009 02:13:15 -0400 Subject: [PATCH 615/900] MAINTAINERS: update Blackfin items With Bryan Wu having moved on to another job, push the slack onto some other ADI lackeys. 
Signed-off-by: Mike Frysinger --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 77cbfb1a696..7dd34750264 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1132,16 +1132,16 @@ F: fs/bfs/ F: include/linux/bfs_fs.h BLACKFIN ARCHITECTURE -P: Bryan Wu -M: cooloney@kernel.org +P: Mike Frysinger +M: vapier@gentoo.org L: uclinux-dist-devel@blackfin.uclinux.org W: http://blackfin.uclinux.org S: Supported F: arch/blackfin/ BLACKFIN EMAC DRIVER -P: Bryan Wu -M: cooloney@kernel.org +P: Michael Hennerich +M: michael.hennerich@analog.com L: uclinux-dist-devel@blackfin.uclinux.org (subscribers-only) W: http://blackfin.uclinux.org S: Supported From 49afa60948f859e71d68a74c1af6ccd7b5b94d82 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 18 May 2009 04:33:07 -0400 Subject: [PATCH 616/900] MAINTAINERS: drop (subscribers-only) markings on Blackfin lists All of the Blackfin lists are transparently moderated for non-subscribers. i.e. there are no annoying notices and people get whitelisted after first their posting. 
Signed-off-by: Mike Frysinger --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 77cbfb1a696..7dd34750264 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1132,16 +1132,16 @@ F: fs/bfs/ F: include/linux/bfs_fs.h BLACKFIN ARCHITECTURE -P: Bryan Wu -M: cooloney@kernel.org +P: Mike Frysinger +M: vapier@gentoo.org L: uclinux-dist-devel@blackfin.uclinux.org W: http://blackfin.uclinux.org S: Supported F: arch/blackfin/ BLACKFIN EMAC DRIVER -P: Bryan Wu -M: cooloney@kernel.org +P: Michael Hennerich +M: michael.hennerich@analog.com L: uclinux-dist-devel@blackfin.uclinux.org (subscribers-only) W: http://blackfin.uclinux.org S: Supported From 49afa60948f859e71d68a74c1af6ccd7b5b94d82 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 18 May 2009 04:33:07 -0400 Subject: [PATCH 616/900] MAINTAINERS: drop (subscribers-only) markings on Blackfin lists All of the Blackfin lists are transparently moderated for non-subscribers. i.e. there are no annoying notices and people get whitelisted after their first posting. 
create mode 100644 arch/blackfin/kernel/.gitignore diff --git a/arch/blackfin/kernel/.gitignore b/arch/blackfin/kernel/.gitignore new file mode 100644 index 00000000000..c5f676c3c22 --- /dev/null +++ b/arch/blackfin/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds From 2ec10ea91bf3688013b00638f29df4f8f6b5c18b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 20 May 2009 19:45:39 -0400 Subject: [PATCH 618/900] Blackfin: drop unneeded asm/.gitignore We don't create a include/asm/mach/ symlink anymore, so we don't need the .gitignore for it. Signed-off-by: Mike Frysinger --- arch/blackfin/include/asm/.gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 arch/blackfin/include/asm/.gitignore diff --git a/arch/blackfin/include/asm/.gitignore b/arch/blackfin/include/asm/.gitignore deleted file mode 100644 index 7858564a446..00000000000 --- a/arch/blackfin/include/asm/.gitignore +++ /dev/null @@ -1 +0,0 @@ -+mach From add8a5050a52f1bd1be6b97be86fdd1cfbea2d1d Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 May 2009 05:03:52 -0400 Subject: [PATCH 619/900] Blackfin: fix strncmp.o build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix some more fallout of the string changes: CC arch/blackfin/lib/strncmp.o In file included from include/linux/bitmap.h:9, from include/linux/nodemask.h:90, from include/linux/mmzone.h:17, from include/linux/gfp.h:5, from include/linux/kmod.h:23, from include/linux/module.h:14, from arch/blackfin/lib/strncmp.c:14: include/linux/string.h: In function ‘strstarts’: include/linux/string.h:132: error: implicit declaration of function ‘strncmp’ make[1]: *** [arch/blackfin/lib/strncmp.o] Error 1 Signed-off-by: Mike Frysinger CC: Rusty Russell --- arch/blackfin/lib/strncmp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/blackfin/lib/strncmp.c b/arch/blackfin/lib/strncmp.c index 2aaae78a68e..46518b1d298 100644 --- a/arch/blackfin/lib/strncmp.c +++ 
b/arch/blackfin/lib/strncmp.c @@ -8,9 +8,8 @@ #define strncmp __inline_strncmp #include -#undef strncmp - #include +#undef strncmp int strncmp(const char *cs, const char *ct, size_t count) { From b16e7766d6436835f473ba823ad04fbdfe5e9cbd Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 27 May 2009 13:36:10 +1000 Subject: [PATCH 620/900] powerpc: Move dma-noncoherent.c from arch/powerpc/lib to arch/powerpc/mm (pre-requisite to make the next patches more palatable) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/lib/Makefile | 1 - arch/powerpc/mm/Makefile | 1 + arch/powerpc/{lib => mm}/dma-noncoherent.c | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename arch/powerpc/{lib => mm}/dma-noncoherent.c (100%) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 8db35278a4b..29b742b90f1 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ memcpy_64.o usercopy_64.o mem_64.o string.o obj-$(CONFIG_XMON) += sstep.o obj-$(CONFIG_KPROBES) += sstep.o -obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o ifeq ($(CONFIG_PPC64),y) obj-$(CONFIG_SMP) += locks.o diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 17290bcedc5..b746f4ca420 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o +obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/lib/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c similarity index 100% rename from arch/powerpc/lib/dma-noncoherent.c rename to arch/powerpc/mm/dma-noncoherent.c From f637a49e507c88354ab32b5d914e06acfb7ee00d Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 27 May 2009 13:44:50 +1000 Subject: [PATCH 621/900] powerpc: Minor cleanups of 
kernel virt address space definitions Make FIXADDR_TOP a compile time constant and clean up a couple of definitions relative to the layout of the kernel address space on ppc32. We also print out that layout at boot time for debugging purposes. This is a pre-requisite for properly fixing non-coherent DMA allocations. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/fixmap.h | 4 ++-- arch/powerpc/include/asm/pgtable-ppc32.h | 22 ++++++++++++++++++++-- arch/powerpc/mm/init_32.c | 8 ++------ arch/powerpc/mm/mem.c | 13 +++++++++++++ arch/powerpc/mm/pgtable_32.c | 2 -- 5 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index d60fd18f428..f1f4e23a84e 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -14,8 +14,6 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H -extern unsigned long FIXADDR_TOP; - #ifndef __ASSEMBLY__ #include #include @@ -24,6 +22,8 @@ extern unsigned long FIXADDR_TOP; #include #endif +#define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) + /* * Here we define all the compile-time 'special' virtual * addresses. 
The point is to have a constant address at diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index ba45c997830..28fe9d4bae3 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -10,7 +10,7 @@ extern unsigned long va_to_phys(unsigned long address); extern pte_t *va_to_pte(unsigned long address); -extern unsigned long ioremap_bot, ioremap_base; +extern unsigned long ioremap_bot; #ifdef CONFIG_44x extern int icache_44x_need_flush; @@ -55,9 +55,27 @@ extern int icache_44x_need_flush; #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) +/* + * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary + * value (for now) on others, from where we can start layout kernel + * virtual space that goes below PKMAP and FIXMAP + */ +#ifdef CONFIG_HIGHMEM +#define KVIRT_TOP PKMAP_BASE +#else +#define KVIRT_TOP (0xfe000000UL) /* for now, could be FIXMAP_BASE ? */ +#endif + +/* + * ioremap_bot starts at that address. Early ioremaps move down from there, + * until mem_init() at which point this becomes the top of the vmalloc + * and ioremap space + */ +#define IOREMAP_TOP KVIRT_TOP + /* * Just any arbitrary offset to the start of the vmalloc VM area: the - * current 64MB value just means that there will be a 64MB "hole" after the + * current 16MB value just means that there will be a 64MB "hole" after the * physical memory until the kernel virtual memory starts. That means that * any out-of-bounds memory accesses will hopefully be caught. 
* The vmalloc() routines leaves a hole of 4kB between each vmalloced diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 666a5e8a5be..3de6a0d9382 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -168,12 +168,8 @@ void __init MMU_init(void) ppc_md.progress("MMU:mapin", 0x301); mapin_ram(); -#ifdef CONFIG_HIGHMEM - ioremap_base = PKMAP_BASE; -#else - ioremap_base = 0xfe000000UL; /* for now, could be 0xfffff000 */ -#endif /* CONFIG_HIGHMEM */ - ioremap_bot = ioremap_base; + /* Initialize early top-down ioremap allocator */ + ioremap_bot = IOREMAP_TOP; /* Map in I/O resources */ if (ppc_md.progress) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d0602a76bf7..d3a4e67561f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -380,6 +380,19 @@ void __init mem_init(void) bsssize >> 10, initsize >> 10); +#ifdef CONFIG_PPC32 + pr_info("Kernel virtual memory layout:\n"); + pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); +#ifdef CONFIG_HIGHMEM + pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", + PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); +#endif /* CONFIG_HIGHMEM */ + pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", + ioremap_bot, IOREMAP_TOP); + pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", + VMALLOC_START, VMALLOC_END); +#endif /* CONFIG_PPC32 */ + mem_init_done = 1; } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 430d0908fa5..5422169626b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -399,8 +399,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable) #endif /* CONFIG_DEBUG_PAGEALLOC */ static int fixmaps; -unsigned long FIXADDR_TOP = (-PAGE_SIZE); -EXPORT_SYMBOL(FIXADDR_TOP); void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { From 8b31e49d1d75729c1da9009664ba52abd1adc628 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 27 May 2009 13:50:33 +1000 
Subject: [PATCH 622/900] powerpc: Fix up dma_alloc_coherent() on platforms without cache coherency. The implementation we just revived has issues, such as using a Kconfig-defined virtual address area in kernel space that nothing actually carves out (and thus will overlap whatever is there), or having some dependencies on being self contained in a single PTE page which adds unnecessary constraints on the kernel virtual address space. This fixes it by using more classic PTE accessors and automatically locating the area for consistent memory, carving an appropriate hole in the kernel virtual address space, leaving only the size of that area as a Kconfig option. It also brings some dma-mask related fixes from the ARM implementation which was almost identical initially but grew its own fixes. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 13 --- arch/powerpc/include/asm/dma-mapping.h | 6 +- arch/powerpc/include/asm/pgtable-ppc32.h | 4 + arch/powerpc/kernel/dma.c | 2 +- arch/powerpc/mm/dma-noncoherent.c | 108 +++++++++-------------- arch/powerpc/mm/mem.c | 4 + 6 files changed, 54 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3bb43adce44..cdc9a6ff4be 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -868,19 +868,6 @@ config TASK_SIZE default "0x80000000" if PPC_PREP || PPC_8xx default "0xc0000000" -config CONSISTENT_START_BOOL - bool "Set custom consistent memory pool address" - depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE - help - This option allows you to set the base virtual address - of the consistent memory pool. This pool of virtual - memory is used to make consistent memory allocations. 
- -config CONSISTENT_START - hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL - default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx) - default "0xff100000" if NOT_COHERENT_CACHE - config CONSISTENT_SIZE_BOOL bool "Set custom consistent memory pool size" depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index c69f2b5f0cc..cb448d68452 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -26,7 +26,9 @@ * allocate the space "normally" and use the cache management functions * to ensure it is consistent. */ -extern void *__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp); +struct device; +extern void *__dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t gfp); extern void __dma_free_coherent(size_t size, void *vaddr); extern void __dma_sync(void *vaddr, size_t size, int direction); extern void __dma_sync_page(struct page *page, unsigned long offset, @@ -37,7 +39,7 @@ extern void __dma_sync_page(struct page *page, unsigned long offset, * Cache coherent cores. 
*/ -#define __dma_alloc_coherent(gfp, size, handle) NULL +#define __dma_alloc_coherent(dev, gfp, size, handle) NULL #define __dma_free_coherent(size, addr) ((void)0) #define __dma_sync(addr, size, rw) ((void)0) #define __dma_sync_page(pg, off, sz, rw) ((void)0) diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index 28fe9d4bae3..c9ff9d75990 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -71,7 +71,11 @@ extern int icache_44x_need_flush; * until mem_init() at which point this becomes the top of the vmalloc * and ioremap space */ +#ifdef CONFIG_NOT_COHERENT_CACHE +#define IOREMAP_TOP ((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK) +#else #define IOREMAP_TOP KVIRT_TOP +#endif /* * Just any arbitrary offset to the start of the vmalloc VM area: the diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 53c7788cba7..6b02793dc75 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -32,7 +32,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size, { void *ret; #ifdef CONFIG_NOT_COHERENT_CACHE - ret = __dma_alloc_coherent(size, dma_handle, flag); + ret = __dma_alloc_coherent(dev, size, dma_handle, flag); if (ret == NULL) return NULL; *dma_handle += get_dma_direct_offset(dev); diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index b7dc4c19f58..36692f5c9a7 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -32,20 +32,21 @@ #include +#include "mmu_decl.h" + /* * This address range defaults to a value that is safe for all * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It * can be further configured for specific applications under * the "Advanced Setup" menu. 
-Matt */ -#define CONSISTENT_BASE (CONFIG_CONSISTENT_START) -#define CONSISTENT_END (CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE) +#define CONSISTENT_BASE (IOREMAP_TOP) +#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) #define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) /* * This is the page table (2MB) covering uncached, DMA consistent allocations */ -static pte_t *consistent_pte; static DEFINE_SPINLOCK(consistent_lock); /* @@ -148,22 +149,38 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi * virtual and bus address for that space. */ void * -__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) +__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) { struct page *page; struct ppc_vm_region *c; unsigned long order; - u64 mask = 0x00ffffff, limit; /* ISA default */ + u64 mask = ISA_DMA_THRESHOLD, limit; - if (!consistent_pte) { - printk(KERN_ERR "%s: not initialised\n", __func__); - dump_stack(); - return NULL; + if (dev) { + mask = dev->coherent_dma_mask; + + /* + * Sanity check the DMA mask - it must be non-zero, and + * must be able to be satisfied by a DMA allocation. 
+ */ + if (mask == 0) { + dev_warn(dev, "coherent DMA mask is unset\n"); + goto no_page; + } + + if ((~mask) & ISA_DMA_THRESHOLD) { + dev_warn(dev, "coherent DMA mask %#llx is smaller " + "than system GFP_DMA mask %#llx\n", + mask, (unsigned long long)ISA_DMA_THRESHOLD); + goto no_page; + } } + size = PAGE_ALIGN(size); limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) { + if ((limit && size >= limit) || + size >= (CONSISTENT_END - CONSISTENT_BASE)) { printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", size, mask); return NULL; @@ -171,6 +188,7 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) order = get_order(size); + /* Might be useful if we ever have a real legacy DMA zone... */ if (mask != 0xffffffff) gfp |= GFP_DMA; @@ -195,7 +213,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); if (c) { unsigned long vaddr = c->vm_start; - pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); struct page *end = page + (1 << order); split_page(page, order); @@ -206,13 +223,10 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) *handle = page_to_phys(page); do { - BUG_ON(!pte_none(*pte)); - SetPageReserved(page); - set_pte_at(&init_mm, vaddr, - pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); + map_page(vaddr, page_to_phys(page), + pgprot_noncached(PAGE_KERNEL)); page++; - pte++; vaddr += PAGE_SIZE; } while (size -= PAGE_SIZE); @@ -241,8 +255,7 @@ void __dma_free_coherent(size_t size, void *vaddr) { struct ppc_vm_region *c; unsigned long flags, addr; - pte_t *ptep; - + size = PAGE_ALIGN(size); spin_lock_irqsave(&consistent_lock, flags); @@ -258,29 +271,26 @@ void __dma_free_coherent(size_t size, void *vaddr) size = c->vm_end - c->vm_start; } - ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start); addr = c->vm_start; do { - pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); + pte_t *ptep; unsigned 
long pfn; - ptep++; - addr += PAGE_SIZE; - - if (!pte_none(pte) && pte_present(pte)) { - pfn = pte_pfn(pte); - + ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), + addr), + addr), + addr); + if (!pte_none(*ptep) && pte_present(*ptep)) { + pfn = pte_pfn(*ptep); + pte_clear(&init_mm, addr, ptep); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - ClearPageReserved(page); + ClearPageReserved(page); __free_page(page); - continue; } } - - printk(KERN_CRIT "%s: bad page in kernel page table\n", - __func__); + addr += PAGE_SIZE; } while (size -= PAGE_SIZE); flush_tlb_kernel_range(c->vm_start, c->vm_end); @@ -300,42 +310,6 @@ void __dma_free_coherent(size_t size, void *vaddr) } EXPORT_SYMBOL(__dma_free_coherent); -/* - * Initialise the consistent memory allocation. - */ -static int __init dma_alloc_init(void) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int ret = 0; - - do { - pgd = pgd_offset(&init_mm, CONSISTENT_BASE); - pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE); - pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE); - if (!pmd) { - printk(KERN_ERR "%s: no pmd tables\n", __func__); - ret = -ENOMEM; - break; - } - - pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); - if (!pte) { - printk(KERN_ERR "%s: no pte tables\n", __func__); - ret = -ENOMEM; - break; - } - - consistent_pte = pte; - } while (0); - - return ret; -} - -core_initcall(dma_alloc_init); - /* * make an area consistent. 
*/ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d3a4e67561f..579382c163a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -387,6 +387,10 @@ void __init mem_init(void) pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); #endif /* CONFIG_HIGHMEM */ +#ifdef CONFIG_NOT_COHERENT_CACHE + pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", + IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); +#endif /* CONFIG_NOT_COHERENT_CACHE */ pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", ioremap_bot, IOREMAP_TOP); pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", From 4c713189485dbea875aecd1990daed74908e181d Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 27 May 2009 09:10:28 +0200 Subject: [PATCH 623/900] Bluetooth: Remove useless flush_work() causing lockdep warnings The calls to flush_work() are pointless in a single thread workqueue and they are actually causing a lockdep warning. ============================================= [ INFO: possible recursive locking detected ] 2.6.30-rc6-02911-gbb803cf #16 --------------------------------------------- bluetooth/2518 is trying to acquire lock: (bluetooth){+.+.+.}, at: [] flush_work+0x28/0xb0 but task is already holding lock: (bluetooth){+.+.+.}, at: [] worker_thread+0x149/0x25e other info that might help us debug this: 2 locks held by bluetooth/2518: #0: (bluetooth){+.+.+.}, at: [] worker_thread+0x149/0x25e #1: (&conn->work_del){+.+...}, at: [] worker_thread+0x149/0x25e stack backtrace: Pid: 2518, comm: bluetooth Not tainted 2.6.30-rc6-02911-gbb803cf #16 Call Trace: [] ? printk+0xf/0x11 [] __lock_acquire+0x7ce/0xb1b [] lock_acquire+0x90/0xad [] ? flush_work+0x28/0xb0 [] flush_work+0x42/0xb0 [] ? flush_work+0x28/0xb0 [] del_conn+0x1c/0x84 [bluetooth] [] worker_thread+0x18e/0x25e [] ? worker_thread+0x149/0x25e [] ? del_conn+0x0/0x84 [bluetooth] [] ? autoremove_wake_function+0x0/0x33 [] ? worker_thread+0x0/0x25e [] kthread+0x45/0x6b [] ? 
kthread+0x0/0x6b [] kernel_thread_helper+0x7/0x10 Based on a report by Oliver Hartkopp Signed-off-by: Dave Young Tested-by: Oliver Hartkopp Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sysfs.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 4cc3624bd22..95f7a7a544b 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -90,9 +90,6 @@ static void add_conn(struct work_struct *work) struct hci_conn *conn = container_of(work, struct hci_conn, work_add); struct hci_dev *hdev = conn->hdev; - /* ensure previous del is complete */ - flush_work(&conn->work_del); - dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); if (device_add(&conn->dev) < 0) { @@ -118,9 +115,6 @@ static void del_conn(struct work_struct *work) struct hci_conn *conn = container_of(work, struct hci_conn, work_del); struct hci_dev *hdev = conn->hdev; - /* ensure previous add is complete */ - flush_work(&conn->work_add); - if (!device_is_registered(&conn->dev)) return; From 6af3fb72d2437239e5eb13a59e95dc43ccab3e8f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 27 May 2009 10:49:26 +0200 Subject: [PATCH 624/900] ALSA: Fix invalid jiffies check after pause The hw_ptr_jiffies has to be reset properly to avoid the invalid check of jiffies delta in snd_pcm_update_hw_ptr*() functions. Especially, this patch fixes the bogus jiffies check after the pause and resume. This patch is a modified version of the original patch by Jaroslav. 
Signed-off-by: Takashi Iwai --- sound/core/pcm_lib.c | 1 - sound/core/pcm_native.c | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index a2a792c18c4..3eea98a4e65 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1478,7 +1478,6 @@ static int snd_pcm_lib_ioctl_reset(struct snd_pcm_substream *substream, runtime->status->hw_ptr %= runtime->buffer_size; else runtime->status->hw_ptr = 0; - runtime->hw_ptr_jiffies = jiffies; snd_pcm_stream_unlock_irqrestore(substream, flags); return 0; } diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index fc6f98e257d..b5da656d1ec 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -848,6 +848,7 @@ static void snd_pcm_post_start(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); + runtime->hw_ptr_jiffies = jiffies; runtime->status->state = state; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) @@ -961,6 +962,11 @@ static int snd_pcm_do_pause(struct snd_pcm_substream *substream, int push) { if (substream->runtime->trigger_master != substream) return 0; + /* The jiffies check in snd_pcm_update_hw_ptr*() is done by + * a delta betwen the current jiffies, this gives a large enough + * delta, effectively to skip the check once. + */ + substream->runtime->hw_ptr_jiffies = jiffies - HZ * 1000; return substream->ops->trigger(substream, push ? SNDRV_PCM_TRIGGER_PAUSE_PUSH : SNDRV_PCM_TRIGGER_PAUSE_RELEASE); From 9d911d7903926a65ef49ec671bacd86bcee5eb51 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Thu, 21 May 2009 16:21:15 -0600 Subject: [PATCH 625/900] PCI Hotplug: acpiphp: don't store a pci_dev in acpiphp_func An oops can occur if a user attempts to use both PCI logical hotplug and the ACPI physical hotplug driver (acpiphp) in this sequence, where $slot/address == $device. 
In other words, if acpiphp has claimed a PCI device, and that device is logically removed, then acpiphp may oops when it attempts to access it again. # echo 1 > /sys/bus/pci/devices/$device/remove # echo 0 > /sys/bus/pci/slots/$slot/power Unable to handle kernel NULL pointer dereference (address 0000000000000000) Call Trace: [] show_stack+0x50/0xa0 [] show_regs+0x820/0x860 [] die+0x190/0x2a0 [] ia64_do_page_fault+0x8e0/0xa40 [] ia64_native_leave_kernel+0x0/0x270 [] pci_remove_bus_device+0x120/0x260 [] acpiphp_disable_slot+0x410/0x540 [acpiphp] [] disable_slot+0xc0/0x120 [acpiphp] [] power_write_file+0x1e0/0x2a0 [pci_hotplug] [] pci_slot_attr_store+0x60/0xa0 [] sysfs_write_file+0x230/0x2c0 [] vfs_write+0x190/0x2e0 [] sys_write+0x80/0x100 [] ia64_ret_from_syscall+0x0/0x20 [] __kernel_syscall_via_break+0x0/0x20 The root cause of this oops is that the logical remove ("echo 1 > /sys/bus/pci/devices/$device/remove") destroyed the pci_dev. The pci_dev struct itself wasn't deallocated because acpiphp kept a reference, but some of its fields became invalid. acpiphp doesn't have any real reason to keep a pointer to a pci_dev around. It can always derive it using pci_get_slot(). If a logical remove destroys the pci_dev, acpiphp won't find it and is thus prevented from causing mischief. 
Reviewed-by: Matthew Wilcox Reviewed-by: Kenji Kaneshige Tested-by: Kenji Kaneshige Reported-by: Kenji Kaneshige Acked-by: Bjorn Helgaas Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/acpiphp.h | 1 - drivers/pci/hotplug/acpiphp_glue.c | 63 ++++++++++++------------------ 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index 4fc168b7009..e68d5f20ffb 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -129,7 +129,6 @@ struct acpiphp_func { struct acpiphp_bridge *bridge; /* Ejectable PCI-to-PCI bridge */ struct list_head sibling; - struct pci_dev *pci_dev; struct notifier_block nb; acpi_handle handle; diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index a33794d9e0d..3a6064bce56 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -32,9 +32,6 @@ /* * Lifetime rules for pci_dev: - * - The one in acpiphp_func has its refcount elevated by pci_get_slot() - * when the driver is loaded or when an insertion event occurs. It loses - * a refcount when its ejected or the driver unloads. * - The one in acpiphp_bridge has its refcount elevated by pci_get_slot() * when the bridge is scanned and it loses a refcount when the bridge * is removed. 
@@ -130,6 +127,7 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv) unsigned long long adr, sun; int device, function, retval; struct pci_bus *pbus = bridge->pci_bus; + struct pci_dev *pdev; if (!acpi_pci_check_ejectable(pbus, handle) && !is_dock_device(handle)) return AE_OK; @@ -213,10 +211,10 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv) newfunc->slot = slot; list_add_tail(&newfunc->sibling, &slot->funcs); - /* associate corresponding pci_dev */ - newfunc->pci_dev = pci_get_slot(pbus, PCI_DEVFN(device, function)); - if (newfunc->pci_dev) { + pdev = pci_get_slot(pbus, PCI_DEVFN(device, function)); + if (pdev) { slot->flags |= (SLOT_ENABLED | SLOT_POWEREDON); + pci_dev_put(pdev); } if (is_dock_device(handle)) { @@ -617,7 +615,6 @@ static void cleanup_bridge(struct acpiphp_bridge *bridge) if (ACPI_FAILURE(status)) err("failed to remove notify handler\n"); } - pci_dev_put(func->pci_dev); list_del(list); kfree(func); } @@ -1101,22 +1098,24 @@ static int __ref enable_device(struct acpiphp_slot *slot) pci_enable_bridges(bus); pci_bus_add_devices(bus); - /* associate pci_dev to our representation */ list_for_each (l, &slot->funcs) { func = list_entry(l, struct acpiphp_func, sibling); - func->pci_dev = pci_get_slot(bus, PCI_DEVFN(slot->device, - func->function)); - if (!func->pci_dev) + dev = pci_get_slot(bus, PCI_DEVFN(slot->device, + func->function)); + if (!dev) continue; - if (func->pci_dev->hdr_type != PCI_HEADER_TYPE_BRIDGE && - func->pci_dev->hdr_type != PCI_HEADER_TYPE_CARDBUS) + if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE && + dev->hdr_type != PCI_HEADER_TYPE_CARDBUS) { + pci_dev_put(dev); continue; + } status = find_p2p_bridge(func->handle, (u32)1, bus, NULL); if (ACPI_FAILURE(status)) warn("find_p2p_bridge failed (error code = 0x%x)\n", status); + pci_dev_put(dev); } slot->flags |= SLOT_ENABLED; @@ -1142,17 +1141,14 @@ static void disable_bridges(struct pci_bus *bus) */ static int disable_device(struct acpiphp_slot 
*slot) { - int retval = 0; struct acpiphp_func *func; - struct list_head *l; + struct pci_dev *pdev; /* is this slot already disabled? */ if (!(slot->flags & SLOT_ENABLED)) goto err_exit; - list_for_each (l, &slot->funcs) { - func = list_entry(l, struct acpiphp_func, sibling); - + list_for_each_entry(func, &slot->funcs, sibling) { if (func->bridge) { /* cleanup p2p bridges under this P2P bridge */ cleanup_p2p_bridge(func->bridge->handle, @@ -1160,35 +1156,28 @@ static int disable_device(struct acpiphp_slot *slot) func->bridge = NULL; } - if (func->pci_dev) { - pci_stop_bus_device(func->pci_dev); - if (func->pci_dev->subordinate) { - disable_bridges(func->pci_dev->subordinate); - pci_disable_device(func->pci_dev); + pdev = pci_get_slot(slot->bridge->pci_bus, + PCI_DEVFN(slot->device, func->function)); + if (pdev) { + pci_stop_bus_device(pdev); + if (pdev->subordinate) { + disable_bridges(pdev->subordinate); + pci_disable_device(pdev); } + pci_remove_bus_device(pdev); + pci_dev_put(pdev); } } - list_for_each (l, &slot->funcs) { - func = list_entry(l, struct acpiphp_func, sibling); - + list_for_each_entry(func, &slot->funcs, sibling) { acpiphp_unconfigure_ioapics(func->handle); acpiphp_bus_trim(func->handle); - /* try to remove anyway. - * acpiphp_bus_add might have been failed */ - - if (!func->pci_dev) - continue; - - pci_remove_bus_device(func->pci_dev); - pci_dev_put(func->pci_dev); - func->pci_dev = NULL; } slot->flags &= (~SLOT_ENABLED); - err_exit: - return retval; +err_exit: + return 0; } From c87d9732004b3f8fd82d729f12ccfb96c0df279e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 27 May 2009 10:53:33 +0200 Subject: [PATCH 626/900] ALSA: Enable PCM hw_ptr_jiffies check only in xrun_debug mode The PCM hw_ptr jiffies check results sometimes in problems when a hardware doesn't give smooth hw_ptr updates. So far, au88x0 and some other drivers appear not working due to this strict check. 
However, this check is a nice debug tool, and the capability should be still kept. Hence, we disable this check now as default unless the user enables it by setting the xrun_debug mode to the specific stream via a proc file. Signed-off-by: Takashi Iwai --- Documentation/sound/alsa/Procfile.txt | 5 +++++ sound/core/pcm_lib.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt index bba2dbb79d8..cfac20cf9e3 100644 --- a/Documentation/sound/alsa/Procfile.txt +++ b/Documentation/sound/alsa/Procfile.txt @@ -104,6 +104,11 @@ card*/pcm*/xrun_debug When this value is greater than 1, the driver will show the stack trace additionally. This may help the debugging. + Since 2.6.30, this option also enables the hwptr check using + jiffies. This detects spontaneous invalid pointer callback + values, but can be lead to too much corrections for a (mostly + buggy) hardware that doesn't give smooth pointer updates. + card*/pcm*/sub*/info The general information of this PCM sub-stream. diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 3eea98a4e65..d659995ac3a 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -249,6 +249,11 @@ static int snd_pcm_update_hw_ptr_interrupt(struct snd_pcm_substream *substream) new_hw_ptr = hw_base + pos; } } + + /* Do jiffies check only in xrun_debug mode */ + if (!xrun_debug(substream)) + goto no_jiffies_check; + /* Skip the jiffies check for hardwares with BATCH flag. * Such hardware usually just increases the position at each IRQ, * thus it can't give any strange position. @@ -336,7 +341,9 @@ int snd_pcm_update_hw_ptr(struct snd_pcm_substream *substream) hw_base = 0; new_hw_ptr = hw_base + pos; } - if (((delta * HZ) / runtime->rate) > jdelta + HZ/100) { + /* Do jiffies check only in xrun_debug mode */ + if (xrun_debug(substream) && + ((delta * HZ) / runtime->rate) > jdelta + HZ/100) { hw_ptr_error(substream, "hw_ptr skipping! 
" "(pos=%ld, delta=%ld, period=%ld, jdelta=%lu/%lu)\n", From 55de5ef970c680d8d75f2a9aa7e4f172140dbd9c Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Wed, 27 May 2009 10:49:30 +0200 Subject: [PATCH 627/900] sound: usb-audio: make the MotU Fastlane work again Kernel 2.6.18 broke the MotU Fastlane, which uses duplicate endpoint numbers in a manner that is not only illegal but also confuses the kernel's endpoint descriptor caching mechanism. To work around this, we have to add a separate usb_set_interface() call to guide the USB core to the correct descriptors. Signed-off-by: Clemens Ladisch Reported-and-tested-by: David Fries Cc: Signed-off-by: Takashi Iwai --- sound/usb/usbaudio.c | 2 +- sound/usb/usbaudio.h | 2 +- sound/usb/usbmidi.c | 12 +++++++++++- sound/usb/usbquirks.h | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c index 823296d7d57..a6b88482637 100644 --- a/sound/usb/usbaudio.c +++ b/sound/usb/usbaudio.c @@ -3347,7 +3347,7 @@ static int snd_usb_create_quirk(struct snd_usb_audio *chip, [QUIRK_MIDI_YAMAHA] = snd_usb_create_midi_interface, [QUIRK_MIDI_MIDIMAN] = snd_usb_create_midi_interface, [QUIRK_MIDI_NOVATION] = snd_usb_create_midi_interface, - [QUIRK_MIDI_RAW] = snd_usb_create_midi_interface, + [QUIRK_MIDI_FASTLANE] = snd_usb_create_midi_interface, [QUIRK_MIDI_EMAGIC] = snd_usb_create_midi_interface, [QUIRK_MIDI_CME] = snd_usb_create_midi_interface, [QUIRK_AUDIO_STANDARD_INTERFACE] = create_standard_audio_quirk, diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index 36e4f7a29ad..8e7f78941ba 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -153,7 +153,7 @@ enum quirk_type { QUIRK_MIDI_YAMAHA, QUIRK_MIDI_MIDIMAN, QUIRK_MIDI_NOVATION, - QUIRK_MIDI_RAW, + QUIRK_MIDI_FASTLANE, QUIRK_MIDI_EMAGIC, QUIRK_MIDI_CME, QUIRK_MIDI_US122L, diff --git a/sound/usb/usbmidi.c b/sound/usb/usbmidi.c index 26bad373fe6..2fb35cc22a3 100644 --- a/sound/usb/usbmidi.c +++ 
b/sound/usb/usbmidi.c @@ -1778,8 +1778,18 @@ int snd_usb_create_midi_interface(struct snd_usb_audio* chip, umidi->usb_protocol_ops = &snd_usbmidi_novation_ops; err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints); break; - case QUIRK_MIDI_RAW: + case QUIRK_MIDI_FASTLANE: umidi->usb_protocol_ops = &snd_usbmidi_raw_ops; + /* + * Interface 1 contains isochronous endpoints, but with the same + * numbers as in interface 0. Since it is interface 1 that the + * USB core has most recently seen, these descriptors are now + * associated with the endpoint numbers. This will foul up our + * attempts to submit bulk/interrupt URBs to the endpoints in + * interface 0, so we have to make sure that the USB core looks + * again at interface 0 by calling usb_set_interface() on it. + */ + usb_set_interface(umidi->chip->dev, 0, 0); err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints); break; case QUIRK_MIDI_EMAGIC: diff --git a/sound/usb/usbquirks.h b/sound/usb/usbquirks.h index 647ef502965..5d955aaad85 100644 --- a/sound/usb/usbquirks.h +++ b/sound/usb/usbquirks.h @@ -1868,7 +1868,7 @@ YAMAHA_DEVICE(0x7010, "UB99"), .data = & (const struct snd_usb_audio_quirk[]) { { .ifnum = 0, - .type = QUIRK_MIDI_RAW + .type = QUIRK_MIDI_FASTLANE }, { .ifnum = 1, From ed37d83e6aa218192fb28bb6b82498d2a8c74070 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 May 2009 21:39:05 +1000 Subject: [PATCH 628/900] md: raid5: change incorrect usage of 'min' macro to 'min_t' A recent patch to raid5.c use min on an int and a sector_t. This isn't allowed. So change it to min_t(sector_t,x,y). 
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3c3626d2a1f..5d400aef8d9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3811,13 +3811,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= min(reshape_sectors, writepos); + writepos -= min_t(sector_t, reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; - readpos -= min(reshape_sectors, readpos); - safepos -= min(reshape_sectors, safepos); + readpos -= min_t(sector_t, reshape_sectors, readpos); + safepos -= min_t(sector_t, reshape_sectors, safepos); } /* 'writepos' is the most advanced device address we might write. From 346a850e3c3a20159cef2b79235e6d34aa497c65 Mon Sep 17 00:00:00 2001 From: Manuel Traut Date: Wed, 27 May 2009 06:20:05 -0700 Subject: [PATCH 629/900] Input: usb1400_ts - fix access to "device data" in resume function platform_data != driver_data driver data is actually the "correct" place of the struct however it is not placed there due to the need of the ac97 struct. 
This is broken since d9105c2b01 aka "[ARM] 5184/1: Split ucb1400_ts into core and touchscreen" Signed-off-by: Manuel Traut Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ucb1400_ts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/ucb1400_ts.c b/drivers/input/touchscreen/ucb1400_ts.c index f100c7f4c1d..6954f550010 100644 --- a/drivers/input/touchscreen/ucb1400_ts.c +++ b/drivers/input/touchscreen/ucb1400_ts.c @@ -419,7 +419,7 @@ static int ucb1400_ts_remove(struct platform_device *dev) #ifdef CONFIG_PM static int ucb1400_ts_resume(struct platform_device *dev) { - struct ucb1400_ts *ucb = platform_get_drvdata(dev); + struct ucb1400_ts *ucb = dev->dev.platform_data; if (ucb->ts_task) { /* From 683a04cebc63819a36b1db19843bd17771f05b55 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 May 2009 15:45:34 +0200 Subject: [PATCH 630/900] netfilter: xt_hashlimit does a wrong SEQ_SKIP The function dl_seq_show() returns 1 (equal to SEQ_SKIP) in case a seq_printf() call returns -1. It should return -1. This SEQ_SKIP behavior breaks processing the proc file e.g. via a pipe or just through less. 
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Patrick McHardy --- net/netfilter/xt_hashlimit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index a5b5369c30f..219dcdbe388 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -926,7 +926,7 @@ static int dl_seq_show(struct seq_file *s, void *v) if (!hlist_empty(&htable->hash[*bucket])) { hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) if (dl_seq_real_show(ent, htable->family, s)) - return 1; + return -1; } return 0; } From eeff9beec3d2563c42cca41e66d4169592bb5475 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 May 2009 15:49:11 +0200 Subject: [PATCH 631/900] netfilter: nfnetlink_log: fix wrong skbuff size calculation This problem was introduced in 72961ecf84d67d6359a1b30f9b2a8427f13e1e71 since no space was reserved for the new attributes NFULA_HWTYPE, NFULA_HWLEN and NFULA_HWHEADER. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nfnetlink_log.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index fd326ac27ec..66a6dd5c519 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -581,6 +581,12 @@ nfulnl_log_packet(u_int8_t pf, + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)); + if (in && skb_mac_header_was_set(skb)) { + size += nla_total_size(skb->dev->hard_header_len) + + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + + nla_total_size(sizeof(u_int16_t)); /* hwlen */ + } + spin_lock_bh(&inst->lock); if (inst->flags & NFULNL_CFG_F_SEQ) From 348ca1029e8bae6e0c49097ad25439b17c5326f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2009 15:46:50 +0100 Subject: [PATCH 632/900] FS-Cache: Fixup renamed filenames in comments in internal.h Fix up renamed filenames in comments in 
fs/fscache/internal.h. Originally, the files were all called fsc-xxx.c, but they got renamed to just xxx.c. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/fscache/internal.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index e0cbd16f6dc..1c341304621 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -28,7 +28,7 @@ #define FSCACHE_MAX_THREADS 32 /* - * fsc-cache.c + * cache.c */ extern struct list_head fscache_cache_list; extern struct rw_semaphore fscache_addremove_sem; @@ -37,7 +37,7 @@ extern struct fscache_cache *fscache_select_cache_for_object( struct fscache_cookie *); /* - * fsc-cookie.c + * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; @@ -45,13 +45,13 @@ extern void fscache_cookie_init_once(void *); extern void __fscache_cookie_put(struct fscache_cookie *); /* - * fsc-fsdef.c + * fsdef.c */ extern struct fscache_cookie fscache_fsdef_index; extern struct fscache_cookie_def fscache_fsdef_netfs_def; /* - * fsc-histogram.c + * histogram.c */ #ifdef CONFIG_FSCACHE_HISTOGRAM extern atomic_t fscache_obj_instantiate_histogram[HZ]; @@ -75,7 +75,7 @@ extern const struct file_operations fscache_histogram_fops; #endif /* - * fsc-main.c + * main.c */ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; @@ -86,14 +86,14 @@ extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); /* - * fsc-object.c + * object.c */ extern void fscache_withdrawing_object(struct fscache_cache *, struct fscache_object *); extern void fscache_enqueue_object(struct fscache_object *); /* - * fsc-operation.c + * operation.c */ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); @@ -104,7 +104,7 @@ extern void fscache_start_operations(struct fscache_object *); extern void fscache_operation_gc(struct work_struct *); /* - * fsc-proc.c + * proc.c */ #ifdef CONFIG_PROC_FS 
extern int __init fscache_proc_init(void); @@ -115,7 +115,7 @@ extern void fscache_proc_cleanup(void); #endif /* - * fsc-stats.c + * stats.c */ #ifdef CONFIG_FSCACHE_STATS extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; From 911e690e70540f009125bacd16c017eb1a7b1916 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2009 15:46:55 +0100 Subject: [PATCH 633/900] CacheFiles: Fixup renamed filenames in comments in internal.h Fix up renamed filenames in comments in fs/cachefiles/internal.h. Originally, the files were all called cf-xxx.c, but they got renamed to just xxx.c. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/cachefiles/internal.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 19218e1463d..f7c255f9c62 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,13 +122,13 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache) } /* - * cf-bind.c + * bind.c */ extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); /* - * cf-daemon.c + * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; @@ -136,17 +136,17 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, unsigned fnr, unsigned bnr); /* - * cf-interface.c + * interface.c */ extern const struct fscache_cache_ops cachefiles_cache_ops; /* - * cf-key.c + * key.c */ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); /* - * cf-namei.c + * namei.c */ extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); @@ -165,7 +165,7 @@ extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); /* - * cf-proc.c + * proc.c */ #ifdef CONFIG_CACHEFILES_HISTOGRAM extern atomic_t cachefiles_lookup_histogram[HZ]; @@ 
-190,7 +190,7 @@ void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) #endif /* - * cf-rdwr.c + * rdwr.c */ extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, struct page *, gfp_t); @@ -205,7 +205,7 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *); extern void cachefiles_uncache_page(struct fscache_object *, struct page *); /* - * cf-security.c + * security.c */ extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, @@ -225,7 +225,7 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache, } /* - * cf-xattr.c + * xattr.c */ extern int cachefiles_check_object_type(struct cachefiles_object *object); extern int cachefiles_set_object_xattr(struct cachefiles_object *object, From 07f4f3e8a24138ca2f3650723d670df25687cd05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= Date: Wed, 27 May 2009 14:37:28 -0400 Subject: [PATCH 634/900] i915: Set object to gtt domain when faulting it back in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a GEM object is evicted from the GTT we set it to the CPU domain, as it might get swapped in and out or ever mmapped regularly. If the object is mmapped through the GTT it can still get evicted in this way by other objects requiring GTT space. When the GTT mapping is touched again we fault it back into the GTT, but fail to set it back to the GTT domain. This means we fail to flush any cached CPU writes to the pages backing the object which will then happen "eventually", typically after we write to the page through the uncached GTT mapping. [anholt: Note that userland does do a set_domain(GTT, GTT) when starting to access the GTT mapping. That covers getting the existing mapping of the object synchronized if it's bound to the GTT. 
But set_domain(GTT, GTT) doesn't do anything if the object is currently unbound. This fix covers the transition to being bound for GTT mapping.] Fixes glyph and other pixmap corruption during swapping. fd.o bug #21790 Signed-off-by: Kristian Høgsberg Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/i915_gem.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index e2421869a40..670d1288146 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1145,6 +1145,13 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; } + + ret = i915_gem_object_set_to_gtt_domain(obj, write); + if (ret) { + mutex_unlock(&dev->struct_mutex); + return VM_FAULT_SIGBUS; + } + list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list); } From b787f2e2a37a373a045f4d9b9bed941ccff01663 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Wed, 13 May 2009 16:25:57 -0500 Subject: [PATCH 635/900] fsldma: Fix compile warnings When we build with dma_addr_t as a 64-bit quantity we get: drivers/dma/fsldma.c: In function 'fsl_chan_xfer_ld_queue': drivers/dma/fsldma.c:625: warning: cast to pointer from integer of different size drivers/dma/fsldma.c: In function 'fsl_dma_chan_do_interrupt': drivers/dma/fsldma.c:737: warning: cast to pointer from integer of different size drivers/dma/fsldma.c:737: warning: cast to pointer from integer of different size drivers/dma/fsldma.c: In function 'of_fsl_dma_probe': drivers/dma/fsldma.c:927: warning: cast to pointer from integer of different size Signed-off-by: Kumar Gala Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 15783102bf1..f18d1bde043 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -641,8 +641,8 @@ static void fsl_chan_xfer_ld_queue(struct
fsl_dma_chan *fsl_chan) if (ld_node != &fsl_chan->ld_queue) { /* Get the ld start address from ld_queue */ next_dest_addr = to_fsl_desc(ld_node)->async_tx.phys; - dev_dbg(fsl_chan->dev, "xfer LDs staring from %p\n", - (void *)next_dest_addr); + dev_dbg(fsl_chan->dev, "xfer LDs staring from 0x%llx\n", + (unsigned long long)next_dest_addr); set_cdar(fsl_chan, next_dest_addr); dma_start(fsl_chan); } else { @@ -756,8 +756,9 @@ static irqreturn_t fsl_dma_chan_do_interrupt(int irq, void *data) */ if (stat & FSL_DMA_SR_EOSI) { dev_dbg(fsl_chan->dev, "event: End-of-segments INT\n"); - dev_dbg(fsl_chan->dev, "event: clndar %p, nlndar %p\n", - (void *)get_cdar(fsl_chan), (void *)get_ndar(fsl_chan)); + dev_dbg(fsl_chan->dev, "event: clndar 0x%llx, nlndar 0x%llx\n", + (unsigned long long)get_cdar(fsl_chan), + (unsigned long long)get_ndar(fsl_chan)); stat &= ~FSL_DMA_SR_EOSI; update_cookie = 1; } @@ -947,8 +948,8 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev, } dev_info(&dev->dev, "Probe the Freescale DMA driver for %s " - "controller at %p...\n", - match->compatible, (void *)fdev->reg.start); + "controller at 0x%llx...\n", + match->compatible, (unsigned long long)fdev->reg.start); fdev->reg_base = ioremap(fdev->reg.start, fdev->reg.end - fdev->reg.start + 1); From a0d24b295aed7a9daf4ca36bd4784e4d40f82303 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 19 May 2009 12:03:15 +0800 Subject: [PATCH 636/900] nfsd: fix hung up of nfs client while sync write data to nfs server Commit 'Short write in nfsd becomes a full write to the client' (31dec2538e45e9fff2007ea1f4c6bae9f78db724) broke sync writes. The problem can be reproduced with the following commands: $ mount -t nfs -o sync 192.168.0.21:/nfsroot /mnt $ cd /mnt $ echo aaaa > temp.txt Then the nfs client hangs. In SYNC mode the server always returns a write count of 0 to the client.
This is because the value of host_err in nfsd_vfs_write() is overwritten in SYNC mode by 'host_err=nfsd_sync(file);', and then we return host_err (which is now 0) as the write count. This patch fixes the problem. Signed-off-by: Wei Yongjun Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6c68ffd6b4b..b660435978d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1015,6 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { + *cnt = host_err; nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1060,10 +1061,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) { + if (host_err >= 0) err = 0; - *cnt = host_err; - } else + else err = nfserrno(host_err); out: return err; From dacd2549ca61ddbdd1ed62a76ca95dea3f0e02c6 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 26 May 2009 09:08:03 +0900 Subject: [PATCH 637/900] PCI/ACPI: fix wrong ref count handling in acpi_pci_bind() The 'dev' field of struct acpi_pci_data holds a pointer to struct pci_dev without incrementing the reference counter. Because of this, I got the following kernel oops when I was doing some pci hotplug operations. This patch fixes this bug by replacing the incorrect hand-made pci_find_slot() with pci_get_slot() in acpi_pci_bind().
BUG: unable to handle kernel NULL pointer dereference at 00000000000000e8 IP: [] acpi_pci_unbind+0xb1/0xdd Call Trace: [] acpi_bus_remove+0x54/0x68 [] acpi_bus_trim+0x75/0xe3 [] acpiphp_disable_slot+0x16d/0x1e0 [acpiphp] [] disable_slot+0x20/0x60 [acpiphp] [] power_write_file+0xc8/0x110 [] pci_slot_attr_store+0x24/0x30 [] sysfs_write_file+0xce/0x140 [] vfs_write+0xc7/0x170 [] sys_write+0x50/0x90 [] system_call_fastpath+0x16/0x1b Signed-off-by: Kenji Kaneshige Reviewed-by: Bjorn Helgaas Reviewed-by: Alex Chiang Tested-by: Alex Chiang Signed-off-by: Len Brown --- drivers/acpi/pci_bind.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/pci_bind.c b/drivers/acpi/pci_bind.c index 95650f83ce2..bc46de3d967 100644 --- a/drivers/acpi/pci_bind.c +++ b/drivers/acpi/pci_bind.c @@ -116,9 +116,6 @@ int acpi_pci_bind(struct acpi_device *device) struct acpi_pci_data *pdata; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; acpi_handle handle; - struct pci_dev *dev; - struct pci_bus *bus; - if (!device || !device->parent) return -EINVAL; @@ -176,20 +173,9 @@ int acpi_pci_bind(struct acpi_device *device) * Locate matching device in PCI namespace. If it doesn't exist * this typically means that the device isn't currently inserted * (e.g. docking station, port replicator, etc.). - * We cannot simply search the global pci device list, since - * PCI devices are added to the global pci list when the root - * bridge start ops are run, which may not have happened yet. 
*/ - bus = pci_find_bus(data->id.segment, data->id.bus); - if (bus) { - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->devfn == PCI_DEVFN(data->id.device, - data->id.function)) { - data->dev = dev; - break; - } - } - } + data->dev = pci_get_slot(pdata->bus, + PCI_DEVFN(data->id.device, data->id.function)); if (!data->dev) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device %04x:%02x:%02x.%d not present in PCI namespace\n", @@ -259,9 +245,10 @@ int acpi_pci_bind(struct acpi_device *device) end: kfree(buffer.pointer); - if (result) + if (result) { + pci_dev_put(data->dev); kfree(data); - + } return result; } @@ -303,6 +290,7 @@ static int acpi_pci_unbind(struct acpi_device *device) if (data->dev->subordinate) { acpi_pci_irq_del_prt(data->id.segment, data->bus->number); } + pci_dev_put(data->dev); kfree(data); end: From 7f4218354fe312b327af06c3d8c95ed5f214c8ca Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 27 May 2009 18:51:06 -0400 Subject: [PATCH 638/900] nfsd: Revert "svcrpc: take advantage of tcp autotuning" This reverts commit 47a14ef1af48c696b214ac168f056ddc79793d0e "svcrpc: take advantage of tcp autotuning", which uncovered some further problems in the server rpc code, causing significant performance regressions in common cases. We will likely reinstate this patch after releasing 2.6.30 and applying some work on the underlying fixes to the problem (developed by Trond). Reported-by: Jeff Moyer Cc: Olga Kornievskaia Cc: Jim Rees Cc: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svcsock.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index af3198814c1..9d504234af4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -345,6 +345,7 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, lock_sock(sock->sk); sock->sk->sk_sndbuf = snd * 2; sock->sk->sk_rcvbuf = rcv * 2; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; release_sock(sock->sk); #endif } @@ -796,6 +797,23 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) + /* sndbuf needs to have room for one request + * per thread, otherwise we can stall even when the + * network isn't a bottleneck. + * + * We count all threads rather than threads in a + * particular pool, which provides an upper bound + * on the number of threads which will access the socket. + * + * rcvbuf just needs to be able to hold a few requests. + * Normally they will be removed from the queue + * as soon a a complete request arrives. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_max_mesg, + 3 * serv->sv_max_mesg); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get @@ -1043,6 +1061,15 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + /* initialise setting must have enough space to + * receive and respond to one request. 
+ * svc_tcp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); + + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1112,14 +1139,8 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else { - /* initialise setting must have enough space to - * receive and respond to one request. - */ - svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, - 4 * serv->sv_max_mesg); + else svc_tcp_init(svsk, serv); - } dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); From 98779be861a05c4cb75bed916df72ec0cba8b53d Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 14 May 2009 16:34:28 -0500 Subject: [PATCH 639/900] svcrdma: dma unmap the correct length for the RPCRDMA header page. The svcrdma module was incorrectly unmapping the RPCRDMA header page. On IBM pserver systems this causes a resource leak that results in running out of bus address space (10 cthon iterations will reproduce it). The code was mapping the full page but only unmapping the actual header length. The fix is to only map the header length. I also cleaned up the use of ib_dma_map_page() calls since the unmap logic always uses ib_dma_unmap_single(). I made these symmetrical. Signed-off-by: Steve Wise Signed-off-by: Tom Tucker Signed-off-by: J. 
Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 12 ++++++------ net/sunrpc/xprtrdma/svc_rdma_transport.c | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 8b510c5e877..f11be72a1a8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -128,7 +128,8 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt, page_bytes -= sge_bytes; frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, page, 0, + ib_dma_map_single(xprt->sc_cm_id->device, + page_address(page), PAGE_SIZE, DMA_TO_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, frmr->page_list->page_list[page_no])) @@ -532,18 +533,17 @@ static int send_reply(struct svcxprt_rdma *rdma, clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ + ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, - page, 0, PAGE_SIZE, DMA_TO_DEVICE); + ib_dma_map_single(rdma->sc_cm_id->device, page_address(page), + ctxt->sge[0].length, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) goto err; atomic_inc(&rdma->sc_dma_used); ctxt->direction = DMA_TO_DEVICE; - ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); - ctxt->sge[0].lkey = rdma->sc_dma_lkey; - /* Determine how many of our SGE are to be transmitted */ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4b0c2fa15e0..5151f9f6c57 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -500,8 +500,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= 
xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; - pa = ib_dma_map_page(xprt->sc_cm_id->device, - page, 0, PAGE_SIZE, + pa = ib_dma_map_single(xprt->sc_cm_id->device, + page_address(page), PAGE_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) goto err_put_ctxt; @@ -1315,8 +1315,8 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); /* Prepare SGE for local address */ - sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, - p, 0, PAGE_SIZE, DMA_FROM_DEVICE); + sge.addr = ib_dma_map_single(xprt->sc_cm_id->device, + page_address(p), PAGE_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { put_page(p); return; @@ -1343,7 +1343,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); - ib_dma_unmap_page(xprt->sc_cm_id->device, + ib_dma_unmap_single(xprt->sc_cm_id->device, sge.addr, PAGE_SIZE, DMA_FROM_DEVICE); svc_rdma_put_context(ctxt, 1); From 5b6045a906f48d37591365c5dcdd6d1d146bfd4a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 May 2009 17:28:02 +0200 Subject: [PATCH 640/900] trace: disable preemption before taking raw spinlocks s390 code uses smp_processor_id() in __raw_spin_lock() code which reveals that a (raw) spinlock is taken without preemption disabled. This can potentially deadlock. To fix this explicitly disable and enable preemption. 
BUG: using smp_processor_id() in preemptible [00000000] code: cat/2278 caller is trace_find_cmdline+0x40/0xfc CPU: 0 Not tainted 2.6.30-rc7-dirty #39 Process cat (pid: 2278, task: 000000003faedb68, ksp: 000000003b33b988) 000000003b33b988 000000003b33bae0 0000000000000002 0000000000000000 000000003b33bb80 000000003b33baf8 000000003b33baf8 00000000000175d6 0000000000000001 000000003b33b988 000000003f9b0000 000000000000000b 000000000000000c 000000003b33bb40 000000003b33bae0 0000000000000000 0000000000000000 00000000000175d6 000000003b33bae0 000000003b33bb28 Call Trace: ([<00000000000174b2>] show_trace+0x112/0x170) [<0000000000017582>] show_stack+0x72/0x100 [<0000000000441538>] dump_stack+0xc8/0xd8 [<000000000025c350>] debug_smp_processor_id+0x114/0x130 [<00000000000bf0e4>] trace_find_cmdline+0x40/0xfc [<00000000000c35d4>] trace_print_context+0x58/0xac [<00000000000bb676>] print_trace_line+0x416/0x470 [<00000000000bc8fe>] s_show+0x4e/0x428 [<000000000013834e>] seq_read+0x36a/0x5d4 [<0000000000112a78>] vfs_read+0xc8/0x174 [<0000000000112c58>] SyS_read+0x74/0xc4 [<000000000002c7ae>] sysc_noemu+0x10/0x16 [<000002000012436c>] 0x2000012436c 1 lock held by cat/2278: #0: (&p->lock){+.+.+.}, at: [<0000000000138056>] seq_read+0x72/0x5d4 [ Impact: fix preempt-unsafe raw spinlock ] Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02d32baa23a..a3a8a87d7e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -808,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[]) return; } + preempt_disable(); __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) @@ -816,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[]) strcpy(comm, "<...>"); __raw_spin_unlock(&trace_cmdline_lock); + preempt_enable(); } void tracing_record_cmdline(struct task_struct *tsk) From 
f2aebaee653a35b01c3665de2cbb1e31456b8ea8 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 27 May 2009 21:36:02 +0800 Subject: [PATCH 641/900] ftrace: don't convert function's local variable name in macro "call" is an argument of macro, but it is also used as a local variable name of function in macro. We should keep this local variable name distinct from any CPP macro parameter name if both are in the same macro scope, although it hasn't caused any problem yet. [ Impact: robustify macro ] Signed-off-by: Zhao Lei Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 87fc227c6fb..b4ec83ae711 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -397,19 +397,19 @@ static void ftrace_profile_##call(proto) \ perf_tpcounter_event(event_##call.id); \ } \ \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ int ret = 0; \ \ - if (!atomic_inc_return(&call->profile_count)) \ + if (!atomic_inc_return(&event_call->profile_count)) \ ret = register_trace_##call(ftrace_profile_##call); \ \ return ret; \ } \ \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ { \ - if (atomic_add_negative(-1, &call->profile_count)) \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ unregister_trace_##call(ftrace_profile_##call); \ } @@ -433,9 +433,9 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ #define __array(type, item, len) #undef __string -#define __string(item, src) \ - __str_offsets.item = __str_size + \ - offsetof(typeof(*entry), __str_data); \ +#define __string(item, src) \ + __str_offsets.item = __str_size + \ + offsetof(typeof(*entry), 
__str_data); \ __str_size += strlen(src) + 1; #undef __assign_str @@ -451,8 +451,8 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ - struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ - struct ftrace_event_call *call = &event_##call; \ + struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ + struct ftrace_event_call *event_call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ @@ -473,7 +473,7 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - if (!filter_current_check_discard(call, entry, event)) \ + if (!filter_current_check_discard(event_call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ } \ \ From abfe0af9813153bae8c85d9bac966bafcb8ddab1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 May 2009 00:37:40 -0700 Subject: [PATCH 642/900] x86: enable_update_mptable should be a macro instead of declaring one variant as an inline function... 
because other case is a variable Signed-off-by: Yinghai Lu LKML-Reference: <4A13B344.7030307@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3dcbaaaa363..e2a1bb6d71e 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -61,9 +61,11 @@ extern void get_smp_config(void); #ifdef CONFIG_X86_MPPARSE extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); +extern int enable_update_mptable; #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } +#define enable_update_mptable 0 #endif void __cpuinit generic_processor_info(int apicid, int version); @@ -87,15 +89,6 @@ static inline int acpi_probe_gsi(void) } #endif /* CONFIG_ACPI */ -#ifdef CONFIG_X86_MPPARSE -extern int enable_update_mptable; -#else -static inline int enable_update_mptable(void) -{ - return 0; -} -#endif - #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) struct physid_mask { From 1812e67c7410c8d0d57f14a3dc81a99bd5b30e3e Mon Sep 17 00:00:00 2001 From: Tony Vroon Date: Wed, 27 May 2009 21:00:41 +0100 Subject: [PATCH 643/900] ALSA: hda - Compaq Presario CQ60 patching for Conexant A docking mic control is shown by default. The Compaq Presario CQ60 laptop has no docking connector, so designate it as a CXT5051_HP model. This makes the phantom mixer slider disappear. 
Signed-off-by: Tony Vroon Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 56ce19e68cb..4fcbe21829a 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1848,6 +1848,7 @@ static const char *cxt5051_models[CXT5051_MODELS] = { static struct snd_pci_quirk cxt5051_cfg_tbl[] = { SND_PCI_QUIRK(0x103c, 0x30cf, "HP DV6736", CXT5051_HP_DV6736), + SND_PCI_QUIRK(0x103c, 0x360b, "Compaq Presario CQ60", CXT5051_HP), SND_PCI_QUIRK(0x14f1, 0x0101, "Conexant Reference board", CXT5051_LAPTOP), SND_PCI_QUIRK(0x14f1, 0x5051, "HP Spartan 1.1", CXT5051_HP), From 7d96fd41cadc55f4e00231c8c71b8e25c779f122 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 25 May 2009 11:02:02 +0200 Subject: [PATCH 644/900] x86: move rdtsc_barrier() into the TSC vread method The *fence instructions were moved to vsyscall_64.c by commit cb9e35dce94a1b9c59d46224e8a94377d673e204. But this breaks the vDSO, because vread methods are also called from there. Besides, the synchronization might be unnecessary for other time sources than TSC. 
[ Impact: fix potential time warp in VDSO ] Signed-off-by: Petr Tesarik LKML-Reference: <9d0ea9ea0f866bdc1f4d76831221ae117f11ea67.1243241859.git.ptesarik@suse.cz> Signed-off-by: Thomas Gleixner Cc: --- arch/x86/kernel/tsc.c | 11 ++++++++++- arch/x86/kernel/vsyscall_64.c | 8 -------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc43..cf8611d991e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs) #ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)vget_cycles(); + cycle_t ret; + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + rdtsc_barrier(); return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc906..25ee06a80aa 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; From f49afbb572d5e08ae12f1a979dc2e41745040339 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Thu, 28 May 2009 16:41:36 +0300 Subject: [PATCH 645/900] MAINTAINER: Add F: entries for Gemini and FA526 Signed-off-by: Paulius Zaleckas --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5ee166e27b9..42c53ab6b9f 100644 
--- a/MAINTAINERS +++ b/MAINTAINERS @@ -624,6 +624,7 @@ M: paulius.zaleckas@teltonika.lt L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) T: git git://gitorious.org/linux-gemini/mainline.git S: Maintained +F: arch/arm/mach-gemini/ ARM/EBSA110 MACHINE SUPPORT P: Russell King @@ -650,6 +651,7 @@ P: Paulius Zaleckas M: paulius.zaleckas@teltonika.lt L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) S: Maintained +F: arch/arm/mm/*-fa* ARM/FOOTBRIDGE ARCHITECTURE P: Russell King From 67a433ce278b98f47272726a22537fab7fd99de9 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Thu, 28 May 2009 16:42:25 +0300 Subject: [PATCH 646/900] Gemini: Fix SRAM/ROM location after memory swap Signed-off-by: Paulius Zaleckas --- arch/arm/mach-gemini/include/mach/hardware.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/mach-gemini/include/mach/hardware.h b/arch/arm/mach-gemini/include/mach/hardware.h index de6752674c0..213a4fcfeb1 100644 --- a/arch/arm/mach-gemini/include/mach/hardware.h +++ b/arch/arm/mach-gemini/include/mach/hardware.h @@ -15,10 +15,9 @@ /* * Memory Map definitions */ -/* FIXME: Does it really swap SRAM like this? */ #ifdef CONFIG_GEMINI_MEM_SWAP # define GEMINI_DRAM_BASE 0x00000000 -# define GEMINI_SRAM_BASE 0x20000000 +# define GEMINI_SRAM_BASE 0x70000000 #else # define GEMINI_SRAM_BASE 0x00000000 # define GEMINI_DRAM_BASE 0x10000000 From ed888aef427365d19f887c271a3a906d16422d24 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 17:16:04 +0200 Subject: [PATCH 647/900] dma-debug: re-add dma memory leak detection This is basically a revert of commit 314eeac9 but now in a fixed version. 
Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index cdd205d6bf7..e47e1a08c33 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -105,6 +105,11 @@ static const char *type2name[4] = { "single", "page", static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; +/* little merge helper - remove it after the merge window */ +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif + /* * The access to some variables in this macro is racy. We can't use atomic_t * here because all these variables are exported to debugfs. Some of them even @@ -458,9 +463,60 @@ out_err: return -ENOMEM; } +static int device_dma_allocations(struct device *dev) +{ + struct dma_debug_entry *entry; + unsigned long flags; + int count = 0, i; + + for (i = 0; i < HASH_SIZE; ++i) { + spin_lock_irqsave(&dma_entry_hash[i].lock, flags); + list_for_each_entry(entry, &dma_entry_hash[i].list, list) { + if (entry->dev == dev) + count += 1; + } + spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); + } + + return count; +} + +static int dma_debug_device_change(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + int count; + + + switch (action) { + case BUS_NOTIFY_UNBOUND_DRIVER: + count = device_dma_allocations(dev); + if (count == 0) + break; + err_printk(dev, NULL, "DMA-API: device driver has pending " + "DMA allocations while released from device " + "[count=%d]\n", count); + break; + default: + break; + } + + return 0; +} + void dma_debug_add_bus(struct bus_type *bus) { - /* FIXME: register notifier */ + struct notifier_block *nb; + + nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); + if (nb == NULL) { + printk(KERN_ERR "dma_debug_add_bus: out of memory\n"); + return; + } + + nb->notifier_call = dma_debug_device_change; + + 
bus_register_notifier(bus, nb); } /* From fefda117ddb324b872312f1f061230e627c9f5ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 12:21:42 +0200 Subject: [PATCH 648/900] amd-iommu: add amd_iommu_dump parameter This kernel parameter will be useful to get some AMD IOMMU related information in dmesg that is not necessary for the default user but may be helpful in debug situations. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 6 ++++++ arch/x86/kernel/amd_iommu_init.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 95c8cd9d22b..89dfb3793ed 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -194,6 +194,12 @@ #define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ +extern bool amd_iommu_dump; +#define DUMP_printk(format, arg...) 
\ + do { \ + if (amd_iommu_dump) \ + printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ + } while(0); /* * This structure contains generic data for IOMMU protection domains diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..57fb7a7cb6e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,6 +115,8 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +bool amd_iommu_dump; + static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have @@ -1211,6 +1213,13 @@ void __init amd_iommu_detect(void) * ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ + amd_iommu_dump = true; + + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -1235,5 +1244,6 @@ static int __init parse_amd_iommu_size_options(char *str) return 1; } +__setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); __setup("amd_iommu_size=", parse_amd_iommu_size_options); From 9c72041f719e2864d4208a89341c36b316dbf893 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 13:53:57 +0200 Subject: [PATCH 649/900] amd-iommu: add dump for iommus described in ivrs table Add information about IOMMU devices described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 57fb7a7cb6e..28165902ae2 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -748,6 +748,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) h = (struct ivhd_header *)p; switch (*p) { case ACPI_IVHD_TYPE: + + DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + "seg: %d flags: %01x info %04x\n", + PCI_BUS(h->devid), PCI_SLOT(h->devid), + PCI_FUNC(h->devid), h->cap_ptr, + h->pci_seg, h->flags, h->info); + DUMP_printk(" mmio-addr: %016llx\n", + h->mmio_phys); + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); if (iommu == NULL) return -ENOMEM; From 42a698f40a0946f5517308411b9e003ae031414d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 15:41:28 +0200 Subject: [PATCH 650/900] amd-iommu: print ivhd information to dmesg when requested Add information about devices belonging to an IOMMU as described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 73 ++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 28165902ae2..fe3e6453cbf 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -598,32 +598,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, p += sizeof(struct ivhd_header); end += h->length; + while (p < end) { e = (struct ivhd_entry *)p; switch (e->type) { case IVHD_DEV_ALL: + + DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" + " last device %02x:%02x.%x flags: %02x\n", + PCI_BUS(iommu->first_device), + PCI_SLOT(iommu->first_device), + PCI_FUNC(iommu->first_device), + PCI_BUS(iommu->last_device), + PCI_SLOT(iommu->last_device), + PCI_FUNC(iommu->last_device), + e->flags); + for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: + + DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " + "flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: + + DUMP_printk(" DEV_SELECT_RANGE_START\t " + "devid: %02x:%02x.%x flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid_start = e->devid; flags = e->flags; ext_flags = 0; alias = false; break; case IVHD_DEV_ALIAS: + + DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " + "flags: %02x devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid = e->devid; devid_to = e->ext >> 8; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: + + DUMP_printk(" 
DEV_ALIAS_RANGE\t\t " + "devid: %02x:%02x.%x flags: %02x " + "devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid_start = e->devid; flags = e->flags; devid_to = e->ext >> 8; @@ -631,17 +682,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, alias = true; break; case IVHD_DEV_EXT_SELECT: + + DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " + "flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: + + DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " + "%02x:%02x.%x flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid_start = e->devid; flags = e->flags; ext_flags = e->ext; alias = false; break; case IVHD_DEV_RANGE_END: + + DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid)); + devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) From 02acc43a294098c2a4cd22cf24e9c988644f9f7f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 16:24:21 +0200 Subject: [PATCH 651/900] amd-iommu: print ivmd information to dmesg when requested Add information about device memory mapping requirements for the IOMMU as described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fe3e6453cbf..b90a78cfdcb 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -983,6 +983,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; + char *s; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) @@ -991,13 +992,16 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: case ACPI_IVMD_TYPE: + s = "IVMD_TYPEi\t\t\t"; e->devid_start = e->devid_end = m->devid; break; case ACPI_IVMD_TYPE_ALL: + s = "IVMD_TYPE_ALL\t\t"; e->devid_start = 0; e->devid_end = amd_iommu_last_bdf; break; case ACPI_IVMD_TYPE_RANGE: + s = "IVMD_TYPE_RANGE\t\t"; e->devid_start = m->devid; e->devid_end = m->aux; break; @@ -1006,6 +1010,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) e->address_end = e->address_start + PAGE_ALIGN(m->range_length); e->prot = m->flags >> 1; + DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" + " range_start: %016llx range_end: %016llx flags: %x\n", s, + PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), + PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), + PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), + e->address_start, e->address_end, m->flags); + list_add_tail(&e->list, &amd_iommu_unity_map); return 0; From b3b99ef8b4f80f3f093a72110e7697c2281ae45d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:02:48 +0200 Subject: [PATCH 652/900] amd-iommu: move protection domain printk to dump code This information is only helpful for debugging. Don't print it anymore unless explicitly requested. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..33565990164 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1009,8 +1009,9 @@ static int device_change_notifier(struct notifier_block *nb, if (!dma_domain) dma_domain = iommu->default_dom; attach_device(iommu, &dma_domain->domain, devid); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", dma_domain->domain.id, dev_name(dev)); + DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " + "%d for device %s\n", + dma_domain->domain.id, dev_name(dev)); break; case BUS_NOTIFY_UNBIND_DRIVER: if (!domain) @@ -1133,8 +1134,9 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", (*domain)->id, dev_name(dev)); + DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " + "%d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) From 2be69c79e9a46a554fc3ff57d886e65e7a73eb72 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:15:49 +0200 Subject: [PATCH 653/900] x86/iommu: add IOMMU_STRESS Kconfig entry This Kconfig option is intended to enable various code paths or parameters in IOMMU implementations to stress test the code and/or the hardware. This can also be done by disabling optimizations in the code when this option is switched on. 
Signed-off-by: Joerg Roedel Cc: David Woodhouse Cc: FUJITA Tomonori --- arch/x86/Kconfig.debug | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 5865712d105..33fac6bbe1c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -159,6 +159,14 @@ config IOMMU_DEBUG options. See Documentation/x86_64/boot-options.txt for more details. +config IOMMU_STRESS + bool "Enable IOMMU stress-test mode" + ---help--- + This option disables various optimizations in IOMMU related + code to do real stress testing of the IOMMU code. This option + will cause a performance drop and should only be enabled for + testing. + config IOMMU_LEAK bool "IOMMU leak tracing" depends on IOMMU_DEBUG && DMA_API_DEBUG From 2e8b569614b89c9b1b85cba37db36daeeeff744e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:44:03 +0200 Subject: [PATCH 654/900] amd-iommu: disable device isolation with CONFIG_IOMMU_STRESS With device isolation disabled we can test better for race conditions in dma_ops related code. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b90a78cfdcb..66941129e9c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -124,8 +124,14 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ + +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ +#endif + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the From 421f909c803d1c397f6c66b75653f238696c39ee Mon Sep 17 00:00:00 2001 From: Neil Turton Date: Thu, 14 May 2009 14:00:35 +0100 Subject: [PATCH 655/900] amd-iommu: fix an off-by-one error in the AMD IOMMU driver. The variable amd_iommu_last_bdf holds the maximum bdf of any device controlled by an IOMMU, so the number of device entries needed is amd_iommu_last_bdf+1. The function tbl_size used amd_iommu_last_bdf instead. This would be a problem if the last device were a large enough power of 2. 
[ Impact: fix amd_iommu_last_bdf off-by-one error ] Signed-off-by: Neil Turton Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..35fc9654c7a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -175,7 +175,7 @@ static inline void update_last_devid(u16 devid) static inline unsigned long tbl_size(int entry_size) { unsigned shift = PAGE_SHIFT + - get_order(amd_iommu_last_bdf * entry_size); + get_order(((int)amd_iommu_last_bdf + 1) * entry_size); return 1UL << shift; } From 7455aab1f95f6464c5af3fbdee28744e73f38564 Mon Sep 17 00:00:00 2001 From: Neil Turton Date: Thu, 14 May 2009 14:08:11 +0100 Subject: [PATCH 656/900] amd-iommu: fix the handling of device aliases in the AMD IOMMU driver. The devid parameter to set_dev_entry_from_acpi is the requester ID rather than the device ID since it is used to index the IOMMU device table. The handling of IVHD_DEV_ALIAS used to pass the device ID. This patch fixes it to pass the requester ID. 
[ Impact: fix setting the wrong req-id in acpi-table parsing ] Signed-off-by: Neil Turton Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 35fc9654c7a..53f93db54c4 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -618,7 +618,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: From 0bc252f430d6a3ac7836d40f00d0ae020593b11b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:48:05 +0200 Subject: [PATCH 657/900] amd-iommu: make sure only ivmd entries are parsed The bug never triggered. But it should be fixed to protect against broken ACPI tables in the future. [ Impact: protect against broken ivrs acpi table ] Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 53f93db54c4..a3a2b98bb39 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -906,6 +906,8 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: + kfree(e); + return 0; case ACPI_IVMD_TYPE: e->devid_start = e->devid_end = m->devid; break; From c1eee67b2d8464781f5868a34168df61e40e85a6 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 21 May 2009 00:56:58 -0700 Subject: [PATCH 658/900] amd iommu: properly detach from protection domain on ->remove Some drivers may use the dma api during ->remove which will cause a protection domain to get reattached to a device. Delay the detach until after the driver is completely unbound. 
[ joro: added a little merge helper ] [ Impact: fix too early device<->domain removal ] Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..d6898833c36 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -57,6 +57,10 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, static struct dma_ops_domain *find_protection_domain(u16 devid); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1012,7 +1016,7 @@ static int device_change_notifier(struct notifier_block *nb, printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " "device %s\n", dma_domain->domain.id, dev_name(dev)); break; - case BUS_NOTIFY_UNBIND_DRIVER: + case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; detach_device(domain, devid); From 3bd221724adb9d642270df0e78b0105fb61e4a1c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 15:06:20 +0200 Subject: [PATCH 659/900] amd-iommu: introduce for_each_iommu* macros This patch introduces the for_each_iommu and for_each_iommu_safe macros to simplify the developer's life when having to iterate over all AMD IOMMUs in the system.
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 8 ++++++++ arch/x86/kernel/amd_iommu.c | 8 ++++---- arch/x86/kernel/amd_iommu_init.c | 8 ++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 95c8cd9d22b..cf5ef172cfc 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -195,6 +195,14 @@ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ +/* + * Make iterating over all IOMMUs easier + */ +#define for_each_iommu(iommu) \ + list_for_each_entry((iommu), &amd_iommu_list, list) +#define for_each_iommu_safe(iommu, next) \ + list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list) + /* * This structure contains generic data for IOMMU protection domains * independent of their use. diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..d9e9dc141a1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -213,7 +213,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) + for_each_iommu(iommu) iommu_poll_events(iommu); return IRQ_HANDLED; @@ -440,7 +440,7 @@ static void iommu_flush_domain(u16 domid) __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, domid, 1, 1); - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { spin_lock_irqsave(&iommu->lock, flags); __iommu_queue_command(iommu, &cmd); __iommu_completion_wait(iommu); @@ -1672,7 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) * found in the system. Devices not assigned to any other * protection domain will be assigned to the default one. 
*/ - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; @@ -1710,7 +1710,7 @@ int __init amd_iommu_init_dma_ops(void) free_domains: - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { if (iommu->default_dom) dma_ops_domain_free(iommu->default_dom); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..675a4b642f7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -679,7 +679,7 @@ static void __init free_iommu_all(void) { struct amd_iommu *iommu, *next; - list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + for_each_iommu_safe(iommu, next) { list_del(&iommu->list); free_iommu_one(iommu); kfree(iommu); @@ -779,7 +779,7 @@ static int __init iommu_setup_msix(struct amd_iommu *iommu) struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ int nvec = 0, i; - list_for_each_entry(curr, &amd_iommu_list, list) { + for_each_iommu(curr) { if (curr->dev == iommu->dev) { entries[nvec].entry = curr->evt_msi_num; entries[nvec].vector = 0; @@ -818,7 +818,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) int r; struct amd_iommu *curr; - list_for_each_entry(curr, &amd_iommu_list, list) { + for_each_iommu(curr) { if (curr->dev == iommu->dev) curr->int_enabled = true; } @@ -971,7 +971,7 @@ static void __init enable_iommus(void) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable_event_logging(iommu); From 58492e128892e3b55f1a6ef0cf3c3ab4ce7cc214 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:41:16 +0200 Subject: [PATCH 660/900] amd-iommu: consolidate hardware initialization to one function This patch restructures the AMD IOMMU initialization code to initialize all 
hardware registers with one single function call. This is helpful for suspend/resume support. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 50 ++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 675a4b642f7..74f4f1fea93 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,13 +252,6 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) -{ - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); -} - /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -413,25 +406,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - entry = (u64)virt_to_phys(cmd_buf); + return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. 
+ */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->cmd_buf == NULL); + + entry = (u64)virt_to_phys(iommu->cmd_buf); entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - - return cmd_buf; } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +447,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu) /* allocates the memory where the IOMMU will log its events to */ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) { - u64 entry; iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(EVT_BUFFER_SIZE)); if (iommu->evt_buf == NULL) return NULL; + return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->evt_buf == NULL); + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); - iommu->evt_buf_size = EVT_BUFFER_SIZE; - - return iommu->evt_buf; + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } static void __init free_event_buffer(struct amd_iommu *iommu) @@ -710,7 +721,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->mmio_base) return -ENOMEM; - iommu_set_device_table(iommu); iommu->cmd_buf = alloc_command_buffer(iommu); if (!iommu->cmd_buf) return -ENOMEM; @@ -837,6 +847,8 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + return 0; } @@ -972,9 +984,11 @@ static void __init enable_iommus(void) struct amd_iommu *iommu; for_each_iommu(iommu) { + iommu_set_device_table(iommu); + 
iommu_enable_command_buffer(iommu); + iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); - iommu_enable_event_logging(iommu); iommu_enable(iommu); } } From fab6afa30954a0684ef8ac1d9a606e74a6215ab6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:46:34 +0200 Subject: [PATCH 661/900] amd-iommu: drop pointless iommu-loop in msi setup code It is not necessary to loop again over all IOMMUs in this code. So drop the loop. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 74f4f1fea93..cc99f609223 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -826,13 +826,6 @@ out_free: static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; - struct amd_iommu *curr; - - for_each_iommu(curr) { - if (curr->dev == iommu->dev) - curr->int_enabled = true; - } - if (pci_enable_msi(iommu->dev)) return 1; @@ -847,6 +840,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu->int_enabled = true; iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); return 0; From d91cecdd796c27df46339e80ed436a980c56fcad Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:51:00 +0200 Subject: [PATCH 662/900] amd-iommu: remove support for msi-x Current hardware uses msi instead of msi-x so this code is not necessary and can not be tested. The best thing is to drop this code.
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 44 +------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index cc99f609223..feee475e626 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -783,46 +783,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ - struct amd_iommu *curr; - struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ - int nvec = 0, i; - - for_each_iommu(curr) { - if (curr->dev == iommu->dev) { - entries[nvec].entry = curr->evt_msi_num; - entries[nvec].vector = 0; - curr->int_enabled = true; - nvec++; - } - } - - if (pci_enable_msix(iommu->dev, entries, nvec)) { - pci_disable_msix(iommu->dev); - return 1; - } - - for (i = 0; i < nvec; ++i) { - int r = request_irq(entries->vector, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD IOMMU", - NULL); - if (r) - goto out_free; - } - - return 0; - -out_free: - for (i -= 1; i >= 0; --i) - free_irq(entries->vector, NULL); - - pci_disable_msix(iommu->dev); - - return 1; -} - static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -851,9 +811,7 @@ static int __init iommu_init_msi(struct amd_iommu *iommu) if (iommu->int_enabled) return 0; - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) - return iommu_setup_msix(iommu); - else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) return iommu_setup_msi(iommu); return 1; From 92ac4320af6ed4294c2c221dd4ccbfd9026a3aa7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 19:06:27 +0200 Subject: [PATCH 663/900] amd-iommu: add function to disable all iommus This function is required for suspend/resume support with AMD IOMMU enabled. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index feee475e626..ed10c0f5ff7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,6 +252,11 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +static void iommu_disable(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_IOMMU_EN); +} + /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -945,6 +950,14 @@ static void __init enable_iommus(void) } } +static void disable_iommus(void) +{ + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_disable(iommu); +} + /* * Suspend/Resume support * disable suspend until real resume implemented From bfd1be1857e5a3385bf146e02e6dc3dd4241bec1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 May 2009 15:33:57 +0200 Subject: [PATCH 664/900] amd-iommu: add function to flush tlb for all domains This function is required for suspend/resume support with AMD IOMMU enabled. 
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 1 + arch/x86/kernel/amd_iommu.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index f712344329b..1750e1f85d3 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -27,6 +27,7 @@ extern int amd_iommu_init(void); extern int amd_iommu_init_dma_ops(void); extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d9e9dc141a1..826ad079efc 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -449,6 +449,17 @@ static void iommu_flush_domain(u16 domid) } } +void amd_iommu_flush_all_domains(void) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + iommu_flush_domain(i); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for From 7d7a110c6127b7fc683dc6d764555f2dbd22b054 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 May 2009 15:48:10 +0200 Subject: [PATCH 665/900] amd-iommu: add function to flush tlb for all devices This function is required for suspend/resume support with AMD IOMMU enabled. 
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 1 + arch/x86/kernel/amd_iommu.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 1750e1f85d3..262e0282004 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -28,6 +28,7 @@ extern int amd_iommu_init_dma_ops(void); extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 826ad079efc..92b0e1881e0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -460,6 +460,24 @@ void amd_iommu_flush_all_domains(void) } } +void amd_iommu_flush_all_devices(void) +{ + struct amd_iommu *iommu; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] == NULL) + continue; + + iommu = amd_iommu_rlookup_table[i]; + if (!iommu) + continue; + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for From 05f92db9f47f852ff48bbed1b063b8ab8ad00285 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 09:52:46 +0200 Subject: [PATCH 666/900] amd_iommu: un __init functions required for suspend/resume This patch makes sure that no function required for suspend/resume of AMD IOMMU driver is thrown away after boot. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ed10c0f5ff7..330896ba6a9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -193,7 +193,7 @@ static inline unsigned long tbl_size(int entry_size) * This function set the exclusion range in the IOMMU. DMA accesses to the * exclusion range are passed through untranslated */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +225,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) } /* Generic functions to enable/disable certain features of the IOMMU. */ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -244,7 +244,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -811,7 +811,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 0; } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu) { if (iommu->int_enabled) return 0; @@ -936,7 +936,7 @@ static void init_device_table(void) * This function finally enables all IOMMUs found in the system after * they have been initialized */ -static void __init enable_iommus(void) +static void enable_iommus(void) { struct amd_iommu *iommu; From 736501ee000757082a4f0832826ae1eda7ea106e Mon Sep 17 00:00:00 
2001 From: Joerg Roedel Date: Tue, 12 May 2009 09:56:12 +0200 Subject: [PATCH 667/900] amd-iommu: implement suspend/resume This patch puts everything together and enables suspend/resume support in the AMD IOMMU driver. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 330896ba6a9..4ca8fbfb68d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -965,12 +965,31 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + /* + * Disable IOMMUs before reprogramming the hardware registers. + * IOMMU is still enabled from the resume kernel. + */ + disable_iommus(); + + /* re-load the hardware */ + enable_iommus(); + + /* + * we have to flush after the IOMMUs are enabled because a + * disabled IOMMU will never execute the commands we send + */ + amd_iommu_flush_all_domains(); + amd_iommu_flush_all_devices(); + return 0; } static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + /* disable IOMMUs to go out of the way for BIOS */ + disable_iommus(); + + return 0; } static struct sysdev_class amd_iommu_sysdev_class = { From c3239567a20e90e3026ac5453d5267506ef7b030 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 10:56:44 +0200 Subject: [PATCH 668/900] amd-iommu: introduce aperture_range structure This is a preparation for extended address allocator.
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 32 ++++++++++++------ arch/x86/kernel/amd_iommu.c | 46 ++++++++++++-------------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 95c8cd9d22b..4c64c9bc683 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -195,6 +195,8 @@ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ +#define APERTURE_RANGE_SIZE (128 * 1024 * 1024) + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -209,6 +211,24 @@ struct protection_domain { void *priv; /* private data */ }; +/* + * For dynamic growth the aperture size is split into ranges of 128MB of + * DMA address space each. This struct represents one such range. + */ +struct aperture_range { + + /* address allocation bitmap */ + unsigned long *bitmap; + + /* + * Array of PTE pages for the aperture. In this array we save all the + * leaf pages of the domain page table used for the aperture. This way + * we don't need to walk the page table to find a specific PTE. We can + * just calculate its address in constant time. + */ + u64 *pte_pages[64]; +}; + /* * Data container for a dma_ops specific protection domain */ @@ -224,16 +244,8 @@ struct dma_ops_domain { /* address we start to search for free addresses */ unsigned long next_bit; - /* address allocation bitmap */ - unsigned long *bitmap; - - /* - * Array of PTE pages for the aperture. In this array we save all the - * leaf pages of the domain page table used for the aperture. This way - * we don't need to walk the page table to find a specific PTE. We can - * just calculate its address in constant time. 
- */ - u64 **pte_pages; + /* address space relevant data */ + struct aperture_range aperture; /* This will be set to true when TLB needs to be flushed */ bool need_flush; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..62acd09cd19 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -595,7 +595,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, * as allocated in the aperture */ if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + __set_bit(addr >> PAGE_SHIFT, + dma_dom->aperture.bitmap); } return 0; @@ -656,11 +657,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, dom->need_flush = true; } - address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, align_mask); + address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit, + pages, 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, align_mask); + address = iommu_area_alloc(dom->aperture.bitmap, limit, 0, + pages, 0, boundary_size, + align_mask); dom->need_flush = true; } @@ -685,7 +687,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned int pages) { address >>= PAGE_SHIFT; - iommu_area_free(dom->bitmap, address, pages); + iommu_area_free(dom->aperture.bitmap, address, pages); if (address >= dom->next_bit) dom->need_flush = true; @@ -741,7 +743,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->aperture.bitmap, start_page, pages); } static void free_pagetable(struct protection_domain *domain) @@ -785,9 +787,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) free_pagetable(&dom->domain); - kfree(dom->pte_pages); - - kfree(dom->bitmap); + free_page((unsigned 
long)dom->aperture.bitmap); kfree(dom); } @@ -826,16 +826,15 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = (1ULL << order); - dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), - GFP_KERNEL); - if (!dma_dom->bitmap) + dma_dom->aperture_size = APERTURE_RANGE_SIZE; + dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->aperture.bitmap) goto free_dma_dom; /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value */ - dma_dom->bitmap[0] = 1; + dma_dom->aperture.bitmap[0] = 1; dma_dom->next_bit = 0; dma_dom->need_flush = false; @@ -854,13 +853,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, /* * At the last step, build the page tables so we don't need to * allocate page table pages in the dma_ops mapping/unmapping - * path. + * path for the first 128MB of dma address space. 
*/ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), - GFP_KERNEL); - if (!dma_dom->pte_pages) - goto free_dma_dom; l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); if (l2_pde == NULL) @@ -869,10 +864,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); for (i = 0; i < num_pte_pages; ++i) { - dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->pte_pages[i]) + u64 **pte_page = &dma_dom->aperture.pte_pages[i]; + *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!*pte_page) goto free_dma_dom; - address = virt_to_phys(dma_dom->pte_pages[i]); + address = virt_to_phys(*pte_page); l2_pde[i] = IOMMU_L1_PDE(address); } @@ -1159,7 +1155,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1192,7 +1188,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); From 8bda3092bcfa68f786d94549ae026e8db1eff041 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 12:02:46 +0200 Subject: [PATCH 669/900] amd-iommu: move page table allocation code to seperate function This patch makes page table allocation usable for dma_ops code. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 ++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 62acd09cd19..ded79f7747c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,9 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); - +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 + **pte_page, gfp_t gfp); #ifdef CONFIG_AMD_IOMMU_STATS @@ -468,7 +470,7 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long phys_addr, int prot) { - u64 __pte, *pte, *page; + u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +479,7 @@ static int iommu_map_page(struct protection_domain *dom, if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -1139,6 +1121,61 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 **pte_page, gfp_t gfp) +{ + u64 *pte, *page; + + pte = 
&dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page) + *pte_page = pte; + + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, + unsigned long address) +{ + struct aperture_range *aperture = &dom->aperture; + u64 *pte, *pte_page; + + pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + if (!pte) { + pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page; + } + + return pte; +} + /* * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. @@ -1155,8 +1192,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; - pte += IOMMU_PTE_L0_INDEX(address); + pte = dma_ops_get_pte(dom, address); __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; From 53812c115cda1f660b286c939669154a56976f6b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 12:17:38 +0200 Subject: [PATCH 670/900] amd-iommu: handle page table allocation failures in dma_ops code The code will be required when the aperture size increases dynamically in the extended address allocator. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ded79f7747c..a467addb44b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1193,6 +1193,8 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; pte = dma_ops_get_pte(dom, address); + if (!pte) + return bad_dma_address; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1248,7 +1250,7 @@ static dma_addr_t __map_single(struct device *dev, u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start; + dma_addr_t address, start, ret; unsigned int pages; unsigned long align_mask = 0; int i; @@ -1271,7 +1273,10 @@ static dma_addr_t __map_single(struct device *dev, start = address; for (i = 0; i < pages; ++i) { - dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + if (ret == bad_dma_address) + goto out_unmap; + paddr += PAGE_SIZE; start += PAGE_SIZE; } @@ -1287,6 +1292,17 @@ static dma_addr_t __map_single(struct device *dev, out: return address; + +out_unmap: + + for (--i; i >= 0; --i) { + start -= PAGE_SIZE; + dma_ops_domain_unmap(iommu, dma_dom, start); + } + + dma_ops_free_addresses(dma_dom, address, pages); + + return bad_dma_address; } /* From 384de72910a7bf96a02a6d8023fe9e16d872beb2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 15 May 2009 12:30:05 +0200 Subject: [PATCH 671/900] amd-iommu: make address allocator aware of multiple aperture ranges This patch changes the AMD IOMMU address allocator to allow up to 32 aperture ranges per dma_ops domain. 
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 11 +- arch/x86/kernel/amd_iommu.c | 138 ++++++++++++++++++------- 2 files changed, 110 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 4c64c9bc683..eca912931a8 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -195,7 +195,12 @@ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ -#define APERTURE_RANGE_SIZE (128 * 1024 * 1024) +#define APERTURE_RANGE_SHIFT 27 /* 128 MB */ +#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT) +#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT) +#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */ +#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) +#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) /* * This structure contains generic data for IOMMU protection domains @@ -227,6 +232,8 @@ struct aperture_range { * just calculate its address in constant time. 
*/ u64 *pte_pages[64]; + + unsigned long offset; }; /* @@ -245,7 +252,7 @@ struct dma_ops_domain { unsigned long next_bit; /* address space relevant data */ - struct aperture_range aperture; + struct aperture_range *aperture[APERTURE_MAX_RANGES]; /* This will be set to true when TLB needs to be flushed */ bool need_flush; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a467addb44b..794163ae97b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -578,7 +578,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, */ if (addr < dma_dom->aperture_size) __set_bit(addr >> PAGE_SHIFT, - dma_dom->aperture.bitmap); + dma_dom->aperture[0]->bitmap); } return 0; @@ -615,43 +615,74 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ /* - * The address allocator core function. + * The address allocator core functions. * * called with domain->lock held */ + +static unsigned long dma_ops_area_alloc(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages, + unsigned long align_mask, + u64 dma_mask, + unsigned long start) +{ + unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES; + int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i = start >> APERTURE_RANGE_SHIFT; + unsigned long boundary_size; + unsigned long address = -1; + unsigned long limit; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + for (;i < max_index; ++i) { + unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + + if (dom->aperture[i]->offset >= dma_mask) + break; + + limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, + dma_mask >> PAGE_SHIFT); + + address = iommu_area_alloc(dom->aperture[i]->bitmap, + limit, next_bit, pages, 0, + boundary_size, align_mask); + if (address != -1) { + address = dom->aperture[i]->offset + + (address << PAGE_SHIFT); + 
dom->next_bit = (address >> PAGE_SHIFT) + pages; + break; + } + + next_bit = 0; + } + + return address; +} + static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, unsigned long align_mask, u64 dma_mask) { - unsigned long limit; unsigned long address; - unsigned long boundary_size; + unsigned long start = dom->next_bit << PAGE_SHIFT; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, - dma_mask >> PAGE_SHIFT); - if (dom->next_bit >= limit) { - dom->next_bit = 0; - dom->need_flush = true; - } + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, start); - address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit, - pages, 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->aperture.bitmap, limit, 0, - pages, 0, boundary_size, - align_mask); + dom->next_bit = 0; + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, 0); dom->need_flush = true; } - if (likely(address != -1)) { - dom->next_bit = address + pages; - address <<= PAGE_SHIFT; - } else + if (unlikely(address == -1)) address = bad_dma_address; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -668,11 +699,17 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) { - address >>= PAGE_SHIFT; - iommu_area_free(dom->aperture.bitmap, address, pages); + unsigned i = address >> APERTURE_RANGE_SHIFT; + struct aperture_range *range = dom->aperture[i]; - if (address >= dom->next_bit) + BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); + + if ((address >> PAGE_SHIFT) >= dom->next_bit) dom->need_flush = true; + + address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + iommu_area_free(range->bitmap, address, pages); + } /**************************************************************************** @@ -720,12 
+757,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) { - unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->aperture.bitmap, start_page, pages); + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } } static void free_pagetable(struct protection_domain *domain) @@ -764,12 +805,19 @@ static void free_pagetable(struct protection_domain *domain) */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { + int i; + if (!dom) return; free_pagetable(&dom->domain); - free_page((unsigned long)dom->aperture.bitmap); + for (i = 0; i < APERTURE_MAX_RANGES; ++i) { + if (!dom->aperture[i]) + continue; + free_page((unsigned long)dom->aperture[i]->bitmap); + kfree(dom->aperture[i]); + } kfree(dom); } @@ -797,6 +845,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom) return NULL; + dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range), + GFP_KERNEL); + if (!dma_dom->aperture[0]) + goto free_dma_dom; + spin_lock_init(&dma_dom->domain.lock); dma_dom->domain.id = domain_id_alloc(); @@ -809,14 +862,14 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom->domain.pt_root) goto free_dma_dom; dma_dom->aperture_size = APERTURE_RANGE_SIZE; - dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->aperture.bitmap) + dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->aperture[0]->bitmap) goto free_dma_dom; /* * mark the first page as allocated so we never return 0 as * a valid dma-address. 
So we can use 0 as error value */ - dma_dom->aperture.bitmap[0] = 1; + dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_bit = 0; dma_dom->need_flush = false; @@ -846,7 +899,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); for (i = 0; i < num_pte_pages; ++i) { - u64 **pte_page = &dma_dom->aperture.pte_pages[i]; + u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i]; *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); if (!*pte_page) goto free_dma_dom; @@ -1164,14 +1217,19 @@ static u64* alloc_pte(struct protection_domain *dom, static u64* dma_ops_get_pte(struct dma_ops_domain *dom, unsigned long address) { - struct aperture_range *aperture = &dom->aperture; + struct aperture_range *aperture; u64 *pte, *pte_page; - pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return NULL; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); - aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page; - } + aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; + } else + pte += IOMMU_PTE_L0_INDEX(address); return pte; } @@ -1219,14 +1277,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) { + struct aperture_range *aperture; u64 *pte; if (address >= dom->aperture_size) return; - WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) + return; - pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); From 803b8cb4d9a93b90c67aba2aab7f2c54d595b5b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 May 2009 15:32:48 +0200 
Subject: [PATCH 672/900] amd-iommu: change dma_dom->next_bit to dma_dom->next_address Simplify the code a little bit by using the same unit for all address space related state in the dma_ops domain structure. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 +- arch/x86/kernel/amd_iommu.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index eca912931a8..4ff4cf1f080 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -249,7 +249,7 @@ struct dma_ops_domain { unsigned long aperture_size; /* address we start to search for free addresses */ - unsigned long next_bit; + unsigned long next_address; /* address space relevant data */ struct aperture_range *aperture[APERTURE_MAX_RANGES]; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 794163ae97b..c1a08b9119c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -627,13 +627,15 @@ static unsigned long dma_ops_area_alloc(struct device *dev, u64 dma_mask, unsigned long start) { - unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES; + unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; int i = start >> APERTURE_RANGE_SHIFT; unsigned long boundary_size; unsigned long address = -1; unsigned long limit; + next_bit >>= PAGE_SHIFT; + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; @@ -652,7 +654,7 @@ static unsigned long dma_ops_area_alloc(struct device *dev, if (address != -1) { address = dom->aperture[i]->offset + (address << PAGE_SHIFT); - dom->next_bit = (address >> PAGE_SHIFT) + pages; + dom->next_address = address + (pages << PAGE_SHIFT); break; } @@ -669,14 +671,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, u64 dma_mask) { unsigned long address; - 
unsigned long start = dom->next_bit << PAGE_SHIFT; - address = dma_ops_area_alloc(dev, dom, pages, align_mask, - dma_mask, start); + dma_mask, dom->next_address); if (address == -1) { - dom->next_bit = 0; + dom->next_address = 0; address = dma_ops_area_alloc(dev, dom, pages, align_mask, dma_mask, 0); dom->need_flush = true; @@ -704,10 +704,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - if ((address >> PAGE_SHIFT) >= dom->next_bit) + if (address >= dom->next_address) dom->need_flush = true; address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + iommu_area_free(range->bitmap, address, pages); } @@ -870,7 +871,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, * a valid dma-address. So we can use 0 as error value */ dma_dom->aperture[0]->bitmap[0] = 1; - dma_dom->next_bit = 0; + dma_dom->next_address = 0; dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; From 9cabe89b99773e682538a8809abc7d4000c77083 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 May 2009 16:38:55 +0200 Subject: [PATCH 673/900] amd-iommu: move aperture_range allocation code to seperate function This patch prepares the dynamic increasement of dma_ops domain apertures. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 95 +++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c1a08b9119c..8ff02ee69e8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -620,6 +620,59 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. 
+ */ +static int alloc_new_range(struct dma_ops_domain *dma_dom, + bool populate, gfp_t gfp) +{ + int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + + if (index >= APERTURE_MAX_RANGES) + return -ENOMEM; + + dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); + if (!dma_dom->aperture[index]) + return -ENOMEM; + + dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); + if (!dma_dom->aperture[index]->bitmap) + goto out_free; + + dma_dom->aperture[index]->offset = dma_dom->aperture_size; + + if (populate) { + unsigned long address = dma_dom->aperture_size; + int i, num_ptes = APERTURE_RANGE_PAGES / 512; + u64 *pte, *pte_page; + + for (i = 0; i < num_ptes; ++i) { + pte = alloc_pte(&dma_dom->domain, address, + &pte_page, gfp); + if (!pte) + goto out_free; + + dma_dom->aperture[index]->pte_pages[i] = pte_page; + + address += APERTURE_RANGE_SIZE / 64; + } + } + + dma_dom->aperture_size += APERTURE_RANGE_SIZE; + + return 0; + +out_free: + free_page((unsigned long)dma_dom->aperture[index]->bitmap); + + kfree(dma_dom->aperture[index]); + dma_dom->aperture[index] = NULL; + + return -ENOMEM; +} + static unsigned long dma_ops_area_alloc(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, @@ -832,9 +885,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { struct dma_ops_domain *dma_dom; - unsigned i, num_pte_pages; - u64 *l2_pde; - u64 address; /* * Currently the DMA aperture must be between 32 MB and 1GB in size @@ -846,11 +896,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom) return NULL; - dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range), - GFP_KERNEL); - if (!dma_dom->aperture[0]) - goto free_dma_dom; - spin_lock_init(&dma_dom->domain.lock); dma_dom->domain.id = domain_id_alloc(); @@ -862,10 +907,13 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if 
(!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = APERTURE_RANGE_SIZE; - dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->aperture[0]->bitmap) + + dma_dom->need_flush = false; + dma_dom->target_dev = 0xffff; + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; + /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value @@ -873,9 +921,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_address = 0; - dma_dom->need_flush = false; - dma_dom->target_dev = 0xffff; - /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { @@ -886,28 +931,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_ops_reserve_addresses(dma_dom, startpage, pages); } - /* - * At the last step, build the page tables so we don't need to - * allocate page table pages in the dma_ops mapping/unmapping - * path for the first 128MB of dma address space. 
- */ - num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - - l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); - if (l2_pde == NULL) - goto free_dma_dom; - - dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - - for (i = 0; i < num_pte_pages; ++i) { - u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i]; - *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!*pte_page) - goto free_dma_dom; - address = virt_to_phys(*pte_page); - l2_pde[i] = IOMMU_L1_PDE(address); - } - return dma_dom; free_dma_dom: From 00cd122ae5e5e7c60cce2af3c35b190d4c3f2d0d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 09:52:40 +0200 Subject: [PATCH 674/900] amd-iommu: handle exlusion ranges and unity mappings in alloc_new_range This patch makes sure no reserved addresses are allocated in a dma_ops domain when the aperture is increased dynamically. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 71 +++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8ff02ee69e8..59ee1b94a7c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -58,6 +58,9 @@ static struct dma_ops_domain *find_protection_domain(u16 devid); static u64* alloc_pte(struct protection_domain *dom, unsigned long address, u64 **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages); #ifdef CONFIG_AMD_IOMMU_STATS @@ -620,15 +623,43 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it.
+ */ +static u64* fetch_pte(struct protection_domain *domain, + unsigned long address) +{ + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + /* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i; if (index >= APERTURE_MAX_RANGES) return -ENOMEM; @@ -662,6 +693,33 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, dma_dom->aperture_size += APERTURE_RANGE_SIZE; + /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + /* + * Check for areas already mapped as present in the new aperture + * range and mark those pages as reserved in the allocator. Such + * mappings may already exist as a result of requested unity + * mappings for devices. 
+ */ + for (i = dma_dom->aperture[index]->offset; + i < dma_dom->aperture_size; + i += PAGE_SIZE) { + u64 *pte = fetch_pte(&dma_dom->domain, i); + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + continue; + + dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + } + return 0; out_free: @@ -911,7 +969,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(dma_dom, true, GFP_KERNEL)) + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -921,15 +979,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_address = 0; - /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } return dma_dom; From 11b83888ae729457b5cfb936dbd498481f6408df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 10:23:15 +0200 Subject: [PATCH 675/900] amd-iommu: enlarge the aperture dynamically By dynamically increasing the aperture the extended allocator is now ready for use. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 59ee1b94a7c..d129d8feba0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1403,10 +1403,26 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; +retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) - goto out; + if (unlikely(address == bad_dma_address)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the + * first run. This is a small optimization. + */ + dma_dom->next_address = dma_dom->aperture_size; + + if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + goto out; + + /* + * aperture was sucessfully enlarged by 128 MB, try + * allocation again + */ + goto retry; + } start = address; for (i = 0; i < pages; ++i) { From d9cfed925448f097ec7faab80d903eb7e5f99712 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 12:16:29 +0200 Subject: [PATCH 676/900] amd-iommu: remove amd_iommu_size kernel parameter This parameter is no longer necessary when aperture increases dynamically. Signed-off-by: Joerg Roedel --- Documentation/kernel-parameters.txt | 5 ----- arch/x86/kernel/amd_iommu.c | 18 ++++-------------- arch/x86/kernel/amd_iommu_init.c | 15 --------------- 3 files changed, 4 insertions(+), 34 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..5b776c6e796 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -329,11 +329,6 @@ and is between 256 and 4096 characters.
It is defined in the file flushed before they will be reused, which is a lot of faster - amd_iommu_size= [HW,X86-64] - Define the size of the aperture for the AMD IOMMU - driver. Possible values are: - '32M', '64M' (default), '128M', '256M', '512M', '1G' - amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d129d8feba0..31d56c36010 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -939,17 +939,10 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, - unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) { struct dma_ops_domain *dma_dom; - /* - * Currently the DMA aperture must be between 32 MB and 1GB in size - */ - if ((order < 25) || (order > 30)) - return NULL; - dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); if (!dma_dom) return NULL; @@ -1087,7 +1080,6 @@ static int device_change_notifier(struct notifier_block *nb, struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; unsigned long flags; if (devid > amd_iommu_last_bdf) @@ -1126,7 +1118,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu, order); + dma_domain = dma_ops_domain_alloc(iommu); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1826,7 +1818,6 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, 
dev)) != NULL) { @@ -1839,7 +1830,7 @@ static void prealloc_protection_domains(void) iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - dma_dom = dma_ops_domain_alloc(iommu, order); + dma_dom = dma_ops_domain_alloc(iommu); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -1865,7 +1856,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; int ret; /* @@ -1874,7 +1864,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ list_for_each_entry(iommu, &amd_iommu_list, list) { - iommu->default_dom = dma_ops_domain_alloc(iommu, order); + iommu->default_dom = dma_ops_domain_alloc(iommu); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..762a4eefec9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -121,7 +121,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ @@ -1137,9 +1136,6 @@ int __init amd_iommu_init(void) enable_iommus(); - printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", - (1 << (amd_iommu_aperture_order-20))); - printk(KERN_INFO "AMD IOMMU: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); @@ -1225,15 +1221,4 @@ static int __init parse_amd_iommu_options(char *str) return 1; } -static int __init parse_amd_iommu_size_options(char *str) -{ - unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - - if ((order > 24) && (order < 31)) - 
amd_iommu_aperture_order = order; - - return 1; -} - __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); From fe16f088a88fb73161bba8784375c829f7e87b54 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:27:53 +0200 Subject: [PATCH 677/900] amd-iommu: disable round-robin allocator for CONFIG_IOMMU_STRESS Disabling the round-robin allocator results in reusing the same dma-addresses again very fast. This is a good test if the iotlb flushing is working correctly. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 31d56c36010..543822b39a8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -783,6 +783,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, { unsigned long address; +#ifdef CONFIG_IOMMU_STRESS + dom->next_address = 0; + dom->need_flush = true; +#endif + address = dma_ops_area_alloc(dev, dom, pages, align_mask, dma_mask, dom->next_address); From f5e9705c6429d24dee832b2edd7f4848d432ea03 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:31:53 +0200 Subject: [PATCH 678/900] amd-iommu: don't preallocate page tables with CONFIG_IOMMU_STRESS This forces testing of on-demand page table allocation code. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 543822b39a8..33434c497a6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -661,6 +661,10 @@ static int alloc_new_range(struct amd_iommu *iommu, int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; int i; +#ifdef CONFIG_IOMMU_STRESS + populate = false; +#endif + if (index >= APERTURE_MAX_RANGES) return -ENOMEM; From 47bccd6bb2b866449d3ecf2ba350ac1c7473b2b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:40:54 +0200 Subject: [PATCH 679/900] amd-iommu: don't free dma adresses below 512MB with CONFIG_IOMMU_STRESS This will test the automatic aperture enlargement code. This is important because only very few devices will ever trigger this code path. So force it under CONFIG_IOMMU_STRESS. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 33434c497a6..04ff5ec4ac0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -824,6 +824,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); +#ifdef CONFIG_IOMMU_STRESS + if (i < 4) + return; +#endif + if (address >= dom->next_address) dom->need_flush = true; From a3ce6ea46cc0d6397d1b92b1a5983bb2935306ed Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 28 May 2009 09:51:31 -0700 Subject: [PATCH 680/900] Input: libps2 - better handle bad scheduler decisions Sometimes devices send us their responses in time but due to unfortunate scheduling decisions the receiving thread does not get scheduled till much later and we erroneously decide that device timed out. Work around this problem by checking whether we received the data we needed instead of checking timeout condition. 
Tested-by: Sitsofe Wheeler Signed-off-by: Dmitry Torokhov --- drivers/input/serio/libps2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 67248c31e19..be5bbbb8ae4 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -210,7 +210,7 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command) timeout = wait_event_timeout(ps2dev->wait, !(ps2dev->flags & PS2_FLAG_CMD1), timeout); - if (ps2dev->cmdcnt && timeout > 0) { + if (ps2dev->cmdcnt && !(ps2dev->flags & PS2_FLAG_CMD1)) { timeout = ps2_adjust_timeout(ps2dev, command, timeout); wait_event_timeout(ps2dev->wait, From bac4e960b5ce2453d862beaf20e59aa68af3b43a Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 25 May 2009 20:58:00 +0100 Subject: [PATCH 681/900] [ARM] barriers: improve xchg, bitops and atomic SMP barriers Mathieu Desnoyers pointed out that the ARM barriers were lacking: - cmpxchg, xchg and atomic add return need memory barriers on architectures which can reorder the relative order in which memory read/writes can be seen between CPUs, which seems to include recent ARM architectures. Those barriers are currently missing on ARM. - test_and_xxx_bit were missing SMP barriers. So put these barriers in. Provide separate atomic_add/atomic_sub operations which do not require barriers. 
Reported-Reviewed-and-Acked-by: Mathieu Desnoyers Signed-off-by: Russell King --- arch/arm/include/asm/assembler.h | 13 +++++++ arch/arm/include/asm/atomic.h | 61 +++++++++++++++++++++++++++----- arch/arm/include/asm/system.h | 3 ++ arch/arm/kernel/entry-armv.S | 5 +-- arch/arm/lib/bitops.h | 2 ++ 5 files changed, 71 insertions(+), 13 deletions(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 6116e4893c0..15f8a092b70 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -114,3 +114,16 @@ .align 3; \ .long 9999b,9001f; \ .previous + +/* + * SMP data memory barrier + */ + .macro smp_dmb +#ifdef CONFIG_SMP +#if __LINUX_ARM_ARCH__ >= 7 + dmb +#elif __LINUX_ARM_ARCH__ == 6 + mcr p15, 0, r0, c7, c10, 5 @ dmb +#endif +#endif + .endm diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index ee99723b3a6..16b52f39798 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -44,11 +44,29 @@ static inline void atomic_set(atomic_t *v, int i) : "cc"); } +static inline void atomic_add(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + __asm__ __volatile__("@ atomic_add\n" +"1: ldrex %0, [%2]\n" +" add %0, %0, %3\n" +" strex %1, %0, [%2]\n" +" teq %1, #0\n" +" bne 1b" + : "=&r" (result), "=&r" (tmp) + : "r" (&v->counter), "Ir" (i) + : "cc"); +} + static inline int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; int result; + smp_mb(); + __asm__ __volatile__("@ atomic_add_return\n" "1: ldrex %0, [%2]\n" " add %0, %0, %3\n" @@ -59,14 +77,34 @@ static inline int atomic_add_return(int i, atomic_t *v) : "r" (&v->counter), "Ir" (i) : "cc"); + smp_mb(); + return result; } +static inline void atomic_sub(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + __asm__ __volatile__("@ atomic_sub\n" +"1: ldrex %0, [%2]\n" +" sub %0, %0, %3\n" +" strex %1, %0, [%2]\n" +" teq %1, #0\n" +" bne 1b" + : "=&r" (result), "=&r" (tmp) + : "r" 
(&v->counter), "Ir" (i) + : "cc"); +} + static inline int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; + smp_mb(); + __asm__ __volatile__("@ atomic_sub_return\n" "1: ldrex %0, [%2]\n" " sub %0, %0, %3\n" @@ -77,6 +115,8 @@ static inline int atomic_sub_return(int i, atomic_t *v) : "r" (&v->counter), "Ir" (i) : "cc"); + smp_mb(); + return result; } @@ -84,6 +124,8 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) { unsigned long oldval, res; + smp_mb(); + do { __asm__ __volatile__("@ atomic_cmpxchg\n" "ldrex %1, [%2]\n" @@ -95,6 +137,8 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) : "cc"); } while (res); + smp_mb(); + return oldval; } @@ -135,6 +179,7 @@ static inline int atomic_add_return(int i, atomic_t *v) return val; } +#define atomic_add(i, v) (void) atomic_add_return(i, v) static inline int atomic_sub_return(int i, atomic_t *v) { @@ -148,6 +193,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) return val; } +#define atomic_sub(i, v) (void) atomic_sub_return(i, v) static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { @@ -187,10 +233,8 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) } #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -#define atomic_add(i, v) (void) atomic_add_return(i, v) -#define atomic_inc(v) (void) atomic_add_return(1, v) -#define atomic_sub(i, v) (void) atomic_sub_return(i, v) -#define atomic_dec(v) (void) atomic_sub_return(1, v) +#define atomic_inc(v) atomic_add(1, v) +#define atomic_dec(v) atomic_sub(1, v) #define atomic_inc_and_test(v) (atomic_add_return(1, v) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) @@ -200,11 +244,10 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0) -/* Atomic operations are already serializing on ARM */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() 
-#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() +#define smp_mb__before_atomic_dec() smp_mb() +#define smp_mb__after_atomic_dec() smp_mb() +#define smp_mb__before_atomic_inc() smp_mb() +#define smp_mb__after_atomic_inc() smp_mb() #include #endif diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index bd4dc8ed53d..7fce8f3b391 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -248,6 +248,8 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size unsigned int tmp; #endif + smp_mb(); + switch (size) { #if __LINUX_ARM_ARCH__ >= 6 case 1: @@ -307,6 +309,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size __bad_xchg(ptr, size), ret = 0; break; } + smp_mb(); return ret; } diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index d662a2f1fd8..83b1da6b7ba 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -815,10 +815,7 @@ __kuser_helper_start: */ __kuser_memory_barrier: @ 0xffff0fa0 - -#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_SMP) - mcr p15, 0, r0, c7, c10, 5 @ dmb -#endif + smp_dmb usr_ret lr .align 5 diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index 2e787d40d59..c7f2627385e 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -18,12 +18,14 @@ mov r2, #1 add r1, r1, r0, lsr #3 @ Get byte offset mov r3, r2, lsl r3 @ create mask + smp_dmb 1: ldrexb r2, [r1] ands r0, r2, r3 @ save old value of bit \instr r2, r2, r3 @ toggle bit strexb ip, r2, [r1] cmp ip, #0 bne 1b + smp_dmb cmp r0, #0 movne r0, #1 2: mov pc, lr From ecd322c9b3e4ac70f9f108badde3eb6b99c7993d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 28 May 2009 16:07:39 -0400 Subject: [PATCH 682/900] [ARM] Add cmpxchg support for ARMv6+ systems (v5) Add cmpxchg/cmpxchg64 support for ARMv6K and ARMv7 systems (original patch from Catalin Marinas ) The cmpxchg and 
cmpxchg64 functions can be implemented using the LDREX*/STREX* instructions. Since operand lengths other than 32bit are required, the full implementations are only available if the ARMv6K extensions are present (for the LDREXB, LDREXH and LDREXD instructions). For ARMv6, only 32-bits cmpxchg is available. Mathieu : Make cmpxchg_local always available with best implementation for all type sizes (1, 2, 4 bytes). Make cmpxchg64_local always available. Use "Ir" constraint for "old" operand, like atomic.h atomic_cmpxchg does. Change since v3 : - Add "memory" clobbers (thanks to Nicolas Pitre) - removed __asmeq(), only needed for old compilers, very unlikely on ARMv6+. Note : ARMv7-M should eventually be ifdefed-out of cmpxchg64. But it's not supported by the Linux kernel currently. Put back arm < v6 cmpxchg support. Signed-off-by: Mathieu Desnoyers CC: Catalin Marinas CC: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/include/asm/system.h | 173 ++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 7fce8f3b391..d65b2f5bf41 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -319,6 +319,12 @@ extern void enable_hlt(void); #include +#if __LINUX_ARM_ARCH__ < 6 + +#ifdef CONFIG_SMP +#error "SMP is not supported on this platform" +#endif + /* * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make * them available. @@ -332,6 +338,173 @@ extern void enable_hlt(void); #include #endif +#else /* __LINUX_ARM_ARCH__ >= 6 */ + +extern void __bad_cmpxchg(volatile void *ptr, int size); + +/* + * cmpxchg only support 32-bits operands on ARMv6. 
+ */ + +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + unsigned long oldval, res; + + switch (size) { +#ifdef CONFIG_CPU_32v6K + case 1: + do { + asm volatile("@ __cmpxchg1\n" + " ldrexb %1, [%2]\n" + " mov %0, #0\n" + " teq %1, %3\n" + " strexbeq %0, %4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (ptr), "Ir" (old), "r" (new) + : "memory", "cc"); + } while (res); + break; + case 2: + do { + asm volatile("@ __cmpxchg1\n" + " ldrexh %1, [%2]\n" + " mov %0, #0\n" + " teq %1, %3\n" + " strexheq %0, %4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (ptr), "Ir" (old), "r" (new) + : "memory", "cc"); + } while (res); + break; +#endif /* CONFIG_CPU_32v6K */ + case 4: + do { + asm volatile("@ __cmpxchg4\n" + " ldrex %1, [%2]\n" + " mov %0, #0\n" + " teq %1, %3\n" + " strexeq %0, %4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (ptr), "Ir" (old), "r" (new) + : "memory", "cc"); + } while (res); + break; + default: + __bad_cmpxchg(ptr, size); + oldval = 0; + } + + return oldval; +} + +static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + unsigned long ret; + + smp_mb(); + ret = __cmpxchg(ptr, old, new, size); + smp_mb(); + + return ret; +} + +#define cmpxchg(ptr,o,n) \ + ((__typeof__(*(ptr)))__cmpxchg_mb((ptr), \ + (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) + +static inline unsigned long __cmpxchg_local(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long ret; + + switch (size) { +#ifndef CONFIG_CPU_32v6K + case 1: + case 2: + ret = __cmpxchg_local_generic(ptr, old, new, size); + break; +#endif /* !CONFIG_CPU_32v6K */ + default: + ret = __cmpxchg(ptr, old, new, size); + } + + return ret; +} + +#define cmpxchg_local(ptr,o,n) \ + ((__typeof__(*(ptr)))__cmpxchg_local((ptr), \ + (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) + +#ifdef CONFIG_CPU_32v6K + +/* + * Note : 
ARMv7-M (currently unsupported by Linux) does not support + * ldrexd/strexd. If ARMv7-M is ever supported by the Linux kernel, it should + * not be allowed to use __cmpxchg64. + */ +static inline unsigned long long __cmpxchg64(volatile void *ptr, + unsigned long long old, + unsigned long long new) +{ + register unsigned long long oldval asm("r0"); + register unsigned long long __old asm("r2") = old; + register unsigned long long __new asm("r4") = new; + unsigned long res; + + do { + asm volatile( + " @ __cmpxchg8\n" + " ldrexd %1, %H1, [%2]\n" + " mov %0, #0\n" + " teq %1, %3\n" + " teqeq %H1, %H3\n" + " strexdeq %0, %4, %H4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (ptr), "Ir" (__old), "r" (__new) + : "memory", "cc"); + } while (res); + + return oldval; +} + +static inline unsigned long long __cmpxchg64_mb(volatile void *ptr, + unsigned long long old, + unsigned long long new) +{ + unsigned long long ret; + + smp_mb(); + ret = __cmpxchg64(ptr, old, new); + smp_mb(); + + return ret; +} + +#define cmpxchg64(ptr,o,n) \ + ((__typeof__(*(ptr)))__cmpxchg64_mb((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n))) + +#define cmpxchg64_local(ptr,o,n) \ + ((__typeof__(*(ptr)))__cmpxchg64((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n))) + +#else /* !CONFIG_CPU_32v6K */ + +#define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) + +#endif /* CONFIG_CPU_32v6K */ + +#endif /* __LINUX_ARM_ARCH__ >= 6 */ + #endif /* __ASSEMBLY__ */ #define arch_align_stack(x) (x) From f42706c90470851fd2e97eda7a4109e8949bde8a Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Fri, 1 May 2009 17:21:11 +0200 Subject: [PATCH 683/900] USB: atmel-usba-udc : fix control out requests. usbtest #14 was failing with "udc: ep0: TXCOMP: Invalid endpoint state 2, halting endpoint..." 
This occurred since ep0 is bidirectional and ep->is_in is not valid (must always use ep->state) Signed-off-by: Martin Fuzzey Acked-by: David Brownell Acked-by: Haavard Skinnemoen Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/atmel_usba_udc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c index 563d5727544..5644897c1c5 100644 --- a/drivers/usb/gadget/atmel_usba_udc.c +++ b/drivers/usb/gadget/atmel_usba_udc.c @@ -794,7 +794,8 @@ usba_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) if (ep->desc) { list_add_tail(&req->queue, &ep->queue); - if (ep->is_in || (ep_is_control(ep) + if ((!ep_is_control(ep) && ep->is_in) || + (ep_is_control(ep) && (ep->state == DATA_STAGE_IN || ep->state == STATUS_STAGE_IN))) usba_ep_writel(ep, CTL_ENB, USBA_TX_PK_RDY); From fe92c9e481a147a9e1e14f55870f32903b967777 Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Mon, 4 May 2009 17:22:43 -0700 Subject: [PATCH 684/900] USB: atmel_usb_udc: Use kzalloc() to allocate ep structures This ensures that all fields are properly initialized.
Signed-off-by: Haavard Skinnemoen Acked-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/atmel_usba_udc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c index 5644897c1c5..05c913cc365 100644 --- a/drivers/usb/gadget/atmel_usba_udc.c +++ b/drivers/usb/gadget/atmel_usba_udc.c @@ -1941,7 +1941,7 @@ static int __init usba_udc_probe(struct platform_device *pdev) usba_writel(udc, CTRL, USBA_DISABLE_MASK); clk_disable(pclk); - usba_ep = kmalloc(sizeof(struct usba_ep) * pdata->num_ep, + usba_ep = kzalloc(sizeof(struct usba_ep) * pdata->num_ep, GFP_KERNEL); if (!usba_ep) goto err_alloc_ep; From cab98a0a349829b145d924c0649a2d30cd6a9e3d Mon Sep 17 00:00:00 2001 From: Xiao Kaijian Date: Fri, 8 May 2009 00:48:23 +0800 Subject: [PATCH 685/900] USB: Yet another Conexant Clone to add to cdc-acm.c This patch adds another quirky Conexant USB Modem Clone to usb cdc-acm.c Signed-off-by: Xiao Kaijian Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 0a69c0977e3..7a1164dd1d3 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1375,6 +1375,9 @@ static struct usb_device_id acm_ids[] = { { USB_DEVICE(0x0572, 0x1324), /* Conexant USB MODEM RD02-D400 */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, + { USB_DEVICE(0x0572, 0x1328), /* Shiro / Aztech USB MODEM UM-3100 */ + .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ + }, { USB_DEVICE(0x22b8, 0x6425), /* Motorola MOTOMAGX phones */ }, { USB_DEVICE(0x0572, 0x1329), /* Hummingbird huc56s (Conexant) */ From 0afb20e00b5053170c85298fed842b32d20b4ea9 Mon Sep 17 00:00:00 2001 From: Warren Free Date: Fri, 8 May 2009 10:27:08 +0200 Subject: [PATCH 686/900] USB: isp1760: urb_dequeue doesn't always find the urbs The option driver (and presumably 
others) allocates several URBs when it opens and tries to free them when it closes. The isp1760_urb_dequeue function gets called, but the packet being dequeued is not necessarily at the front of one of the 32 queues. If not, the isp1760_urb_done function doesn't get called for the URB and the process trying to free it hangs forever on a wait_queue. This patch does two things. If the URB being dequeued has others queued behind it, it re-queues them. And it searches the queues looking for the URB being dequeued rather than just looking at the one at the front of the queue. [bigeasy@linutronix] whitespace fixes, reformating Cc: stable Signed-off-by: Warren Free Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/isp1760-hcd.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c index cd07ea3f0c6..15438469f21 100644 --- a/drivers/usb/host/isp1760-hcd.c +++ b/drivers/usb/host/isp1760-hcd.c @@ -1658,6 +1658,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, u32 reg_base, or_reg, skip_reg; unsigned long flags; struct ptd ptd; + packet_enqueue *pe; switch (usb_pipetype(urb->pipe)) { case PIPE_ISOCHRONOUS: @@ -1669,6 +1670,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, reg_base = INT_REGS_OFFSET; or_reg = HC_INT_IRQ_MASK_OR_REG; skip_reg = HC_INT_PTD_SKIPMAP_REG; + pe = enqueue_an_INT_packet; break; default: @@ -1676,6 +1678,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, reg_base = ATL_REGS_OFFSET; or_reg = HC_ATL_IRQ_MASK_OR_REG; skip_reg = HC_ATL_PTD_SKIPMAP_REG; + pe = enqueue_an_ATL_packet; break; } @@ -1687,6 +1690,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, u32 skip_map; u32 or_map; struct isp1760_qtd *qtd; + struct isp1760_qh *qh = ints->qh; skip_map = isp1760_readl(hcd->regs + skip_reg); skip_map |= 1 << i; @@ 
-1699,8 +1703,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, priv_write_copy(priv, (u32 *)&ptd, hcd->regs + reg_base + i * sizeof(ptd), sizeof(ptd)); qtd = ints->qtd; - - clean_up_qtdlist(qtd); + qtd = clean_up_qtdlist(qtd); free_mem(priv, ints->payload); @@ -1711,7 +1714,24 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, ints->payload = 0; isp1760_urb_done(priv, urb, status); + if (qtd) + pe(hcd, qh, qtd); break; + + } else if (ints->qtd) { + struct isp1760_qtd *qtd, *prev_qtd = ints->qtd; + + for (qtd = ints->qtd->hw_next; qtd; qtd = qtd->hw_next) { + if (qtd->urb == urb) { + prev_qtd->hw_next = clean_up_qtdlist(qtd); + isp1760_urb_done(priv, urb, status); + break; + } + prev_qtd = qtd; + } + /* we found the urb before the end of the list */ + if (qtd) + break; } ints++; } From 0a3c8549ea7e94d74a41096d42bc6cdf43d183bf Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 27 May 2009 11:25:52 -0400 Subject: [PATCH 687/900] usb-serial: fix crash when sub-driver updates firmware This patch (as1244) fixes a crash in usb-serial that occurs when a sub-driver returns a positive value from its attach method, indicating that new firmware was loaded and the device will disconnect and reconnect. The usb-serial core then skips the step of registering the port devices; when the disconnect occurs, the attempt to unregister the ports fails dramatically. This problem shows up with Keyspan devices and it might affect others as well. When the attach method returns a positive value, the patch sets num_ports to 0. This tells usb_serial_disconnect() not to try unregistering any of the ports; instead they are cleaned up by destroy_serial(). 
Signed-off-by: Alan Stern Tested-by: Benjamin Herrenschmidt Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/usb-serial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 0a566eea49c..f331e2bde88 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -974,6 +974,7 @@ int usb_serial_probe(struct usb_interface *interface, if (retval > 0) { /* quietly accept this device, but don't bind to a serial port as it's about to disappear */ + serial->num_ports = 0; goto exit; } } From 29868b281f6d057b4cbe348f4483f1717c021c5c Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 20 May 2009 08:49:48 -0400 Subject: [PATCH 688/900] Revert "USB: Correct Makefile to make isp1760 buildable" This reverts commit 26e1287594864169577327fef233befc9739be3b. A larger patch (f7e7aa585) a few days after this one added the same line to the Makefile, but in a different place. While it'd be more correct to revert that one, it's easier to revert this one because this is a one-liner. Signed-off-by: Mike Frysinger CC: Greg Kroah-Hartman CC: linux-usb@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index 0716cdb44cd..0a3dc5ece63 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -11,7 +11,6 @@ obj-$(CONFIG_USB_MON) += mon/ obj-$(CONFIG_PCI) += host/ obj-$(CONFIG_USB_EHCI_HCD) += host/ obj-$(CONFIG_USB_ISP116X_HCD) += host/ -obj-$(CONFIG_USB_ISP1760_HCD) += host/ obj-$(CONFIG_USB_OHCI_HCD) += host/ obj-$(CONFIG_USB_UHCI_HCD) += host/ obj-$(CONFIG_USB_FHCI_HCD) += host/ From 086a377edc969aea6c761176a7e4ff68f264d6fe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 7 May 2009 12:36:53 -0700 Subject: [PATCH 689/900] sysfs: file.c: use create_singlethread_workqueue() We don't need a kernel thread per CPU for this application. 
Acked-by: Alex Chiang Cc: Lai Jiangshan Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b1606e07b7a..561a9c050ce 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -723,7 +723,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_unlock(&sysfs_workq_mutex); if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_workqueue("sysfsd"); + sysfs_workqueue = create_singlethread_workqueue("sysfsd"); if (sysfs_workqueue == NULL) { module_put(owner); return -ENOMEM; From 5c8563d773c0e9f0ac2a552e84806decd98ce732 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 28 May 2009 14:24:07 -0700 Subject: [PATCH 690/900] Driver Core: do not oops when driver_unregister() is called for unregistered drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We also fix a problem with cleaning up properly when initializing drivers and devices, so checks like this will work successfully. Portions of the patch by Linus and Greg and Ingo. 
Reported-by: Ozan Çağlayan Signed-off-by: Kay Sievers Cc: Linus Torvalds Cc: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- drivers/base/bus.c | 4 +++- drivers/base/core.c | 5 ++++- drivers/base/driver.c | 4 ++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index dc030f1f00f..c6599618523 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -700,8 +700,10 @@ int bus_add_driver(struct device_driver *drv) } kobject_uevent(&priv->kobj, KOBJ_ADD); - return error; + return 0; out_unregister: + kfree(drv->p); + drv->p = NULL; kobject_put(&priv->kobj); out_put_bus: bus_put(bus); diff --git a/drivers/base/core.c b/drivers/base/core.c index 4aa527b8a91..1977d4beb89 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -879,7 +879,7 @@ int device_add(struct device *dev) } if (!dev_name(dev)) - goto done; + goto name_error; pr_debug("device: '%s': %s\n", dev_name(dev), __func__); @@ -978,6 +978,9 @@ done: cleanup_device_parent(dev); if (parent) put_device(parent); +name_error: + kfree(dev->p); + dev->p = NULL; goto done; } diff --git a/drivers/base/driver.c b/drivers/base/driver.c index c51f11bb29a..8ae0f63602e 100644 --- a/drivers/base/driver.c +++ b/drivers/base/driver.c @@ -257,6 +257,10 @@ EXPORT_SYMBOL_GPL(driver_register); */ void driver_unregister(struct device_driver *drv) { + if (!drv || !drv->p) { + WARN(1, "Unexpected driver unregister!\n"); + return; + } driver_remove_groups(drv, drv->groups); bus_remove_driver(drv); } From 2f102607ac77354b02a76cf2748598ce9f270f08 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 27 May 2009 23:59:58 -0400 Subject: [PATCH 691/900] i7300_idle: allow testing on i5000-series hardware w/o re-compile Testing the i7300_idle driver on i5000-series hardware required an edit to i7300_idle.h to "#define SUPPORT_I5000 1" and a re-build of both i7300_idle and ioat_dma.
Replace that build-time scheme with a load-time module parameter: "i7300_idle.forceload=1" to make it easier to test the driver on hardware that, while not officially validated, works fine and is much more commonly available. By default (no modparam) the driver will continue to load only on the i7300. Note that ioat_dma runs a copy of i7300_idle's probe routine to know to reserve an IOAT channel for i7300_idle. This change makes ioat_dma do that always on the i5000, just like it does on the i7300. Signed-off-by: Len Brown Acked-by: Andrew Henroid --- drivers/dma/ioat_dma.c | 2 +- drivers/idle/i7300_idle.c | 6 +++++- include/linux/i7300_idle.h | 20 ++++++++++---------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 1955ee8d6d2..a600fc0f796 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -173,7 +173,7 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device) xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); #ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL - if (i7300_idle_platform_probe(NULL, NULL) == 0) { + if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) { device->common.chancnt--; } #endif diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index bf740394d70..949c97ff57e 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -41,6 +41,10 @@ static int debug; module_param_named(debug, debug, uint, 0644); MODULE_PARM_DESC(debug, "Enable debug printks in this driver"); +static int forceload; +module_param_named(forceload, forceload, uint, 0644); +MODULE_PARM_DESC(debug, "Enable driver testing on unvalidated i5000"); + #define dprintk(fmt, arg...) 
\ do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0) @@ -552,7 +556,7 @@ static int __init i7300_idle_init(void) cpus_clear(idle_cpumask); total_us = 0; - if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev)) + if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload)) return -ENODEV; if (i7300_idle_thrt_save()) diff --git a/include/linux/i7300_idle.h b/include/linux/i7300_idle.h index 05a80c44513..1587b7dec50 100644 --- a/include/linux/i7300_idle.h +++ b/include/linux/i7300_idle.h @@ -16,35 +16,33 @@ struct fbd_ioat { unsigned int vendor; unsigned int ioat_dev; + unsigned int enabled; }; /* * The i5000 chip-set has the same hooks as the i7300 - * but support is disabled by default because this driver - * has not been validated on that platform. + * but it is not enabled by default and must be manually + * manually enabled with "forceload=1" because it is + * only lightly validated. */ -#define SUPPORT_I5000 0 static const struct fbd_ioat fbd_ioat_list[] = { - {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB}, -#if SUPPORT_I5000 - {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT}, -#endif + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB, 1}, + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT, 0}, {0, 0} }; /* table of devices that work with this driver */ static const struct pci_device_id pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) }, -#if SUPPORT_I5000 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) }, -#endif { } /* Terminating entry */ }; /* Check for known platforms with I/O-AT */ static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev, - struct pci_dev **ioat_dev) + struct pci_dev **ioat_dev, + int enable_all) { int i; struct pci_dev *memdev, *dmadev; @@ -69,6 +67,8 @@ static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev, for (i = 0; fbd_ioat_list[i].vendor != 0; i++) { if (dmadev->vendor == fbd_ioat_list[i].vendor && dmadev->device == 
fbd_ioat_list[i].ioat_dev) { + if (!(fbd_ioat_list[i].enabled || enable_all)) + continue; if (fbd_dev) *fbd_dev = memdev; if (ioat_dev) From 10b6d95612672f89deb39b5a60fb677c78ba4844 Mon Sep 17 00:00:00 2001 From: Divy Le Ray Date: Thu, 28 May 2009 11:23:02 +0000 Subject: [PATCH 692/900] cxgb3: fix dma mapping regression Commit 5e68b772e6efd189d6aca76f6872fb75d51ace60 cxgb3: map entire Rx page, feed map+offset to Rx ring. introduced a regression on platforms defining DECLARE_PCI_UNMAP_ADDR() and related macros as no-ops. Rx descriptors are fed with a page buffer bus address + page chunk offset. The page buffer bus address is set and retrieved through pci_unmap_addr_set(), pci_unmap_addr(). These functions are no-ops on x86 (if CONFIG_DMA_API_DEBUG is not set), so the HW ends up with a bogus bus address. This patch saves the page buffer bus address for all platforms. Signed-off-by: Divy Le Ray Signed-off-by: David S. Miller --- drivers/net/cxgb3/adapter.h | 4 ++-- drivers/net/cxgb3/sge.c | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h index 714df2b675e..c888e97c967 100644 --- a/drivers/net/cxgb3/adapter.h +++ b/drivers/net/cxgb3/adapter.h @@ -85,8 +85,8 @@ struct fl_pg_chunk { struct page *page; void *va; unsigned int offset; - u64 *p_cnt; - DECLARE_PCI_UNMAP_ADDR(mapping); + unsigned long *p_cnt; + dma_addr_t mapping; }; struct rx_desc; diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c index 26d3587f339..b3ee2bc1a00 100644 --- a/drivers/net/cxgb3/sge.c +++ b/drivers/net/cxgb3/sge.c @@ -355,7 +355,7 @@ static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q, (*d->pg_chunk.p_cnt)--; if (!*d->pg_chunk.p_cnt) pci_unmap_page(pdev, - pci_unmap_addr(&d->pg_chunk, mapping), + d->pg_chunk.mapping, q->alloc_size, PCI_DMA_FROMDEVICE); put_page(d->pg_chunk.page); @@ -454,7 +454,7 @@ static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q, 
q->pg_chunk.offset = 0; mapping = pci_map_page(adapter->pdev, q->pg_chunk.page, 0, q->alloc_size, PCI_DMA_FROMDEVICE); - pci_unmap_addr_set(&q->pg_chunk, mapping, mapping); + q->pg_chunk.mapping = mapping; } sd->pg_chunk = q->pg_chunk; @@ -511,8 +511,7 @@ static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp) nomem: q->alloc_failed++; break; } - mapping = pci_unmap_addr(&sd->pg_chunk, mapping) + - sd->pg_chunk.offset; + mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset; pci_unmap_addr_set(sd, dma_addr, mapping); add_one_rx_chunk(mapping, d, q->gen); @@ -881,7 +880,7 @@ recycle: (*sd->pg_chunk.p_cnt)--; if (!*sd->pg_chunk.p_cnt) pci_unmap_page(adap->pdev, - pci_unmap_addr(&sd->pg_chunk, mapping), + sd->pg_chunk.mapping, fl->alloc_size, PCI_DMA_FROMDEVICE); if (!skb) { @@ -2096,7 +2095,7 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs, (*sd->pg_chunk.p_cnt)--; if (!*sd->pg_chunk.p_cnt) pci_unmap_page(adap->pdev, - pci_unmap_addr(&sd->pg_chunk, mapping), + sd->pg_chunk.mapping, fl->alloc_size, PCI_DMA_FROMDEVICE); From c22c8149313ee85c912e7b77a7afd04be8b8cba8 Mon Sep 17 00:00:00 2001 From: Divy Le Ray Date: Thu, 28 May 2009 11:23:08 +0000 Subject: [PATCH 693/900] cxgb3: link fault fixes Do not call t3_link_fault() under spinlock, as it calls msleep(). Besides, only the access to pi->link_fault needs to be serialized. Also initialize local variables before checking the link status, link state fields might otherwise end up containing garbage. Signed-off-by: Divy Le Ray Signed-off-by: David S. 
Miller --- drivers/net/cxgb3/cxgb3_main.c | 8 +++++--- drivers/net/cxgb3/t3_hw.c | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c index 7ea48414c6c..17858b9a583 100644 --- a/drivers/net/cxgb3/cxgb3_main.c +++ b/drivers/net/cxgb3/cxgb3_main.c @@ -2496,14 +2496,16 @@ static void check_link_status(struct adapter *adapter) for_each_port(adapter, i) { struct net_device *dev = adapter->port[i]; struct port_info *p = netdev_priv(dev); + int link_fault; spin_lock_irq(&adapter->work_lock); - if (p->link_fault) { + link_fault = p->link_fault; + spin_unlock_irq(&adapter->work_lock); + + if (link_fault) { t3_link_fault(adapter, i); - spin_unlock_irq(&adapter->work_lock); continue; } - spin_unlock_irq(&adapter->work_lock); if (!(p->phy.caps & SUPPORTED_IRQ) && netif_running(dev)) { t3_xgm_intr_disable(adapter, i); diff --git a/drivers/net/cxgb3/t3_hw.c b/drivers/net/cxgb3/t3_hw.c index 4f68aeb2679..4950d5d789a 100644 --- a/drivers/net/cxgb3/t3_hw.c +++ b/drivers/net/cxgb3/t3_hw.c @@ -1274,6 +1274,11 @@ void t3_link_fault(struct adapter *adapter, int port_id) A_XGM_INT_STATUS + mac->offset); link_fault &= F_LINKFAULTCHANGE; + link_ok = lc->link_ok; + speed = lc->speed; + duplex = lc->duplex; + fc = lc->fc; + phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); if (link_fault) { From 4e0168fa4842e27795a75b205a510f25b62181d9 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 28 May 2009 02:05:53 +0000 Subject: [PATCH 694/900] mac8390: fix build with NET_POLL_CONTROLLER Fix the build for CONFIG_NET_POLL_CONTROLLER that I broke with 217cbfa856dc1cbc2890781626c4032d9e3ec59f ("mac8390: fix regression caused during net_device_ops conversion"). Signed-off-by: Finn Thain Signed-off-by: David S. 
Miller --- drivers/net/mac8390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mac8390.c b/drivers/net/mac8390.c index f26667d5eaa..22e74a0e036 100644 --- a/drivers/net/mac8390.c +++ b/drivers/net/mac8390.c @@ -489,7 +489,7 @@ static const struct net_device_ops mac8390_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ei_poll, + .ndo_poll_controller = __ei_poll, #endif }; From 6daad5c6c586bf07528ae5b39e801b204468f907 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 29 May 2009 10:15:08 +0100 Subject: [PATCH 695/900] [ARM] update mach-types Signed-off-by: Russell King --- arch/arm/tools/mach-types | 131 +++++++++++++++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 9 deletions(-) diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types index 945e0d237a1..fec64678a63 100644 --- a/arch/arm/tools/mach-types +++ b/arch/arm/tools/mach-types @@ -12,7 +12,7 @@ # # http://www.arm.linux.org.uk/developer/machines/?action=new # -# Last update: Mon Mar 23 20:09:01 2009 +# Last update: Fri May 29 10:14:20 2009 # # machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number # @@ -916,7 +916,7 @@ nxdb500 MACH_NXDB500 NXDB500 905 apf9328 MACH_APF9328 APF9328 906 omap_wipoq MACH_OMAP_WIPOQ OMAP_WIPOQ 907 omap_twip MACH_OMAP_TWIP OMAP_TWIP 908 -palmt650 MACH_PALMT650 PALMT650 909 +treo650 MACH_TREO650 TREO650 909 acumen MACH_ACUMEN ACUMEN 910 xp100 MACH_XP100 XP100 911 fs2410 MACH_FS2410 FS2410 912 @@ -1232,7 +1232,7 @@ ql202b MACH_QL202B QL202B 1226 vpac270 MACH_VPAC270 VPAC270 1227 rd129 MACH_RD129 RD129 1228 htcwizard MACH_HTCWIZARD HTCWIZARD 1229 -xscale_treo680 MACH_XSCALE_TREO680 XSCALE_TREO680 1230 +treo680 MACH_TREO680 TREO680 1230 tecon_tmezon MACH_TECON_TMEZON TECON_TMEZON 1231 zylonite MACH_ZYLONITE ZYLONITE 1233 gene1270 MACH_GENE1270 GENE1270 1234 @@ -1418,10 +1418,10 @@ looxc550 MACH_LOOXC550 LOOXC550 1417 cnty_titan 
MACH_CNTY_TITAN CNTY_TITAN 1418 app3xx MACH_APP3XX APP3XX 1419 sideoatsgrama MACH_SIDEOATSGRAMA SIDEOATSGRAMA 1420 -palmtreo700p MACH_PALMTREO700P PALMTREO700P 1421 -palmtreo700w MACH_PALMTREO700W PALMTREO700W 1422 -palmtreo750 MACH_PALMTREO750 PALMTREO750 1423 -palmtreo755p MACH_PALMTREO755P PALMTREO755P 1424 +treo700p MACH_TREO700P TREO700P 1421 +treo700w MACH_TREO700W TREO700W 1422 +treo750 MACH_TREO750 TREO750 1423 +treo755p MACH_TREO755P TREO755P 1424 ezreganut9200 MACH_EZREGANUT9200 EZREGANUT9200 1425 sarge MACH_SARGE SARGE 1426 a696 MACH_A696 A696 1427 @@ -1721,7 +1721,7 @@ sapphire MACH_SAPPHIRE SAPPHIRE 1729 csb637xo MACH_CSB637XO CSB637XO 1730 evisiong MACH_EVISIONG EVISIONG 1731 stmp37xx MACH_STMP37XX STMP37XX 1732 -stmp378x MACH_STMP38XX STMP38XX 1733 +stmp378x MACH_STMP378X STMP378X 1733 tnt MACH_TNT TNT 1734 tbxt MACH_TBXT TBXT 1735 playmate MACH_PLAYMATE PLAYMATE 1736 @@ -1817,7 +1817,7 @@ smdkc100 MACH_SMDKC100 SMDKC100 1826 tavorevb MACH_TAVOREVB TAVOREVB 1827 saar MACH_SAAR SAAR 1828 deister_eyecam MACH_DEISTER_EYECAM DEISTER_EYECAM 1829 -at91sam9m10ek MACH_AT91SAM9M10EK AT91SAM9M10EK 1830 +at91sam9m10g45ek MACH_AT91SAM9M10G45EK AT91SAM9M10G45EK 1830 linkstation_produo MACH_LINKSTATION_PRODUO LINKSTATION_PRODUO 1831 hit_b0 MACH_HIT_B0 HIT_B0 1832 adx_rmu MACH_ADX_RMU ADX_RMU 1833 @@ -2132,3 +2132,116 @@ apollo MACH_APOLLO APOLLO 2141 at91cap9stk MACH_AT91CAP9STK AT91CAP9STK 2142 spc300 MACH_SPC300 SPC300 2143 eko MACH_EKO EKO 2144 +ccw9m2443 MACH_CCW9M2443 CCW9M2443 2145 +ccw9m2443js MACH_CCW9M2443JS CCW9M2443JS 2146 +m2m_router_device MACH_M2M_ROUTER_DEVICE M2M_ROUTER_DEVICE 2147 +str9104nas MACH_STAR9104NAS STAR9104NAS 2148 +pca100 MACH_PCA100 PCA100 2149 +z3_dm365_mod_01 MACH_Z3_DM365_MOD_01 Z3_DM365_MOD_01 2150 +hipox MACH_HIPOX HIPOX 2151 +omap3_piteds MACH_OMAP3_PITEDS OMAP3_PITEDS 2152 +bm150r MACH_BM150R BM150R 2153 +tbone MACH_TBONE TBONE 2154 +merlin MACH_MERLIN MERLIN 2155 +falcon MACH_FALCON FALCON 2156 +davinci_da850_evm 
MACH_DAVINCI_DA850_EVM DAVINCI_DA850_EVM 2157 +s5p6440 MACH_S5P6440 S5P6440 2158 +at91sam9g10ek MACH_AT91SAM9G10EK AT91SAM9G10EK 2159 +omap_4430sdp MACH_OMAP_4430SDP OMAP_4430SDP 2160 +lpc313x MACH_LPC313X LPC313X 2161 +magx_zn5 MACH_MAGX_ZN5 MAGX_ZN5 2162 +magx_em30 MACH_MAGX_EM30 MAGX_EM30 2163 +magx_ve66 MACH_MAGX_VE66 MAGX_VE66 2164 +meesc MACH_MEESC MEESC 2165 +otc570 MACH_OTC570 OTC570 2166 +bcu2412 MACH_BCU2412 BCU2412 2167 +beacon MACH_BEACON BEACON 2168 +actia_tgw MACH_ACTIA_TGW ACTIA_TGW 2169 +e4430 MACH_E4430 E4430 2170 +ql300 MACH_QL300 QL300 2171 +btmavb101 MACH_BTMAVB101 BTMAVB101 2172 +btmawb101 MACH_BTMAWB101 BTMAWB101 2173 +sq201 MACH_SQ201 SQ201 2174 +quatro45xx MACH_QUATRO45XX QUATRO45XX 2175 +openpad MACH_OPENPAD OPENPAD 2176 +tx25 MACH_TX25 TX25 2177 +omap3_torpedo MACH_OMAP3_TORPEDO OMAP3_TORPEDO 2178 +htcraphael_k MACH_HTCRAPHAEL_K HTCRAPHAEL_K 2179 +lal43 MACH_LAL43 LAL43 2181 +htcraphael_cdma500 MACH_HTCRAPHAEL_CDMA500 HTCRAPHAEL_CDMA500 2182 +anw6410 MACH_ANW6410 ANW6410 2183 +htcprophet MACH_HTCPROPHET HTCPROPHET 2185 +cfa_10022 MACH_CFA_10022 CFA_10022 2186 +imx27_visstrim_m10 MACH_IMX27_VISSTRIM_M10 IMX27_VISSTRIM_M10 2187 +px2imx27 MACH_PX2IMX27 PX2IMX27 2188 +stm3210e_eval MACH_STM3210E_EVAL STM3210E_EVAL 2189 +dvs10 MACH_DVS10 DVS10 2190 +portuxg20 MACH_PORTUXG20 PORTUXG20 2191 +arm_spv MACH_ARM_SPV ARM_SPV 2192 +smdkc110 MACH_SMDKC110 SMDKC110 2193 +cabespresso MACH_CABESPRESSO CABESPRESSO 2194 +hmc800 MACH_HMC800 HMC800 2195 +sholes MACH_SHOLES SHOLES 2196 +btmxc31 MACH_BTMXC31 BTMXC31 2197 +dt501 MACH_DT501 DT501 2198 +ktx MACH_KTX KTX 2199 +omap3517evm MACH_OMAP3517EVM OMAP3517EVM 2200 +netspace_v2 MACH_NETSPACE_V2 NETSPACE_V2 2201 +netspace_max_v2 MACH_NETSPACE_MAX_V2 NETSPACE_MAX_V2 2202 +d2net_v2 MACH_D2NET_V2 D2NET_V2 2203 +net2big_v2 MACH_NET2BIG_V2 NET2BIG_V2 2204 +net4big_v2 MACH_NET4BIG_V2 NET4BIG_V2 2205 +net5big_v2 MACH_NET5BIG_V2 NET5BIG_V2 2206 +endb2443 MACH_ENDB2443 ENDB2443 2207 +inetspace_v2 MACH_INETSPACE_V2 
INETSPACE_V2 2208 +tros MACH_TROS TROS 2209 +pelco_homer MACH_PELCO_HOMER PELCO_HOMER 2210 +ofsp8 MACH_OFSP8 OFSP8 2211 +at91sam9g45ekes MACH_AT91SAM9G45EKES AT91SAM9G45EKES 2212 +guf_cupid MACH_GUF_CUPID GUF_CUPID 2213 +eab1r MACH_EAB1R EAB1R 2214 +desirec MACH_DESIREC DESIREC 2215 +cordoba MACH_CORDOBA CORDOBA 2216 +irvine MACH_IRVINE IRVINE 2217 +sff772 MACH_SFF772 SFF772 2218 +pelco_milano MACH_PELCO_MILANO PELCO_MILANO 2219 +pc7302 MACH_PC7302 PC7302 2220 +bip6000 MACH_BIP6000 BIP6000 2221 +silvermoon MACH_SILVERMOON SILVERMOON 2222 +vc0830 MACH_VC0830 VC0830 2223 +dt430 MACH_DT430 DT430 2224 +ji42pf MACH_JI42PF JI42PF 2225 +gnet_ksm MACH_GNET_KSM GNET_KSM 2226 +gnet_sgm MACH_GNET_SGM GNET_SGM 2227 +gnet_sgr MACH_GNET_SGR GNET_SGR 2228 +omap3_icetekevm MACH_OMAP3_ICETEKEVM OMAP3_ICETEKEVM 2229 +pnp MACH_PNP PNP 2230 +ctera_2bay_k MACH_CTERA_2BAY_K CTERA_2BAY_K 2231 +ctera_2bay_u MACH_CTERA_2BAY_U CTERA_2BAY_U 2232 +sas_c MACH_SAS_C SAS_C 2233 +vma2315 MACH_VMA2315 VMA2315 2234 +vcs MACH_VCS VCS 2235 +spear600 MACH_SPEAR600 SPEAR600 2236 +spear300 MACH_SPEAR300 SPEAR300 2237 +spear1300 MACH_SPEAR1300 SPEAR1300 2238 +lilly1131 MACH_LILLY1131 LILLY1131 2239 +arvoo_ax301 MACH_ARVOO_AX301 ARVOO_AX301 2240 +mapphone MACH_MAPPHONE MAPPHONE 2241 +legend MACH_LEGEND LEGEND 2242 +salsa MACH_SALSA SALSA 2243 +lounge MACH_LOUNGE LOUNGE 2244 +vision MACH_VISION VISION 2245 +vmb20 MACH_VMB20 VMB20 2246 +hy2410 MACH_HY2410 HY2410 2247 +hy9315 MACH_HY9315 HY9315 2248 +bullwinkle MACH_BULLWINKLE BULLWINKLE 2249 +arm_ultimator2 MACH_ARM_ULTIMATOR2 ARM_ULTIMATOR2 2250 +vs_v210 MACH_VS_V210 VS_V210 2252 +vs_v212 MACH_VS_V212 VS_V212 2253 +hmt MACH_HMT HMT 2254 +suen3 MACH_SUEN3 SUEN3 2255 +vesper MACH_VESPER VESPER 2256 +str9 MACH_STR9 STR9 2257 +omap3_wl_ff MACH_OMAP3_WL_FF OMAP3_WL_FF 2258 +simcom MACH_SIMCOM SIMCOM 2259 +mcwebio MACH_MCWEBIO MCWEBIO 2260 From 8541c1180a355c4da283fc6b03a92c0233823c1b Mon Sep 17 00:00:00 2001 From: Vladimir Barinov Date: Thu, 23 Apr 2009 
15:47:22 +0400 Subject: [PATCH 696/900] mtd: MXC NAND driver fixes (v5) The following patch fixes: - re-initialization of host->col_addr which is used as byte index between the successive READID flash commands. - compile error when CONFIG_PM is enabled - pass on the error code from clk_get() - return -ENOMEM in case of failed ioremap() - pass on the return value of platform_driver_probe() directly - remove excessive printk - let command line partition table parsing with mxc_nand name. The cmd_line parsing is done via name that differs from mxc_nand by default and looks like "NAND 256MiB 1,8V 8-bit" Signed-off-by: Vladimir Barinov Signed-off-by: Lothar Wassmann Acked-by: Sascha Hauer Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/mxc_nand.c | 43 ++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index f3548d04801..40c26080ecd 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -831,6 +831,7 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, break; case NAND_CMD_READID: + host->col_addr = 0; send_read_id(host); break; @@ -867,6 +868,7 @@ static int __init mxcnd_probe(struct platform_device *pdev) mtd->priv = this; mtd->owner = THIS_MODULE; mtd->dev.parent = &pdev->dev; + mtd->name = "mxc_nand"; /* 50 us command delay time */ this->chip_delay = 5; @@ -882,8 +884,10 @@ static int __init mxcnd_probe(struct platform_device *pdev) this->verify_buf = mxc_nand_verify_buf; host->clk = clk_get(&pdev->dev, "nfc"); - if (IS_ERR(host->clk)) + if (IS_ERR(host->clk)) { + err = PTR_ERR(host->clk); goto eclk; + } clk_enable(host->clk); host->clk_act = 1; @@ -896,7 +900,7 @@ static int __init mxcnd_probe(struct platform_device *pdev) host->regs = ioremap(res->start, res->end - res->start + 1); if (!host->regs) { - err = -EIO; + err = -ENOMEM; goto eres; } @@ -1011,30 +1015,35 @@ static int 
__devexit mxcnd_remove(struct platform_device *pdev) #ifdef CONFIG_PM static int mxcnd_suspend(struct platform_device *pdev, pm_message_t state) { - struct mtd_info *info = platform_get_drvdata(pdev); + struct mtd_info *mtd = platform_get_drvdata(pdev); + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; int ret = 0; DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND suspend\n"); - if (info) - ret = info->suspend(info); - - /* Disable the NFC clock */ - clk_disable(nfc_clk); /* FIXME */ + if (mtd) { + ret = mtd->suspend(mtd); + /* Disable the NFC clock */ + clk_disable(host->clk); + } return ret; } static int mxcnd_resume(struct platform_device *pdev) { - struct mtd_info *info = platform_get_drvdata(pdev); + struct mtd_info *mtd = platform_get_drvdata(pdev); + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; int ret = 0; DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND resume\n"); - /* Enable the NFC clock */ - clk_enable(nfc_clk); /* FIXME */ - if (info) - info->resume(info); + if (mtd) { + /* Enable the NFC clock */ + clk_enable(host->clk); + mtd->resume(mtd); + } return ret; } @@ -1055,13 +1064,7 @@ static struct platform_driver mxcnd_driver = { static int __init mxc_nd_init(void) { - /* Register the device driver structure. 
*/ - pr_info("MXC MTD nand Driver\n"); - if (platform_driver_probe(&mxcnd_driver, mxcnd_probe) != 0) { - printk(KERN_ERR "Driver register failed for mxcnd_driver\n"); - return -ENODEV; - } - return 0; + return platform_driver_probe(&mxcnd_driver, mxcnd_probe); } static void __exit mxc_nd_cleanup(void) From 81e2962801bbb4e740c501ca687d5cb857929c04 Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Thu, 28 May 2009 17:43:59 +0200 Subject: [PATCH 697/900] jffs2: Fix corruption when flash erase/write failure Erase errors such as: "Newly-erased block contained word 0xa4ef223e at offset 0x0296a014" and failure to write the clean marker, moves the offending erase block to erasing list before calling jffs2_erase_failed(). This is bad as jffs2_erase_failed() will also move the block to the bad_list, but is now moving the wrong block, causing FS corruption. Signed-off-by: Joakim Tjernlund Signed-off-by: David Woodhouse --- fs/jffs2/erase.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index c32b4a1ad6c..a0244740b75 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -480,13 +480,6 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb return; filebad: - mutex_lock(&c->erase_free_sem); - spin_lock(&c->erase_completion_lock); - /* Stick it on a list (any list) so erase_failed can take it - right off again. Silly, but shouldn't happen often. */ - list_move(&jeb->list, &c->erasing_list); - spin_unlock(&c->erase_completion_lock); - mutex_unlock(&c->erase_free_sem); jffs2_erase_failed(c, jeb, bad_offset); return; From 21a4cc00e8e67edcfc1bdb9af6d370ed1226eb86 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Thu, 28 May 2009 11:39:02 +0200 Subject: [PATCH 698/900] at76c50x-usb: avoid mutex deadlock in at76_dwork_hw_scan http://bugzilla.kernel.org/show_bug.cgi?id=13312 at76_dwork_hw_scan holds a mutex while calling ieee80211_scan_completed, which then calls at76_config which needs the same mutex. 
This reworks the ordering to not hold the lock while calling ieee80211_scan_completed. Signed-off-by: John W. Linville --- drivers/net/wireless/at76c50x-usb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/at76c50x-usb.c b/drivers/net/wireless/at76c50x-usb.c index 744f4f4dd3d..8d93ca4651b 100644 --- a/drivers/net/wireless/at76c50x-usb.c +++ b/drivers/net/wireless/at76c50x-usb.c @@ -1873,18 +1873,18 @@ static void at76_dwork_hw_scan(struct work_struct *work) if (ret != CMD_STATUS_COMPLETE) { queue_delayed_work(priv->hw->workqueue, &priv->dwork_hw_scan, SCAN_POLL_INTERVAL); - goto exit; + mutex_unlock(&priv->mtx); + return; } - ieee80211_scan_completed(priv->hw, false); - if (is_valid_ether_addr(priv->bssid)) at76_join(priv); - ieee80211_wake_queues(priv->hw); - -exit: mutex_unlock(&priv->mtx); + + ieee80211_scan_completed(priv->hw, false); + + ieee80211_wake_queues(priv->hw); } static int at76_hw_scan(struct ieee80211_hw *hw, From aeeab4ff06b8e29cfe2fe730ba626f7e2487ba03 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 27 May 2009 09:21:57 +0200 Subject: [PATCH 699/900] rtl8187: add USB ID for Linksys WUSB54GC-EU v2 USB wifi dongle http://bugzilla.kernel.org/show_bug.cgi?id=13383 Reported-by: Przemyslaw Kulczycki Signed-off-by: John W. Linville --- drivers/net/wireless/Kconfig | 1 + drivers/net/wireless/rtl818x/rtl8187_dev.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig index 8a0823588c5..3d94e7dfea6 100644 --- a/drivers/net/wireless/Kconfig +++ b/drivers/net/wireless/Kconfig @@ -430,6 +430,7 @@ config RTL8187 ASUS P5B Deluxe Toshiba Satellite Pro series of laptops Asus Wireless Link + Linksys WUSB54GC-EU Thanks to Realtek for their support! 
diff --git a/drivers/net/wireless/rtl818x/rtl8187_dev.c b/drivers/net/wireless/rtl818x/rtl8187_dev.c index bac6cfba6ab..d51ba0a88c2 100644 --- a/drivers/net/wireless/rtl818x/rtl8187_dev.c +++ b/drivers/net/wireless/rtl818x/rtl8187_dev.c @@ -71,6 +71,8 @@ static struct usb_device_id rtl8187_table[] __devinitdata = { {USB_DEVICE(0x18E8, 0x6232), .driver_info = DEVICE_RTL8187}, /* AirLive */ {USB_DEVICE(0x1b75, 0x8187), .driver_info = DEVICE_RTL8187}, + /* Linksys */ + {USB_DEVICE(0x1737, 0x0073), .driver_info = DEVICE_RTL8187B}, {} }; From 15aedea439c4d7dbec17c99b5e1594c01b979833 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 27 May 2009 09:43:01 +0900 Subject: [PATCH 700/900] dma-debug: use sg_dma_address accessor instead of using dma_address directly Architectures might not have dma_address in struct scatterlist (PARISC doesn't). Directly accessing dma_address in struct scatterlist is wrong; we need to use the sg_dma_address() accessor instead. Signed-off-by: FUJITA Tomonori Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index e47e1a08c33..1b5bb82f106 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -840,7 +840,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->dev = dev; entry->paddr = sg_phys(s); entry->size = s->length; - entry->dev_addr = s->dma_address; + entry->dev_addr = sg_dma_address(s); entry->direction = direction; entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; @@ -872,7 +872,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, .type = dma_debug_sg, .dev = dev, .paddr = sg_phys(s), - .dev_addr = s->dma_address, + .dev_addr = sg_dma_address(s), .size = s->length, .direction = dir, .sg_call_ents = 0, @@ -996,8 +996,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nelems, i) { - check_sync(dev, 
s->dma_address, s->dma_length, 0, - direction, true); + check_sync(dev, sg_dma_address(s), s->dma_length, 0, + direction, true); } } EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); @@ -1012,8 +1012,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nelems, i) { - check_sync(dev, s->dma_address, s->dma_length, 0, - direction, false); + check_sync(dev, sg_dma_address(s), s->dma_length, 0, + direction, false); } } EXPORT_SYMBOL(debug_dma_sync_sg_for_device); From 884d05970bfbc3db368f23460dc4ce63257f240d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 27 May 2009 09:43:02 +0900 Subject: [PATCH 701/900] dma-debug: use sg_dma_len accessor debug_dma_map_sg() and debug_dma_unmap_sg() use length in struct scatterlist while debug_dma_sync_sg_for_cpu() and debug_dma_sync_sg_for_device() use dma_length. This causes bogus warnings on some IOMMU implementations since these values are not the same; the length doesn't represent the dma length. We always need to use sg_dma_len() accessor to get the dma length of a scatterlist entry. 
Signed-off-by: FUJITA Tomonori Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 1b5bb82f106..51f95e5b626 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -839,7 +839,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; entry->paddr = sg_phys(s); - entry->size = s->length; + entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; entry->sg_call_ents = nents; @@ -847,7 +847,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, if (!PageHighMem(sg_page(s))) { check_for_stack(dev, sg_virt(s)); - check_for_illegal_area(dev, sg_virt(s), s->length); + check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } add_dma_entry(entry); @@ -873,7 +873,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, .dev = dev, .paddr = sg_phys(s), .dev_addr = sg_dma_address(s), - .size = s->length, + .size = sg_dma_len(s), .direction = dir, .sg_call_ents = 0, }; @@ -996,7 +996,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nelems, i) { - check_sync(dev, sg_dma_address(s), s->dma_length, 0, + check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0, direction, true); } } @@ -1012,7 +1012,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nelems, i) { - check_sync(dev, sg_dma_address(s), s->dma_length, 0, + check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0, direction, false); } } From 88f3907f6f447899544beadf491dccb32015dacb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 27 May 2009 09:43:03 +0900 Subject: [PATCH 702/900] dma-debug: fix debug_dma_sync_sg_for_cpu and debug_dma_sync_sg_for_device DMA-mapping.txt says that debug_dma_sync_sg family must be called with the _same_ one you passed into the 
dma_map_sg call, it should _NOT_ be the 'count' value _returned_ from the dma_map_sg call. debug_dma_sync_sg_for_cpu and debug_dma_sync_sg_for_device can't handle this properly; they need to use the sg_mapped_ents in struct dma_debug_entry as debug_dma_unmap_sg() does. Signed-off-by: FUJITA Tomonori Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 51f95e5b626..1abed176d35 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -855,13 +855,32 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL(debug_dma_map_sg); +static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + int mapped_ents = 0; + struct dma_debug_entry ref; + + ref.dev = dev; + ref.dev_addr = sg_dma_address(s); + ref.size = sg_dma_len(s), + + bucket = get_hash_bucket(&ref, &flags); + entry = hash_bucket_find(bucket, &ref); + if (entry) + mapped_ents = entry->sg_mapped_ents; + put_hash_bucket(bucket, &flags); + + return mapped_ents; +} + void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { - struct dma_debug_entry *entry; struct scatterlist *s; int mapped_ents = 0, i; - unsigned long flags; if (unlikely(global_disable)) return; @@ -881,14 +900,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, if (mapped_ents && i >= mapped_ents) break; - if (mapped_ents == 0) { - struct hash_bucket *bucket; + if (!i) { ref.sg_call_ents = nelems; - bucket = get_hash_bucket(&ref, &flags); - entry = hash_bucket_find(bucket, &ref); - if (entry) - mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); + mapped_ents = get_nr_mapped_entries(dev, s); } check_unmap(&ref); @@ -990,12 +1004,18 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct 
scatterlist *sg, int nelems, int direction) { struct scatterlist *s; - int i; + int mapped_ents = 0, i; if (unlikely(global_disable)) return; for_each_sg(sg, s, nelems, i) { + if (!i) + mapped_ents = get_nr_mapped_entries(dev, s); + + if (i >= mapped_ents) + break; + check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0, direction, true); } @@ -1006,12 +1026,18 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction) { struct scatterlist *s; - int i; + int mapped_ents = 0, i; if (unlikely(global_disable)) return; for_each_sg(sg, s, nelems, i) { + if (!i) + mapped_ents = get_nr_mapped_entries(dev, s); + + if (i >= mapped_ents) + break; + check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0, direction, false); } From 294ae4011530d008c59c4fb9847738e39228821e Mon Sep 17 00:00:00 2001 From: GeunSik Lim Date: Thu, 28 May 2009 10:36:11 +0900 Subject: [PATCH 703/900] ftrace: fix typo about map of kernel priority in ftrace.txt file. Fix typo about chart to map the kernel priority to user land priorities. * About sched_setscheduler(2) Processes scheduled under SCHED_FIFO or SCHED_RR can have a (user-space) static priority in the range 1 to 99. (reference: http://www.kernel.org/doc/man-pages/online/pages/ man2/sched_setscheduler.2.html) * From: Steven Rostedt 0 to 98 - maps to RT tasks 99 to 1 (SCHED_RR or SCHED_FIFO) 99 - maps to internal kernel threads that want to be lower than RT tasks but higher than SCHED_OTHER tasks. Although I'm not sure if any kernel thread actually uses this. I'm not even sure how this can be set, because the internal sched_setscheduler function does not allow for it. 100 to 139 - maps nice levels -20 to 19. These are not set via sched_setscheduler, but are set via the nice system call. 140 - reserved for idle tasks. 
Signed-off-by: GeunSik Lim Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace.txt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e69381..e362f50c496 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice values starting at 100 (nice -20). Below is a quick chart to map the kernel priority to user land priorities. - Kernel priority: 0 to 99 ==> user RT priority 99 to 0 - Kernel priority: 100 to 139 ==> user nice -20 to 19 - Kernel priority: 140 ==> idle task priority + Kernel Space User Space + =============================================================== + 0(high) to 98(low) user RT priority 99(high) to 1(low) + with SCHED_RR or SCHED_FIFO + --------------------------------------------------------------- + 99 sched_priority is not used in scheduling + decisions(it must be specified as 0) + --------------------------------------------------------------- + 100(high) to 139(low) user nice -20(high) to 19(low) + --------------------------------------------------------------- + 140 idle task priority + --------------------------------------------------------------- The task states are: From f04d82b7e0c63d0251f9952a537a4bc4d73aa1a9 Mon Sep 17 00:00:00 2001 From: GeunSik Lim Date: Thu, 28 May 2009 10:36:14 +0900 Subject: [PATCH 704/900] sched: fix typo in sched-rt-group.txt file Fix typo about static priority's range. 
Kernel Space User Space =============================================================== 0(high) to 98(low) user RT priority 99(high) to 1(low) with SCHED_RR or SCHED_FIFO --------------------------------------------------------------- 99 sched_priority is not used in scheduling decisions(it must be specified as 0) --------------------------------------------------------------- 100(high) to 139(low) user nice -20(high) to 19(low) --------------------------------------------------------------- 140 idle task priority --------------------------------------------------------------- * ref) http://www.kernel.org/doc/man-pages/online/pages/man2/sched_setscheduler.2.html Signed-off-by: GeunSik Lim CC: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-rt-group.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index eb74b014a3f..1df7f9cdab0 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -187,7 +187,7 @@ get their allocated time. Implementing SCHED_EDF might take a while to complete. Priority Inheritance is the biggest challenge as the current linux PI infrastructure is geared towards -the limited static priority levels 0-139. With deadline scheduling you need to +the limited static priority levels 0-99. With deadline scheduling you need to do deadline inheritance (since priority is inversely proportional to the deadline delta (deadline - now). From 681a1b4032d72f4ad6d4beed751bc65574572746 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 28 May 2009 14:34:18 -0700 Subject: [PATCH 705/900] MAINTAINERS: pair EDAC-E752X P: and M: entries Entries should be P: name then M: email address. 
Signed-off-by: Joe Perches Acked-by: Doug Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5ee166e27b9..ea4eced29a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1971,8 +1971,8 @@ F: include/linux/edac.h EDAC-E752X P: Mark Gross -P: Doug Thompson M: mark.gross@intel.com +P: Doug Thompson M: dougthompson@xmission.com L: bluesmoke-devel@lists.sourceforge.net (moderated for non-subscribers) W: bluesmoke.sourceforge.net From 6d2661ede5f20f968422e790af3334908c3bc857 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 28 May 2009 14:34:19 -0700 Subject: [PATCH 706/900] oom: fix possible oom_dump_tasks NULL pointer When /proc/sys/vm/oom_dump_tasks is enabled, it is possible to get a NULL pointer for tasks that have detached mm's since task_lock() is not held during the tasklist scan. Add the task_lock(). Acked-by: Nick Piggin Acked-by: Mel Gorman Cc: Rik van Riel Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 92bcf1db16b..a7b2460e922 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -284,22 +284,28 @@ static void dump_tasks(const struct mem_cgroup *mem) printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); do_each_thread(g, p) { - /* - * total_vm and rss sizes do not exist for tasks with a - * detached mm so there's no need to report them. - */ - if (!p->mm) - continue; + struct mm_struct *mm; + if (mem && !task_in_mem_cgroup(p, mem)) continue; if (!thread_group_leader(p)) continue; task_lock(p); + mm = p->mm; + if (!mm) { + /* + * total_vm and rss sizes do not exist for tasks with no + * mm so there's no need to report them; they can't be + * oom killed anyway. 
+ */ + task_unlock(p); + continue; + } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, - p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), - p->oomkilladj, p->comm); + p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } From b2e1feaf0af6b8a826b86748a19ddc2013ab7dbd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 28 May 2009 14:34:20 -0700 Subject: [PATCH 707/900] cred: #include init.h in cred.h linux/cred.h can't be included as first header (alphabetical order) because it uses __init which is enough to break compilation on some archs. Signed-off-by: Alexey Dobriyan Acked-by: James Morris Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/cred.h b/include/linux/cred.h index 3282ee4318e..4fa99969631 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -13,6 +13,7 @@ #define _LINUX_CRED_H #include +#include #include #include From bd6daba909d8484bd2ccf6017db4028d7a420927 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 28 May 2009 14:34:21 -0700 Subject: [PATCH 708/900] procfs: make errno values consistent when open pident vs exit(2) race occurs proc_pident_instantiate() has the following call flow. proc_pident_lookup() proc_pident_instantiate() proc_pid_make_inode() And, proc_pident_lookup() has the following error handling. const struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); if (!task) goto out_no_task; Then, proc_pident_instantiate() should return ENOENT too when a race against exit(2) occurs. EINVAL is a bad choice for two reasons. - it implies the caller is wrong, but the race isn't the caller's mistake. - man 2 open doesn't explain EINVAL, so users often don't handle it. Note: Other proc_pid_make_inode() callers already use ENOENT properly. Acked-by: Eric W.
Biederman Cc: Alexey Dobriyan Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index fb45615943c..3326bbf9ab9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1956,7 +1956,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-EINVAL); + struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) From e767e0561d7fd2333df1921f1ab4176211f9036b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 28 May 2009 14:34:28 -0700 Subject: [PATCH 709/900] memcg: fix deadlock between lock_page_cgroup and mapping tree_lock mapping->tree_lock can be acquired from interrupt context. Then, the following deadlock can occur. Let "A" be a page. CPU0: lock_page_cgroup(A) interrupted -> take mapping->tree_lock. CPU1: take mapping->tree_lock -> lock_page_cgroup(A) This patch tries to fix the above deadlock by moving memcg's hook out of mapping->tree_lock. charge/uncharge of pagecache/swapcache is protected by page lock, not tree_lock. After this patch, lock_page_cgroup() is not called under mapping->tree_lock.
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/filemap.c | 6 +++--- mm/memcontrol.c | 4 +++- mm/swap_state.c | 4 +--- mm/truncate.c | 1 + mm/vmscan.c | 2 ++ 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 62d81435347..d476aad3ff5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,6 +437,11 @@ static inline int mem_cgroup_cache_charge_swapin(struct page *page, return 0; } +static inline void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +{ +} + #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 379ff0bcbf6..1b60f30cebf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); - mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } - - spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01c2d8f1468..4a747a27a22 100644 --- a/mm/memcontrol.c +++ 
b/mm/memcontrol.c @@ -1488,8 +1488,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +#ifdef CONFIG_SWAP /* - * called from __delete_from_swap_cache() and drop "page" account. + * called after __delete_from_swap_cache() and drop "page" account. * memcg information is recorded to swap_cgroup of "ent" */ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) @@ -1506,6 +1507,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) if (memcg) css_put(&memcg->css); } +#endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98ecb4..1416e7e9e02 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t ent = {.val = page_private(page)}; - VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -121,7 +119,6 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page, ent); } /** @@ -191,6 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); + mem_cgroup_uncharge_swapcache(page, entry); swap_free(entry); page_cache_release(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 55206fab7b9..12e1579f916 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -359,6 +359,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); page_cache_release(page); /* pagecache ref */ return 1; failed: diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fa3eda1f03..d254306562c 
100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,10 +470,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_swapcache(page, swap); swap_free(swap); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } return 1; From b898f4f869da5b9d41f297fff87aca4cd42d80b3 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 28 May 2009 14:34:29 -0700 Subject: [PATCH 710/900] drivers/serial/mpc52xx_uart.c: fix array overindexing check The check for an overindexing of mpc52xx_uart_{ports,nodes} has an off-by-one. Signed-off-by: Roel Kluin Acked-by: Wolfram Sang Acked-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Alan Cox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/mpc52xx_uart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c index 7f72f8ceaa6..b3feb6198d5 100644 --- a/drivers/serial/mpc52xx_uart.c +++ b/drivers/serial/mpc52xx_uart.c @@ -988,7 +988,7 @@ mpc52xx_console_setup(struct console *co, char *options) pr_debug("mpc52xx_console_setup co=%p, co->index=%i, options=%s\n", co, co->index, options); - if ((co->index < 0) || (co->index > MPC52xx_PSC_MAXNUM)) { + if ((co->index < 0) || (co->index >= MPC52xx_PSC_MAXNUM)) { pr_debug("PSC%x out of range\n", co->index); return -EINVAL; } From ba9447198bdd945666a9bac5e556632a7acb235d Mon Sep 17 00:00:00 2001 From: Thomas Dahlmann Date: Thu, 28 May 2009 14:34:30 -0700 Subject: [PATCH 711/900] MAINTAINERS: change email address for Thomas Dahlmann Signed-off-by: Thomas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea4eced29a5..41c6605feb0 100644 --- 
a/MAINTAINERS +++ b/MAINTAINERS @@ -434,7 +434,7 @@ F: arch/alpha/ AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER P: Thomas Dahlmann -M: thomas.dahlmann@amd.com +M: dahlmann.thomas@arcor.de L: linux-geode@lists.infradead.org (moderated for non-subscribers) S: Supported F: drivers/usb/gadget/amd5536udc.* From c3dc5bec05a2ae03a72ef82e321d77fb549d951c Mon Sep 17 00:00:00 2001 From: Oskar Schirmer Date: Thu, 28 May 2009 14:34:31 -0700 Subject: [PATCH 712/900] flat: fix data sections alignment The flat loader uses an architecture's flat_stack_align() to align the stack but assumes word-alignment is enough for the data sections. However, on the Xtensa S6000 we have registers up to 128bit width which can be used from userspace and therefore need userspace stack and data-section alignment of at least this size. This patch drops flat_stack_align() and uses the same alignment that is required for slab caches, ARCH_SLAB_MINALIGN, or wordsize if it's not defined by the architecture. It also fixes m32r which was obviously kaput, aligning an uninitialized stack entry instead of the stack pointer.
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oskar Schirmer Cc: David Howells Cc: Russell King Cc: Bryan Wu Cc: Geert Uytterhoeven Acked-by: Paul Mundt Cc: Greg Ungerer Signed-off-by: Johannes Weiner Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/flat.h | 3 --- arch/blackfin/include/asm/flat.h | 1 - arch/h8300/include/asm/flat.h | 1 - arch/m32r/include/asm/flat.h | 1 - arch/m68k/include/asm/flat.h | 1 - arch/sh/include/asm/flat.h | 1 - fs/binfmt_flat.c | 46 +++++++++++++++++++++----------- 7 files changed, 31 insertions(+), 23 deletions(-) diff --git a/arch/arm/include/asm/flat.h b/arch/arm/include/asm/flat.h index 1d77e51907f..59426a4595c 100644 --- a/arch/arm/include/asm/flat.h +++ b/arch/arm/include/asm/flat.h @@ -5,9 +5,6 @@ #ifndef __ARM_FLAT_H__ #define __ARM_FLAT_H__ -/* An odd number of words will be pushed after this alignment, so - deliberately misalign the value. */ -#define flat_stack_align(sp) sp = (void *)(((unsigned long)(sp) - 4) | 4) #define flat_argvp_envp_on_stack() 1 #define flat_old_ram_flag(flags) (flags) #define flat_reloc_valid(reloc, size) ((reloc) <= (size)) diff --git a/arch/blackfin/include/asm/flat.h b/arch/blackfin/include/asm/flat.h index e70074e05f4..733a178d782 100644 --- a/arch/blackfin/include/asm/flat.h +++ b/arch/blackfin/include/asm/flat.h @@ -10,7 +10,6 @@ #include -#define flat_stack_align(sp) /* nothing needed */ #define flat_argvp_envp_on_stack() 0 #define flat_old_ram_flag(flags) (flags) diff --git a/arch/h8300/include/asm/flat.h b/arch/h8300/include/asm/flat.h index 2a873508a9a..bd12b31b90e 100644 --- a/arch/h8300/include/asm/flat.h +++ b/arch/h8300/include/asm/flat.h @@ -5,7 +5,6 @@ #ifndef __H8300_FLAT_H__ #define __H8300_FLAT_H__ -#define flat_stack_align(sp) /* nothing needed */ #define flat_argvp_envp_on_stack() 1 #define flat_old_ram_flag(flags) 1 #define flat_reloc_valid(reloc, size) ((reloc) <= (size)) diff --git 
a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h index d851cf0c4aa..5d711c4688f 100644 --- a/arch/m32r/include/asm/flat.h +++ b/arch/m32r/include/asm/flat.h @@ -12,7 +12,6 @@ #ifndef __ASM_M32R_FLAT_H #define __ASM_M32R_FLAT_H -#define flat_stack_align(sp) (*sp += (*sp & 3 ? (4 - (*sp & 3)): 0)) #define flat_argvp_envp_on_stack() 0 #define flat_old_ram_flag(flags) (flags) #define flat_set_persistent(relval, p) 0 diff --git a/arch/m68k/include/asm/flat.h b/arch/m68k/include/asm/flat.h index 814b5174a8e..a0e29079397 100644 --- a/arch/m68k/include/asm/flat.h +++ b/arch/m68k/include/asm/flat.h @@ -5,7 +5,6 @@ #ifndef __M68KNOMMU_FLAT_H__ #define __M68KNOMMU_FLAT_H__ -#define flat_stack_align(sp) /* nothing needed */ #define flat_argvp_envp_on_stack() 1 #define flat_old_ram_flag(flags) (flags) #define flat_reloc_valid(reloc, size) ((reloc) <= (size)) diff --git a/arch/sh/include/asm/flat.h b/arch/sh/include/asm/flat.h index d3b2b4f109e..5d84df5e27f 100644 --- a/arch/sh/include/asm/flat.h +++ b/arch/sh/include/asm/flat.h @@ -12,7 +12,6 @@ #ifndef __ASM_SH_FLAT_H #define __ASM_SH_FLAT_H -#define flat_stack_align(sp) /* nothing needed */ #define flat_argvp_envp_on_stack() 0 #define flat_old_ram_flag(flags) (flags) #define flat_reloc_valid(reloc, size) ((reloc) <= (size)) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5cebf0b3779..697f6b5f131 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -41,6 +41,7 @@ #include #include #include +#include /****************************************************************************/ @@ -54,6 +55,18 @@ #define DBG_FLT(a...) #endif +/* + * User data (stack, data section and bss) needs to be aligned + * for the same reasons as SLAB memory is, and to the same amount. 
+ * Avoid duplicating architecture specific code by using the same + * macro as with SLAB allocation: + */ +#ifdef ARCH_SLAB_MINALIGN +#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) +#else +#define FLAT_DATA_ALIGN (sizeof(void *)) +#endif + #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ @@ -114,20 +127,18 @@ static unsigned long create_flat_tables( int envc = bprm->envc; char uninitialized_var(dummy); - sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p); + sp = (unsigned long *)p; + sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); + argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + envp = argv + (argc + 1); - sp -= envc+1; - envp = sp; - sp -= argc+1; - argv = sp; - - flat_stack_align(sp); if (flat_argvp_envp_on_stack()) { - --sp; put_user((unsigned long) envp, sp); - --sp; put_user((unsigned long) argv, sp); + put_user((unsigned long) envp, sp + 2); + put_user((unsigned long) argv, sp + 1); } - put_user(argc,--sp); + put_user(argc, sp); current->mm->arg_start = (unsigned long) p; while (argc-->0) { put_user((unsigned long) p, argv++); @@ -558,7 +569,9 @@ static int load_flat_file(struct linux_binprm * bprm, ret = realdatastart; goto err; } - datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", (int)(data_len + bss_len + stack_len), (int)datapos); @@ -604,9 +617,12 @@ static int load_flat_file(struct linux_binprm * bprm, } realdatastart = textpos + ntohl(hdr->data_start); - datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); - reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + - MAX_SHARED_LIBS * sizeof(unsigned long)); + datapos = ALIGN(realdatastart + + 
MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + reloc = (unsigned long *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = textpos; memp_size = len; #ifdef CONFIG_BINFMT_ZFLAT @@ -854,7 +870,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - + stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (res > (unsigned long)-4096) From b5d598b41aebee67bf95802b68b888e98a449687 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Thu, 28 May 2009 14:34:33 -0700 Subject: [PATCH 713/900] parport_gsc: fix printk format error drivers/parport/parport_gsc.c:356: warning: format '%lx' expects type 'long unsigned int', but argument 2 has type 'resource_size_t' [akpm@linux-foundation.org: fix it to handle u64's] Signed-off-by: Alexander Beregalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/parport/parport_gsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/parport/parport_gsc.c b/drivers/parport/parport_gsc.c index e6a7e847ee8..ea31a452b15 100644 --- a/drivers/parport/parport_gsc.c +++ b/drivers/parport/parport_gsc.c @@ -352,8 +352,8 @@ static int __devinit parport_init_chip(struct parisc_device *dev) unsigned long port; if (!dev->irq) { - printk(KERN_WARNING "IRQ not found for parallel device at 0x%lx\n", - dev->hpa.start); + printk(KERN_WARNING "IRQ not found for parallel device at 0x%llx\n", + (unsigned long long)dev->hpa.start); return -ENODEV; } From 8e8e8267f0a08c2415d5f51bc9a9fde6d5400619 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Thu, 28 May 2009 14:34:34 -0700 Subject: [PATCH 714/900] serial: 8250_gsc: fix printk format error drivers/serial/8250_gsc.c:44: warning: format '%lx' 
expects type 'long unsigned int', but argument 2 has type 'resource_size_t' [akpm@linux-foundation.org: fix it to handle u64's] Signed-off-by: Alexander Beregalov Cc: Alan Cox Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/8250_gsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/serial/8250_gsc.c b/drivers/serial/8250_gsc.c index 418b4fe9a0a..33149d982e8 100644 --- a/drivers/serial/8250_gsc.c +++ b/drivers/serial/8250_gsc.c @@ -39,9 +39,9 @@ static int __init serial_init_chip(struct parisc_device *dev) */ if (parisc_parent(dev)->id.hw_type != HPHW_IOA) printk(KERN_INFO - "Serial: device 0x%lx not configured.\n" + "Serial: device 0x%llx not configured.\n" "Enable support for Wax, Lasi, Asp or Dino.\n", - dev->hpa.start); + (unsigned long long)dev->hpa.start); return -ENODEV; } From 17663e59704bea838a9236f299104e30909a43b1 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 28 May 2009 14:34:35 -0700 Subject: [PATCH 715/900] S3C-fb: PM fix Correctly restore the FrameBuffer register state in the resume function. 
Reviewed-by: Kyungmin Park Signed-off-by: Marek Szyprowski Cc: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/s3c-fb.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/video/s3c-fb.c b/drivers/video/s3c-fb.c index 5e9c6302433..d3a568e6b16 100644 --- a/drivers/video/s3c-fb.c +++ b/drivers/video/s3c-fb.c @@ -947,7 +947,8 @@ static int __devexit s3c_fb_remove(struct platform_device *pdev) int win; for (win = 0; win <= S3C_FB_MAX_WIN; win++) - s3c_fb_release_win(sfb, sfb->windows[win]); + if (sfb->windows[win]) + s3c_fb_release_win(sfb, sfb->windows[win]); iounmap(sfb->regs); @@ -985,11 +986,20 @@ static int s3c_fb_suspend(struct platform_device *pdev, pm_message_t state) static int s3c_fb_resume(struct platform_device *pdev) { struct s3c_fb *sfb = platform_get_drvdata(pdev); + struct s3c_fb_platdata *pd = sfb->pdata; struct s3c_fb_win *win; int win_no; clk_enable(sfb->bus_clk); + /* setup registers */ + writel(pd->vidcon1, sfb->regs + VIDCON1); + + /* zero all windows before we do anything */ + for (win_no = 0; win_no < S3C_FB_MAX_WIN; win_no++) + s3c_fb_clear_win(sfb, win_no); + + /* restore framebuffers */ for (win_no = 0; win_no < S3C_FB_MAX_WIN; win_no++) { win = sfb->windows[win_no]; if (!win) From 53b7479bbdaedcc7846c66fd608fe66f1b5aa35b Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 28 May 2009 14:34:36 -0700 Subject: [PATCH 716/900] atmel_lcdfb: correct fifo size for some products Remove wrong fifo size definition for some AT91 products. Due to a misunderstanding of some AT91 datasheets, a fifo size of 2048 (words) has been introduced by mistake. In fact, all products (AT91/AT32) are sharing the same fifo size of 512 words. 
Signed-off-by: Nicolas Ferre Cc: Andrew Victor Acked-by: Haavard Skinnemoen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/atmel_lcdfb.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/video/atmel_lcdfb.c b/drivers/video/atmel_lcdfb.c index 9a577a800db..2fb63f6ea2f 100644 --- a/drivers/video/atmel_lcdfb.c +++ b/drivers/video/atmel_lcdfb.c @@ -29,14 +29,8 @@ /* configurable parameters */ #define ATMEL_LCDC_CVAL_DEFAULT 0xc8 -#define ATMEL_LCDC_DMA_BURST_LEN 8 - -#if defined(CONFIG_ARCH_AT91SAM9263) || defined(CONFIG_ARCH_AT91CAP9) || \ - defined(CONFIG_ARCH_AT91SAM9RL) -#define ATMEL_LCDC_FIFO_SIZE 2048 -#else -#define ATMEL_LCDC_FIFO_SIZE 512 -#endif +#define ATMEL_LCDC_DMA_BURST_LEN 8 /* words */ +#define ATMEL_LCDC_FIFO_SIZE 512 /* words */ #if defined(CONFIG_ARCH_AT91) #define ATMEL_LCDFB_FBINFO_DEFAULT (FBINFO_DEFAULT \ From 32b154c0b0bae2879bf4e549d861caf1759a3546 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 May 2009 14:34:37 -0700 Subject: [PATCH 717/900] x86: ignore VM_LOCKED when determining if hugetlb-backed page tables can be shared or not Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13302 On x86 and x86-64, it is possible that page tables are shared between shared mappings backed by hugetlbfs. As part of this, page_table_shareable() checks a pair of vma->vm_flags and they must match if they are to be shared. All VMA flags are taken into account, including VM_LOCKED. The problem is that VM_LOCKED is cleared on fork(). When a process with a shared memory segment forks() to exec() a helper, there will be shared VMAs with different flags. The impact is that the shared segment is sometimes considered shareable and other times not, depending on what process is checking. What happens is that the segment page tables are being shared but the count is inaccurate depending on the ordering of events.
As the page tables are freed with put_page(), bad pmd's are found when some of the children exit. The hugepage counters also get corrupted and the Total and Free count will no longer match even when all the hugepage-backed regions are freed. This requires a reboot of the machine to "fix". This patch addresses the problem by comparing all flags except VM_LOCKED when deciding if pagetables should be shared or not for hugetlbfs-backed mapping. Signed-off-by: Mel Gorman Acked-by: Hugh Dickins Cc: Ingo Molnar Cc: Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Cc: Eric B Munson Cc: Adam Litke Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8f307d914c2..f46c340727b 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -26,12 +26,16 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long sbase = saddr & PUD_MASK; unsigned long s_end = sbase + PUD_SIZE; + /* Allow segments to share if only one is marked locked */ + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + /* * match the virtual addresses, permission and the alignment of the * page table page. */ if (pmd_index(addr) != pmd_index(saddr) || - vma->vm_flags != svma->vm_flags || + vm_flags != svm_flags || sbase < svma->vm_start || svma->vm_end < s_end) return 0; From f83a275dbc5ca1721143698e844243fcadfabf6a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 May 2009 14:34:40 -0700 Subject: [PATCH 718/900] mm: account for MAP_SHARED mappings using VM_MAYSHARE and not VM_SHARED in hugetlbfs Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13302 hugetlbfs reserves huge pages but does not fault them at mmap() time to ensure that future faults succeed. 
The reservation behaviour differs depending on whether the mapping was mapped MAP_SHARED or MAP_PRIVATE. For MAP_SHARED mappings, hugepages are reserved when mmap() is first called and are tracked based on information associated with the inode. Other processes mapping MAP_SHARED use the same reservation. MAP_PRIVATE track the reservations based on the VMA created as part of the mmap() operation. Each process mapping MAP_PRIVATE must make its own reservation. hugetlbfs currently checks if a VMA is MAP_SHARED with the VM_SHARED flag and not VM_MAYSHARE. For file-backed mappings, such as hugetlbfs, VM_SHARED is set only if the mapping is MAP_SHARED and the file was opened read-write. If a shared memory mapping was mapped shared-read-write for populating of data and mapped shared-read-only by other processes, then hugetlbfs would account for the mapping as if it was MAP_PRIVATE. This causes processes to fail to map the file MAP_SHARED even though it should succeed as the reservation is there. This patch alters mm/hugetlb.c and replaces VM_SHARED with VM_MAYSHARE when the intent of the code was to check whether the VMA was mapped MAP_SHARED or MAP_PRIVATE. 
Signed-off-by: Mel Gorman Cc: Hugh Dickins Cc: Ingo Molnar Cc: Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Cc: Eric B Munson Cc: Adam Litke Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28c655ba935..e83ad2c9228 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); return NULL; @@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } @@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, if (vma->vm_flags & VM_NORESERVE) return; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { /* Shared mappings always use reserves */ h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { @@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - 
if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; @@ -924,7 +924,7 @@ static long vma_needs_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -949,7 +949,7 @@ static void vma_commit_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); @@ -1893,7 +1893,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. 
*/ - if (!(vma->vm_flags & VM_SHARED) && + if (!(vma->vm_flags & VM_MAYSHARE) && is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -2000,7 +2000,7 @@ retry: clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { int err; struct inode *inode = mapping->host; @@ -2104,7 +2104,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mutex; } - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, vma, address); } @@ -2289,7 +2289,7 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) chg = region_chg(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); @@ -2330,7 +2330,7 @@ int hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; } From 46f7e602fb32e02145ef14f8c0ca6d399f0a96b9 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 28 May 2009 14:34:41 -0700 Subject: [PATCH 719/900] memcg: fix build warning and avoid checking for mem != null again and again Fix build warning, "mem_cgroup_is_obsolete defined but not used" when CONFIG_DEBUG_VM is not set. Also avoid checking for !mem again and again. 
Signed-off-by: Nikanth Karthikesan Acked-by: Pekka Enberg Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4a747a27a22..78eb8552818 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) -{ - if (!mem) - return true; - return css_is_removed(&mem->css); -} - - /* * Call callback function against all cgroup under hierarchy tree. */ @@ -932,7 +924,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(!mem)) return 0; - VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); + VM_BUG_ON(css_is_removed(&mem->css)); while (1) { int ret; From 56ec0c7b88c6eb17733e5015f31302f6312511ed Mon Sep 17 00:00:00 2001 From: Harry Ciao Date: Thu, 28 May 2009 14:34:42 -0700 Subject: [PATCH 720/900] edac: AMD8111 & AMD8131 use dev_name() The "bus_id" member in the device structure has been obsolete, use dev_name() instead. 
Signed-off-by: Harry Ciao Cc: Doug Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/edac/amd8111_edac.c | 4 ++-- drivers/edac/amd8131_edac.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c index 61469218112..2cb58ef743e 100644 --- a/drivers/edac/amd8111_edac.c +++ b/drivers/edac/amd8111_edac.c @@ -389,7 +389,7 @@ static int amd8111_dev_probe(struct pci_dev *dev, dev_info->edac_dev->dev = &dev_info->dev->dev; dev_info->edac_dev->mod_name = AMD8111_EDAC_MOD_STR; dev_info->edac_dev->ctl_name = dev_info->ctl_name; - dev_info->edac_dev->dev_name = dev_info->dev->dev.bus_id; + dev_info->edac_dev->dev_name = dev_name(&dev_info->dev->dev); if (edac_op_state == EDAC_OPSTATE_POLL) dev_info->edac_dev->edac_check = dev_info->check; @@ -473,7 +473,7 @@ static int amd8111_pci_probe(struct pci_dev *dev, pci_info->edac_dev->dev = &pci_info->dev->dev; pci_info->edac_dev->mod_name = AMD8111_EDAC_MOD_STR; pci_info->edac_dev->ctl_name = pci_info->ctl_name; - pci_info->edac_dev->dev_name = pci_info->dev->dev.bus_id; + pci_info->edac_dev->dev_name = dev_name(&pci_info->dev->dev); if (edac_op_state == EDAC_OPSTATE_POLL) pci_info->edac_dev->edac_check = pci_info->check; diff --git a/drivers/edac/amd8131_edac.c b/drivers/edac/amd8131_edac.c index c083b31cac5..b432d60c622 100644 --- a/drivers/edac/amd8131_edac.c +++ b/drivers/edac/amd8131_edac.c @@ -287,7 +287,7 @@ static int amd8131_probe(struct pci_dev *dev, const struct pci_device_id *id) dev_info->edac_dev->dev = &dev_info->dev->dev; dev_info->edac_dev->mod_name = AMD8131_EDAC_MOD_STR; dev_info->edac_dev->ctl_name = dev_info->ctl_name; - dev_info->edac_dev->dev_name = dev_info->dev->dev.bus_id; + dev_info->edac_dev->dev_name = dev_name(&dev_info->dev->dev); if (edac_op_state == EDAC_OPSTATE_POLL) dev_info->edac_dev->edac_check = amd8131_chipset.check; From 715fe7af9fd7328af661742bfadc195e665a837f Mon Sep 17 00:00:00 2001 From: 
Harry Ciao Date: Thu, 28 May 2009 14:34:43 -0700 Subject: [PATCH 721/900] edac: AMD8111 & AMD8131 Kconfig fixup The amd8111_edac.c driver will fail allmodconfig on architectures other than PPC, introduce Kconfig dependency to avoid this, since both AMD8111 and AMD8131 chips are only adopted on Maple so far. Signed-off-by: Harry Ciao Cc: Doug Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/edac/Kconfig | 8 ++++++-- drivers/edac/Makefile | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index e5f5c5a8ba6..956982f8739 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -192,16 +192,20 @@ config EDAC_PPC4XX config EDAC_AMD8131 tristate "AMD8131 HyperTransport PCI-X Tunnel" - depends on EDAC_MM_EDAC && PCI + depends on EDAC_MM_EDAC && PCI && PPC_MAPLE help Support for error detection and correction on the AMD8131 HyperTransport PCI-X Tunnel chip. + Note, add more Kconfig dependency if it's adopted + on some machine other than Maple. config EDAC_AMD8111 tristate "AMD8111 HyperTransport I/O Hub" - depends on EDAC_MM_EDAC && PCI + depends on EDAC_MM_EDAC && PCI && PPC_MAPLE help Support for error detection and correction on the AMD8111 HyperTransport I/O Hub chip. + Note, add more Kconfig dependency if it's adopted + on some machine other than Maple. 
endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index a5fdcf02f59..59076819135 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -35,3 +35,5 @@ obj-$(CONFIG_EDAC_MPC85XX) += mpc85xx_edac.o obj-$(CONFIG_EDAC_MV64X60) += mv64x60_edac.o obj-$(CONFIG_EDAC_CELL) += cell_edac.o obj-$(CONFIG_EDAC_PPC4XX) += ppc4xx_edac.o +obj-$(CONFIG_EDAC_AMD8111) += amd8111_edac.o +obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o From b8e7e40abeac49644fec4a4f52ffe74c7b05eca0 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 28 May 2009 14:01:35 +0100 Subject: [PATCH 722/900] 8250: Fix oops from setserial If you setserial a port which has never been initialised we change the type but don't update the I/O method pointers. The same problem is true if you change the io type of a port - but nobody ever does that so nobody noticed! Remember the old type and when attaching if the type has changed reload the port accessor pointers. We can't do it blindly as some 8250 drivers load custom accessors and we must not stomp those. 
Tested-by: Victor Seryodkin Closes-bug: #13367 Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/8250.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index b4b39811b44..a0127e93ade 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -137,6 +137,7 @@ struct uart_8250_port { unsigned char mcr; unsigned char mcr_mask; /* mask of user bits */ unsigned char mcr_force; /* mask of forced bits */ + unsigned char cur_iotype; /* Running I/O type */ /* * Some bits in registers are cleared on a read, so they must @@ -471,6 +472,7 @@ static void io_serial_out(struct uart_port *p, int offset, int value) static void set_io_from_upio(struct uart_port *p) { + struct uart_8250_port *up = (struct uart_8250_port *)p; switch (p->iotype) { case UPIO_HUB6: p->serial_in = hub6_serial_in; @@ -509,6 +511,8 @@ static void set_io_from_upio(struct uart_port *p) p->serial_out = io_serial_out; break; } + /* Remember loaded iotype */ + up->cur_iotype = p->iotype; } static void @@ -1937,6 +1941,9 @@ static int serial8250_startup(struct uart_port *port) up->capabilities = uart_config[up->port.type].flags; up->mcr = 0; + if (up->port.iotype != up->cur_iotype) + set_io_from_upio(port); + if (up->port.type == PORT_16C950) { /* Wake up and initialize UART */ up->acr = 0; @@ -2563,6 +2570,9 @@ static void serial8250_config_port(struct uart_port *port, int flags) if (ret < 0) probeflags &= ~PROBE_RSA; + if (up->port.iotype != up->cur_iotype) + set_io_from_upio(port); + if (flags & UART_CONFIG_TYPE) autoconfig(up, probeflags); if (up->port.type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) @@ -2671,6 +2681,11 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) { int i; + for (i = 0; i < nr_uarts; i++) { + struct uart_8250_port *up = &serial8250_ports[i]; + up->cur_iotype = 0xFF; + } + serial8250_isa_init_ports(); for (i = 0; i < nr_uarts; i++) { From 
6373fffc5d555caf3acf7c5796cec9820aaf7479 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 May 2009 16:12:02 -0700 Subject: [PATCH 723/900] sparc64: Fix section attribute warnings. CSUM copy to/from user assembler was missing allocatable and executable attributes for .fixup Signed-off-by: David S. Miller --- arch/sparc/lib/csum_copy_from_user.S | 2 +- arch/sparc/lib/csum_copy_to_user.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/lib/csum_copy_from_user.S b/arch/sparc/lib/csum_copy_from_user.S index a22eddbe5db..e0304e6a224 100644 --- a/arch/sparc/lib/csum_copy_from_user.S +++ b/arch/sparc/lib/csum_copy_from_user.S @@ -5,7 +5,7 @@ #define EX_LD(x) \ 98: x; \ - .section .fixup; \ + .section .fixup, "ax"; \ .align 4; \ 99: retl; \ mov -1, %o0; \ diff --git a/arch/sparc/lib/csum_copy_to_user.S b/arch/sparc/lib/csum_copy_to_user.S index d5b12f441f0..afd01acc587 100644 --- a/arch/sparc/lib/csum_copy_to_user.S +++ b/arch/sparc/lib/csum_copy_to_user.S @@ -5,7 +5,7 @@ #define EX_ST(x) \ 98: x; \ - .section .fixup; \ + .section .fixup,"ax"; \ .align 4; \ 99: retl; \ mov -1, %o0; \ From 34d531e640cb805973cf656b15c716b961565cea Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 26 May 2009 15:11:06 -0400 Subject: [PATCH 724/900] ACPI: sanity check _PSS frequency to prevent cpufreq crash When BIOS SETUP is changed to disable EIST, some BIOS hand the OS an un-initialized _PSS: Name (_PSS, Package (0x06) { Package (0x06) { 0x80000000, // frequency [MHz] 0x80000000, // power [mW] 0x80000000, // latency [us] 0x80000000, // BM latency [us] 0x80000000, // control 0x80000000 // status }, ... These are outrageous values for frequency, power and latency, raising the question where to draw the line between legal and illegal. We tend to survive garbage in the power and latency fields, but we can BUG_ON when garbage is in the frequency field. Cpufreq multiplies the frequency by 1000 and stores it in a u32 KHz. 
So disregard a _PSS with a frequency so large that it can't be represented by cpufreq. https://bugzilla.redhat.com/show_bug.cgi?id=500311 Signed-off-by: Len Brown --- drivers/acpi/processor_perflib.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index cafb41000f6..60e543d3234 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -309,9 +309,15 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) (u32) px->bus_master_latency, (u32) px->control, (u32) px->status)); - if (!px->core_frequency) { - printk(KERN_ERR PREFIX - "Invalid _PSS data: freq is zero\n"); + /* + * Check that ACPI's u64 MHz will be valid as u32 KHz in cpufreq + */ + if (!px->core_frequency || + ((u32)(px->core_frequency * 1000) != + (px->core_frequency * 1000))) { + printk(KERN_ERR FW_BUG PREFIX + "Invalid BIOS _PSS frequency: 0x%llx MHz\n", + px->core_frequency); result = -EFAULT; kfree(pr->performance->states); goto end; From 34ac272b3aaef11a91e19a72f3ac5772a96ffbc5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 26 May 2009 23:35:34 -0400 Subject: [PATCH 725/900] ACPI: video: DMI workaround broken eMachines E510 BIOS enabling display brightness http://bugzilla.kernel.org/show_bug.cgi?id=13376 Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- drivers/acpi/video.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 810cca90ca7..ee45e760422 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -570,6 +570,14 @@ static struct dmi_system_id video_dmi_table[] __initdata = { DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5710Z"), }, }, + { + .callback = video_set_bqc_offset, + .ident = "eMachines E510", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "EMACHINES"), + DMI_MATCH(DMI_PRODUCT_NAME, "eMachines E510"), + }, + }, {} }; From 93bcece20ef87c29548ec7e66532f1018572cea0 Mon Sep 17 
00:00:00 2001 From: Zhang Rui Date: Tue, 19 May 2009 15:08:41 -0400 Subject: [PATCH 726/900] ACPI: video: DMI workaround broken Acer 5315 BIOS enabling display brightness http://bugzilla.kernel.org/show_bug.cgi?id=13121 Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- drivers/acpi/video.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index ee45e760422..6d897bbd3bd 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -578,6 +578,14 @@ static struct dmi_system_id video_dmi_table[] __initdata = { DMI_MATCH(DMI_PRODUCT_NAME, "eMachines E510"), }, }, + { + .callback = video_set_bqc_offset, + .ident = "Acer Aspire 5315", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5315"), + }, + }, {} }; From 1fc8d33acafe68bdcc21b327d22ef3820b819727 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 20 May 2009 11:56:08 +0530 Subject: [PATCH 727/900] drm/i915: acpi/video.c fix section mismatch warning Currently acpi_video_exit() is exported as well as using __exit which causes: WARNING: drivers/acpi/video.o(__ksymtab+0x0): Section mismatch in reference from the variable __ksymtab_acpi_video_exit to the function .exit.text:acpi_video_exit() The symbol acpi_video_exit is exported and annotated __exit Fix this by removing the __exit annotation of acpi_video_exit or drop the export. 
Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Len Brown --- drivers/acpi/video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 810cca90ca7..a79b8853526 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -2334,7 +2334,7 @@ static int __init acpi_video_init(void) return acpi_video_register(); } -void __exit acpi_video_exit(void) +void acpi_video_exit(void) { acpi_bus_unregister_driver(&acpi_video_bus); From 21671b88be331fb9c95891d5ee7d2e940e6b024c Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Fri, 22 May 2009 10:23:40 +0200 Subject: [PATCH 728/900] ACPI processor: remove spurious newline from warning message Commit 4973b22a ("ACPI processor: reset the throttling state once it's invalid") introduced a new warning which prints a spurious newline. The ACPI_WARNING macro that is used already takes care of adding a newline, after adding ACPI_CA_VERSION to the message. Remove the newline to avoid the message getting split into two lines. 
Signed-off-by: Frans Pop Signed-off-by: Len Brown --- drivers/acpi/processor_throttling.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 7f16f5f8e7d..227543789ba 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -840,7 +840,7 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) state = acpi_get_throttling_state(pr, value); if (state == -1) { ACPI_WARNING((AE_INFO, - "Invalid throttling state, reset\n")); + "Invalid throttling state, reset")); state = 0; ret = acpi_processor_set_throttling(pr, state); if (ret) From 61c8c67e3ad67ea1d1360f2e88688bd942834756 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 26 May 2009 14:58:39 -0700 Subject: [PATCH 729/900] acpi-cpufreq: fix printk typo and indentation Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 208ecf6643d..54b6de2cd94 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -693,8 +693,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && policy->cpuinfo.transition_latency > 20 * 1000) { policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO "Capping off P-state tranision" - " latency at 20 uS\n"); + printk_once(KERN_INFO + "P-state transition latency capped at 20 uS\n"); } /* table init */ From 31db5645bda24682dadbc97d5e8a7918ade2a298 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 29 May 2009 21:11:27 -0400 Subject: [PATCH 730/900] ACPI, i915: build fix (v2) drivers/built-in.o: In function `intel_opregion_init': (.text+0x9d540): undefined reference 
to `acpi_video_register' v2: move under DRM_I915 from DRM_I915_KMS Signed-off-by: Len Brown Signed-off-by: Randy Dunlap --- drivers/gpu/drm/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 4cd35d8fd79..f5d46e7199d 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -67,12 +67,18 @@ config DRM_I830 will load the correct one. config DRM_I915 + tristate "i915 driver" select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT select FB select FRAMEBUFFER_CONSOLE if !EMBEDDED - tristate "i915 driver" + # i915 depends on ACPI_VIDEO when ACPI is enabled + # but for select to work, need to select ACPI_VIDEO's dependencies, ick + select VIDEO_OUTPUT_CONTROL if ACPI + select BACKLIGHT_CLASS_DEVICE if ACPI + select INPUT if ACPI + select ACPI_VIDEO if ACPI help Choose this option if you have a system that has Intel 830M, 845G, 852GM, 855GM 865G or 915G integrated graphics. If M is selected, the @@ -84,12 +90,6 @@ config DRM_I915 config DRM_I915_KMS bool "Enable modesetting on intel by default" depends on DRM_I915 - # i915 KMS depends on ACPI_VIDEO when ACPI is enabled - # but for select to work, need to select ACPI_VIDEO's dependencies, ick - select VIDEO_OUTPUT_CONTROL if ACPI - select BACKLIGHT_CLASS_DEVICE if ACPI - select INPUT if ACPI - select ACPI_VIDEO if ACPI help Choose this option if you want kernel modesetting enabled by default, and you have a new enough userspace to support this. Running old From 465440d2720543669841db5b0691ba41892ed0ae Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Mon, 25 May 2009 20:57:21 +0000 Subject: [PATCH 731/900] mlx4_en: Fix a kernel panic when waking tx queue When the transmit queue gets full we enable interrupts for TX completions There was a race that we handled the TX queue both from the interrupt context and from the transmit function. Using "spin_trylock_irq()" ensures this doesn't happen. 
Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_tx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c index ac6fc499b28..e5c98a98ad3 100644 --- a/drivers/net/mlx4/en_tx.c +++ b/drivers/net/mlx4/en_tx.c @@ -426,7 +426,7 @@ void mlx4_en_poll_tx_cq(unsigned long data) INC_PERF_COUNTER(priv->pstats.tx_poll); - if (!spin_trylock(&ring->comp_lock)) { + if (!spin_trylock_irq(&ring->comp_lock)) { mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); return; } @@ -439,7 +439,7 @@ void mlx4_en_poll_tx_cq(unsigned long data) if (inflight && priv->port_up) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); - spin_unlock(&ring->comp_lock); + spin_unlock_irq(&ring->comp_lock); } static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, @@ -482,9 +482,9 @@ static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) - if (spin_trylock(&ring->comp_lock)) { + if (spin_trylock_irq(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); - spin_unlock(&ring->comp_lock); + spin_unlock_irq(&ring->comp_lock); } } From bdb0e010bf0061a73027cc84dd7ad192c663eca3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 29 May 2009 22:04:54 -0700 Subject: [PATCH 732/900] ath1e: add new device id for asus hardware Gary Lin reports that a new device id needs to be added to the atl1e in order to get some new Asus hardware to work properly. Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. 
Miller --- drivers/net/atl1e/atl1e_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c index fb57b750866..1342418fb20 100644 --- a/drivers/net/atl1e/atl1e_main.c +++ b/drivers/net/atl1e/atl1e_main.c @@ -37,6 +37,7 @@ char atl1e_driver_version[] = DRV_VERSION; */ static struct pci_device_id atl1e_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_ATTANSIC, PCI_DEVICE_ID_ATTANSIC_L1E)}, + {PCI_DEVICE(PCI_VENDOR_ID_ATTANSIC, 0x1066)}, /* required last entry */ { 0 } }; From cf4ae4e3de83e2c7394af70b46f4f50e5f0fb90c Mon Sep 17 00:00:00 2001 From: Matt Kraai Date: Fri, 29 May 2009 22:06:33 -0700 Subject: [PATCH 733/900] net/firmare: Ignore .cis files Signed-off-by: Matt Kraai Signed-off-by: David S. Miller --- firmware/cis/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 firmware/cis/.gitignore diff --git a/firmware/cis/.gitignore b/firmware/cis/.gitignore new file mode 100644 index 00000000000..1de39847f26 --- /dev/null +++ b/firmware/cis/.gitignore @@ -0,0 +1 @@ +*.cis From 62013ab5d5df297a01ae5863b5c26d758ec0af7f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 30 May 2009 21:50:58 +0900 Subject: [PATCH 734/900] nilfs2: fix bh leak in nilfs_cpfile_delete_checkpoints function The nilfs_cpfile_delete_checkpoints() wrongly skips brelse() for the header block of checkpoint file in case of errors. This fixes the leak bug. 
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/cpfile.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index e90b60dfced..300f1cdfa86 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -311,7 +311,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) { if (ret != -ENOENT) - goto out_sem; + goto out_header; /* skip hole */ ret = 0; continue; @@ -344,7 +344,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; printk(KERN_ERR "%s: cannot delete block\n", __func__); - goto out_sem; + goto out_header; } } @@ -361,6 +361,8 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, nilfs_mdt_mark_dirty(cpfile); kunmap_atomic(kaddr, KM_USER0); } + + out_header: brelse(header_bh); out_sem: From c339dfdd65b52bfd947ab29d1210314a2f6d622d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sat, 30 May 2009 20:06:54 +0200 Subject: [PATCH 735/900] ide_pci_generic: add quirk for Netcell ATA RAID We need to explicitly mark words 85-87 as valid ones since firmware doesn't do it. 
This should fix support for LBA48 and FLUSH CACHE [EXT] command which stopped working after we applied more strict checking of identify words in: commit 942dcd85bf8edf38cdc3745306ca250684d99a61 ("ide: idedisk_supports_lba48() -> ata_id_lba48_enabled()") and commit 4b58f17d7c45a8e5f4acda641bec388398b9c0fa ("ide: ide_id_has_flush_cache() -> ata_id_flush_enabled()") Reported-and-tested-by: "Trevor Hemsley" Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-pci-generic.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c index 61111fd2713..39d4e01f5c9 100644 --- a/drivers/ide/ide-pci-generic.c +++ b/drivers/ide/ide-pci-generic.c @@ -33,6 +33,16 @@ static int ide_generic_all; /* Set to claim all devices */ module_param_named(all_generic_ide, ide_generic_all, bool, 0444); MODULE_PARM_DESC(all_generic_ide, "IDE generic will claim all unknown PCI IDE storage controllers."); +static void netcell_quirkproc(ide_drive_t *drive) +{ + /* mark words 85-87 as valid */ + drive->id[ATA_ID_CSF_DEFAULT] |= 0x4000; +} + +static const struct ide_port_ops netcell_port_ops = { + .quirkproc = netcell_quirkproc, +}; + #define DECLARE_GENERIC_PCI_DEV(extra_flags) \ { \ .name = DRV_NAME, \ @@ -74,6 +84,7 @@ static const struct ide_port_info generic_chipsets[] __devinitdata = { { /* 6: Revolution */ .name = DRV_NAME, + .port_ops = &netcell_port_ops, .host_flags = IDE_HFLAG_CLEAR_SIMPLEX | IDE_HFLAG_TRUST_BIOS_FOR_DMA | IDE_HFLAG_OFF_BOARD, From d315a0e09f1c8b833cacd5e72f3edea419978138 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 31 May 2009 23:09:22 +1000 Subject: [PATCH 736/900] crypto: hash - Fix handling of sg entry that crosses page boundary A quirk that we've always supported is having an sg entry that's bigger than a page, or more generally an sg entry that crosses page boundaries. 
Even though it would be better to explicitly have two sg entries for this, we need to support it for the existing users, in particular, IPsec. The new ahash sg walking code did try to handle this, but there was a bug where we didn't increment the page so kept on walking on the first page over and over again. This patch fixes it. Tested-by: Martin Willi Signed-off-by: Herbert Xu --- crypto/ahash.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index b2d1ee32cfe..f3476374f76 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -82,10 +82,11 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err) if (err) return err; - walk->offset = 0; - - if (nbytes) + if (nbytes) { + walk->offset = 0; + walk->pg++; return hash_walk_next(walk); + } if (!walk->total) return 0; From 52bb25a620e1925bb53d41d0ed28571b3de98a31 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 1 Jun 2009 06:21:13 +0000 Subject: [PATCH 737/900] headers_check fix: linux/auto_fs.h fix the following 'make headers_check' warnings: usr/include/linux/auto_fs.h:17: include of is preferred over Signed-off-by: Jaswinder Singh Rajput --- include/linux/auto_fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index 63265852b7d..7b09c8348fd 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -14,13 +14,12 @@ #ifndef _LINUX_AUTO_FS_H #define _LINUX_AUTO_FS_H +#include #ifdef __KERNEL__ #include #include -#include #include #else -#include #include #endif /* __KERNEL__ */ From d280cc989ad591607e812cd5c5dfde702b5f191a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 1 Jun 2009 06:23:25 +0000 Subject: [PATCH 738/900] headers_check fix: linux/net_dropmon.h fix the following 'make headers_check' warnings: usr/include/linux/net_dropmon.h:7: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput ---
include/linux/net_dropmon.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb81a63..0e2e100c44a 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -1,6 +1,7 @@ #ifndef __NET_DROPMON_H #define __NET_DROPMON_H +#include #include struct net_dm_drop_point { From 4371ee353c3fc41aad9458b8e8e627eb508bc9a3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 1 Jun 2009 02:43:17 -0700 Subject: [PATCH 739/900] MAINTAINERS: take maintainership of the cpmac Ethernet driver This patch adds me as the maintainer of the CPMAC (AR7) Ethernet driver. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2b349ba4add..111fc169437 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1532,6 +1532,13 @@ W: http://www.fi.muni.cz/~kas/cosa/ S: Maintained F: drivers/net/wan/cosa* +CPMAC ETHERNET DRIVER +P: Florian Fainelli +M: florian@openwrt.org +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/cpmac.c + CPU FREQUENCY DRIVERS P: Dave Jones M: davej@redhat.com From cf9f6e21c155d5add733b969c695837ead79eeab Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 1 Jun 2009 03:12:04 -0700 Subject: [PATCH 740/900] 3c509: Add missing EISA IDs Several EISA device IDs for 3c509 family network cards are missing from the driver, making the cards unusable in their EISA mode. Here's a fix to add them based on the EISA configuration files distributed by 3Com and our eisa.ids database. Signed-off-by: Maciej W. Rozycki Signed-off-by: David S. 
Miller --- drivers/net/3c509.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/3c509.c b/drivers/net/3c509.c index fbb37192199..682aad89708 100644 --- a/drivers/net/3c509.c +++ b/drivers/net/3c509.c @@ -480,9 +480,13 @@ static int pnp_registered; #ifdef CONFIG_EISA static struct eisa_device_id el3_eisa_ids[] = { + { "TCM5090" }, + { "TCM5091" }, { "TCM5092" }, { "TCM5093" }, + { "TCM5094" }, { "TCM5095" }, + { "TCM5098" }, { "" } }; MODULE_DEVICE_TABLE(eisa, el3_eisa_ids); From 39d8bbedb9571a89d638f5b05358f26ab503d7a6 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 1 Jun 2009 13:46:49 +0200 Subject: [PATCH 741/900] hwmon: (lm78) Add missing __devexit_p() The remove function uses __devexit, so the .remove assignment needs __devexit_p() to fix a build error with hotplug disabled. Signed-off-by: Mike Frysinger Signed-off-by: Jean Delvare --- drivers/hwmon/lm78.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm78.c b/drivers/hwmon/lm78.c index b5e3b285169..a1787fdf5b9 100644 --- a/drivers/hwmon/lm78.c +++ b/drivers/hwmon/lm78.c @@ -182,7 +182,7 @@ static struct platform_driver lm78_isa_driver = { .name = "lm78", }, .probe = lm78_isa_probe, - .remove = lm78_isa_remove, + .remove = __devexit_p(lm78_isa_remove), }; From d54d462472a16fc07adb53a2fcd6c0c2a9a8dd1d Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Mon, 1 Jun 2009 13:46:50 +0200 Subject: [PATCH 742/900] hwmon: Update documentation on fan_max Add fan_max description. Add fan limit alarm 'max_alarm' to the alarm section. 
Signed-off-by: Christian Engelmayer Acked-by: Hans de Goede Signed-off-by: Jean Delvare --- Documentation/hwmon/sysfs-interface | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface index 2f10ce6a879..004ee161721 100644 --- a/Documentation/hwmon/sysfs-interface +++ b/Documentation/hwmon/sysfs-interface @@ -150,6 +150,11 @@ fan[1-*]_min Fan minimum value Unit: revolution/min (RPM) RW +fan[1-*]_max Fan maximum value + Unit: revolution/min (RPM) + Only rarely supported by the hardware. + RW + fan[1-*]_input Fan input value. Unit: revolution/min (RPM) RO @@ -390,6 +395,7 @@ OR in[0-*]_min_alarm in[0-*]_max_alarm fan[1-*]_min_alarm +fan[1-*]_max_alarm temp[1-*]_min_alarm temp[1-*]_max_alarm temp[1-*]_crit_alarm From fb39125fd79a25c5002f3b45cf4c80e3fa6b961b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 17 Apr 2009 15:15:51 +0800 Subject: [PATCH 743/900] ftrace, workqueuetrace: make workqueue tracepoints use TRACE_EVENT macro v3: zhaolei@cn.fujitsu.com: Change TRACE_EVENT definition to new format introduced by Steven Rostedt: consolidate trace and trace_event headers v2: kosaki@jp.fujitsu.com: print the function names instead of addr, and zap the work addr v1: zhaolei@cn.fujitsu.com: Make workqueue tracepoints use TRACE_EVENT macro TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to the tracepoints: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Then, this patch converts DEFINE_TRACE to TRACE_EVENT in workqueue related tracepoints. 
[ Impact: expand workqueue tracer to events tracing ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: KOSAKI Motohiro Signed-off-by: Frederic Weisbecker --- include/trace/events/workqueue.h | 100 +++++++++++++++++++++++++++++++ include/trace/workqueue.h | 25 -------- kernel/trace/trace_workqueue.c | 2 +- kernel/workqueue.c | 11 +--- 4 files changed, 103 insertions(+), 35 deletions(-) create mode 100644 include/trace/events/workqueue.h delete mode 100644 include/trace/workqueue.h diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h new file mode 100644 index 00000000000..035f1bff288 --- /dev/null +++ b/include/trace/events/workqueue.h @@ -0,0 +1,100 @@ +#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WORKQUEUE_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM workqueue + +TRACE_EVENT(workqueue_insertion, + + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + + TP_ARGS(wq_thread, work), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + __field(work_func_t, func) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + __entry->func = work->func; + ), + + TP_printk("thread=%s:%d func=%pF", __entry->thread_comm, + __entry->thread_pid, __entry->func) +); + +TRACE_EVENT(workqueue_execution, + + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + + TP_ARGS(wq_thread, work), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + __field(work_func_t, func) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + __entry->func = work->func; + ), + + TP_printk("thread=%s:%d func=%pF", __entry->thread_comm, + __entry->thread_pid, __entry->func) +); + +/* 
Trace the creation of one workqueue thread on a cpu */ +TRACE_EVENT(workqueue_creation, + + TP_PROTO(struct task_struct *wq_thread, int cpu), + + TP_ARGS(wq_thread, cpu), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + __field(int, cpu) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + __entry->cpu = cpu; + ), + + TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm, + __entry->thread_pid, __entry->cpu) +); + +TRACE_EVENT(workqueue_destruction, + + TP_PROTO(struct task_struct *wq_thread), + + TP_ARGS(wq_thread), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + ), + + TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid) +); + +#endif /* _TRACE_WORKQUEUE_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h deleted file mode 100644 index 7626523deeb..00000000000 --- a/include/trace/workqueue.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __TRACE_WORKQUEUE_H -#define __TRACE_WORKQUEUE_H - -#include -#include -#include - -DECLARE_TRACE(workqueue_insertion, - TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), - TP_ARGS(wq_thread, work)); - -DECLARE_TRACE(workqueue_execution, - TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), - TP_ARGS(wq_thread, work)); - -/* Trace the creation of one workqueue thread on a cpu */ -DECLARE_TRACE(workqueue_creation, - TP_PROTO(struct task_struct *wq_thread, int cpu), - TP_ARGS(wq_thread, cpu)); - -DECLARE_TRACE(workqueue_destruction, - TP_PROTO(struct task_struct *wq_thread), - TP_ARGS(wq_thread)); - -#endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 
984b9175c13..cfe56d31d85 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -6,7 +6,7 @@ */ -#include +#include #include #include #include "trace_stat.h" diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f71fb2a0895..0668795d881 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,7 +33,8 @@ #include #include #include -#include +#define CREATE_TRACE_POINTS +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -DEFINE_TRACE(workqueue_insertion); - static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { @@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -DEFINE_TRACE(workqueue_execution); - static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } -DEFINE_TRACE(workqueue_creation); - static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); -DEFINE_TRACE(workqueue_destruction); - static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* From 1fdfca9c577aac96a559c1ea68f5c9156f17d636 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 14:58:26 +0800 Subject: [PATCH 744/900] trace_workqueue: use list_for_each_entry() instead of list_for_each_entry_safe() No need to use list_for_each_entry_safe() in iteration without deleting any node, we can use list_for_each_entry() instead. 
[ Impact: cleanup ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index cfe56d31d85..128b64b93f1 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -47,12 +47,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { atomic_inc(&node->inserted); goto found; @@ -69,12 +68,11 @@ probe_workqueue_execution(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { node->executed++; goto found; From b8867164f05791a6b5363bd51c1274e03600886e Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 14:59:36 +0800 Subject: [PATCH 745/900] trace_workqueue: remove cpu_workqueue_stats->first_entry cpu_workqueue_stats->first_entry is useless because we can retrieve the header of a cpu workqueue using: if (&cpu_workqueue_stats->list == workqueue_cpu_stat(cpu)->list.next) [ Impact: cleanup ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton 
Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 128b64b93f1..890974aed64 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -16,8 +16,6 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; -/* Useful to know if we print the cpu headers */ - bool first_entry; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -103,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_empty(&workqueue_cpu_stat(cpu)->list)) - cws->first_entry = true; list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } From f3c4ae26e93d354152196b62797ba86ad86dd0cc Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 15:02:17 +0800 Subject: [PATCH 746/900] trace_workqueue: remove blank line between each cpu The blankline between each cpu's workqueue stat is not necessary, because the cpu number is enough to part them by eye. Old style also caused a blankline below headline, and made code complex by using lock, disableirq and get cpu var. Old style: # CPU INSERTED EXECUTED NAME # | | | | 0 8644 8644 events/0 0 0 0 cpuset ... 0 1 1 kdmflush 1 35365 35365 events/1 ... New style: # CPU INSERTED EXECUTED NAME # | | | | 0 8644 8644 events/0 0 0 0 cpuset ... 0 1 1 kdmflush 1 35365 35365 events/1 ... 
[ Impact: provide more readable code ] Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 890974aed64..97fcea4acce 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -185,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx) static int workqueue_stat_show(struct seq_file *s, void *p) { struct cpu_workqueue_stats *cws = p; - unsigned long flags; - int cpu = cws->cpu; struct pid *pid; struct task_struct *tsk; - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); From 0d64f8342de26d02451900b1aad94716fe92c4ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 05:58:49 +0200 Subject: [PATCH 747/900] tracing/stat: replace trace_stat_session by stat_session The "trace" prefix in struct trace_stat_session type is annoying while reading the trace_stat.c file. It makes the lines longer, and is not that much useful to explain the sense of this type. Just keep "struct stat_session" for this type. 
[ Impact: make the code a bit more readable ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index fdde3a4a94c..3b6816be825 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -22,7 +22,7 @@ struct trace_stat_list { }; /* A stat session is the stats output in one file */ -struct tracer_stat_session { +struct stat_session { struct list_head session_list; struct tracer_stat *ts; struct list_head stat_list; @@ -38,7 +38,7 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); static struct dentry *stat_dir; -static void reset_stat_session(struct tracer_stat_session *session) +static void reset_stat_session(struct stat_session *session) { struct trace_stat_list *node, *next; @@ -48,7 +48,7 @@ static void reset_stat_session(struct tracer_stat_session *session) INIT_LIST_HEAD(&session->stat_list); } -static void destroy_session(struct tracer_stat_session *session) +static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); reset_stat_session(session); @@ -71,7 +71,7 @@ static int dummy_cmp(void *p1, void *p2) * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. 
*/ -static int stat_seq_init(struct tracer_stat_session *session) +static int stat_seq_init(struct stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; @@ -154,7 +154,7 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&session->stat_mutex); @@ -168,7 +168,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; if (p == SEQ_START_TOKEN) return seq_list_start(&session->stat_list, *pos); @@ -178,13 +178,13 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) static void stat_seq_stop(struct seq_file *s, void *p) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; mutex_unlock(&session->stat_mutex); } static int stat_seq_show(struct seq_file *s, void *v) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); if (v == SEQ_START_TOKEN) @@ -205,7 +205,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - struct tracer_stat_session *session = inode->i_private; + struct stat_session *session = inode->i_private; ret = seq_open(file, &trace_stat_seq_ops); if (!ret) { @@ -222,7 +222,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) */ static int tracing_stat_release(struct inode *i, struct file *f) { - struct tracer_stat_session *session = i->i_private; + struct stat_session *session = i->i_private; mutex_lock(&session->stat_mutex); reset_stat_session(session); @@ -251,7 +251,7 @@ static int 
tracing_stat_init(void) return 0; } -static int init_stat_file(struct tracer_stat_session *session) +static int init_stat_file(struct stat_session *session) { if (!stat_dir && tracing_stat_init()) return -ENODEV; @@ -266,7 +266,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *session, *node, *tmp; + struct stat_session *session, *node, *tmp; int ret; if (!trace) @@ -286,7 +286,7 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + session = kmalloc(sizeof(struct stat_session), GFP_KERNEL); if (!session) return -ENOMEM; @@ -312,7 +312,7 @@ int register_stat_tracer(struct tracer_stat *trace) void unregister_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *node, *tmp; + struct stat_session *node, *tmp; mutex_lock(&all_stat_sessions_mutex); list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { From 8f184f27300f66f6dcc8296c2dae7a1fbe8429c9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 06:24:36 +0200 Subject: [PATCH 748/900] tracing/stat: replace linked list by an rbtree for sorting When the stat tracing framework prepares the entries from a tracer to output them to the user, it starts by computing a linear sort through a linked list to give the entries ordered by relevance to the user. This is quite ugly and causes a small latency when we begin to read the file. This patch changes that by turning the linked list into a red-black tree. Although the whole iteration using the start and next tracer callbacks while opening the file remains the same, it is now much faster and scalable. The rbtree guarantees O(log(n)) insertions whereas a linked list with linear sorting brought us a O(n) despair. Now the (visible) latency has disappeared. 
[ Impact: kill the latency while starting to read a stat tracer file ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 140 +++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 40 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 3b6816be825..0bd0fc82da5 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,7 +1,7 @@ /* * Infrastructure for statistic tracing (histogram output). * - * Copyright (C) 2008 Frederic Weisbecker + * Copyright (C) 2008-2009 Frederic Weisbecker * * Based on the code from trace_branch.c which is * Copyright (C) 2008 Steven Rostedt @@ -10,14 +10,19 @@ #include +#include #include #include "trace_stat.h" #include "trace.h" -/* List of stat entries from a tracer */ -struct trace_stat_list { - struct list_head list; +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; void *stat; }; @@ -25,7 +30,7 @@ struct trace_stat_list { struct stat_session { struct list_head session_list; struct tracer_stat *ts; - struct list_head stat_list; + struct rb_root stat_root; struct mutex stat_mutex; struct dentry *file; }; @@ -37,15 +42,45 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessary release one at each iteration + * but it will at least advance closer to the next one + * to be released. 
+ */ +static struct rb_node *release_next(struct rb_node *node) +{ + struct stat_node *snode; + struct rb_node *parent = rb_parent(node); + + if (node->rb_left) + return node->rb_left; + else if (node->rb_right) + return node->rb_right; + else { + if (!parent) + return NULL; + if (parent->rb_left == node) + parent->rb_left = NULL; + else + parent->rb_right = NULL; + + snode = container_of(node, struct stat_node, node); + kfree(snode); + + return parent; + } +} static void reset_stat_session(struct stat_session *session) { - struct trace_stat_list *node, *next; + struct rb_node *node = session->stat_root.rb_node; - list_for_each_entry_safe(node, next, &session->stat_list, list) - kfree(node); + while (node) + node = release_next(node); - INIT_LIST_HEAD(&session->stat_list); + session->stat_root = RB_ROOT; } static void destroy_session(struct stat_session *session) @@ -56,6 +91,35 @@ static void destroy_session(struct stat_session *session) kfree(session); } +typedef int (*cmp_stat_t)(void *, void *); + +static void +insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + /* * For tracers that don't provide a stat_cmp callback. 
* This one will force an immediate insertion on tail of @@ -73,8 +137,9 @@ static int dummy_cmp(void *p1, void *p2) */ static int stat_seq_init(struct stat_session *session) { - struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; + struct stat_node *new_entry; + struct rb_root *root; void *stat; int ret = 0; int i; @@ -93,15 +158,13 @@ static int stat_seq_init(struct stat_session *session) * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. */ - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; goto exit; } - - INIT_LIST_HEAD(&new_entry->list); - - list_add(&new_entry->list, &session->stat_list); + root = &session->stat_root; + insert_stat(root, new_entry, dummy_cmp); new_entry->stat = stat; @@ -116,31 +179,17 @@ static int stat_seq_init(struct stat_session *session) if (!stat) break; - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; goto exit_free_list; } - INIT_LIST_HEAD(&new_entry->list); new_entry->stat = stat; - list_for_each_entry_reverse(iter_entry, &session->stat_list, - list) { - - /* Insertion with a descendent sorting */ - if (ts->stat_cmp(iter_entry->stat, - new_entry->stat) >= 0) { - - list_add(&new_entry->list, &iter_entry->list); - break; - } - } - - /* The current larger value */ - if (list_empty(&new_entry->list)) - list_add(&new_entry->list, &session->stat_list); + insert_stat(root, new_entry, ts->stat_cmp); } + exit: mutex_unlock(&session->stat_mutex); return ret; @@ -155,25 +204,38 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; + struct rb_node *node; + int i; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning 
of the file, print the headers */ - if (!*pos && session->ts->stat_headers) + if (!*pos && session->ts->stat_headers) { + (*pos)++; return SEQ_START_TOKEN; + } - return seq_list_start(&session->stat_list, *pos); + node = rb_first(&session->stat_root); + for (i = 0; node && i < *pos; i++) + node = rb_next(node); + + (*pos)++; + + return node; } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; if (p == SEQ_START_TOKEN) - return seq_list_start(&session->stat_list, *pos); + return rb_first(&session->stat_root); - return seq_list_next(p, &session->stat_list, pos); + return rb_next(node); } static void stat_seq_stop(struct seq_file *s, void *p) @@ -185,7 +247,7 @@ static void stat_seq_stop(struct seq_file *s, void *p) static int stat_seq_show(struct seq_file *s, void *v) { struct stat_session *session = s->private; - struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + struct stat_node *l = container_of(v, struct stat_node, node); if (v == SEQ_START_TOKEN) return session->ts->stat_headers(s); @@ -286,15 +348,13 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct stat_session), GFP_KERNEL); + session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); - INIT_LIST_HEAD(&session->stat_list); mutex_init(&session->stat_mutex); - session->file = NULL; ret = init_stat_file(session); if (ret) { From b3dd7ba7d862707800c7ac45068f14ade2b65155 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:04:26 +0800 Subject: [PATCH 749/900] tracing/stat: change dummpy_cmp() to return -1 Currently the output of trace_stat/workqueues is totally reversed: # cat /debug/tracing/trace_stat/workqueues ... 
1 17 17 210 37 `-blk_unplug_work+0x0/0x57 1 3779 3779 181 11 |-cfq_kick_queue+0x0/0x2f 1 3796 3796 kblockd/1:120 ... The correct output should be: 1 3796 3796 kblockd/1:120 1 3779 3779 181 11 |-cfq_kick_queue+0x0/0x2f 1 17 17 210 37 `-blk_unplug_work+0x0/0x57 It's caused by "tracing/stat: replace linked list by an rbtree for sorting" (53059c9b67a62a3dc8c80204d3da42b9267ea5a0). dummpy_cmp() should return -1, so rb_node will always be inserted as right-most node in the rbtree, thus we sort the output in ascending order. [ Impact: fix the output of trace_stat/workqueues ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 0bd0fc82da5..5816d1aebcc 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -127,7 +127,7 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) */ static int dummy_cmp(void *p1, void *p2) { - return 1; + return -1; } /* From e16228069083a2f6b94383ac5739aea7a0f38ce4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:04:48 +0800 Subject: [PATCH 750/900] tracing/stat: remember to free root node When closing a trace_stat file, we destroy the rbtree constructed during file open, but there is memory leak that the root node is not freed. 
[ Impact: fix memory leak when closing a trace_stat file ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 5816d1aebcc..8030ec98dba 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -60,8 +60,8 @@ static struct rb_node *release_next(struct rb_node *node) return node->rb_right; else { if (!parent) - return NULL; - if (parent->rb_left == node) + ; + else if (parent->rb_left == node) parent->rb_left = NULL; else parent->rb_right = NULL; From dbd3fbdfeecfad4e71139db05d72560c3583e2a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:42:46 +0800 Subject: [PATCH 751/900] tracing/stat: do some cleanups - remove duplicate code in stat_seq_init() - update comments to reflect the change from stat list to stat rbtree [ Impact: clean up ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 54 +++++++++++++++------------------------ 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8030ec98dba..17f20ebdad2 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -93,10 +93,15 @@ static void destroy_session(struct stat_session *session) typedef int (*cmp_stat_t)(void *, void *); -static void -insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; /* * Figure out where to put new node @@ -118,12 +123,13 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) rb_link_node(&data->node, parent, new); rb_insert_color(&data->node, root); + return 0; } /* * For 
tracers that don't provide a stat_cmp callback. - * This one will force an immediate insertion on tail of - * the list. + * This one will force an insertion as right-most node + * in the rbtree. */ static int dummy_cmp(void *p1, void *p2) { @@ -131,15 +137,14 @@ static int dummy_cmp(void *p1, void *p2) } /* - * Initialize the stat list at each trace_stat file opening. + * Initialize the stat rbtree at each trace_stat file opening. * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ static int stat_seq_init(struct stat_session *session) { struct tracer_stat *ts = session->ts; - struct stat_node *new_entry; - struct rb_root *root; + struct rb_root *root = &session->stat_root; void *stat; int ret = 0; int i; @@ -154,23 +159,12 @@ static int stat_seq_init(struct stat_session *session) if (!stat) goto exit; - /* - * The first entry. Actually this is the second, but the first - * one (the stat_list head) is pointless. - */ - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) goto exit; - } - root = &session->stat_root; - insert_stat(root, new_entry, dummy_cmp); - - new_entry->stat = stat; /* - * Iterate over the tracer stat entries and store them in a sorted - * list. + * Iterate over the tracer stat entries and store them in an rbtree. 
*/ for (i = 1; ; i++) { stat = ts->stat_next(stat, i); @@ -179,22 +173,16 @@ static int stat_seq_init(struct stat_session *session) if (!stat) break; - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; - goto exit_free_list; - } - - new_entry->stat = stat; - - insert_stat(root, new_entry, ts->stat_cmp); + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; } exit: mutex_unlock(&session->stat_mutex); return ret; -exit_free_list: +exit_free_rbtree: reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; @@ -207,7 +195,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) struct rb_node *node; int i; - /* Prevent from tracer switch or stat_list modification */ + /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ @@ -280,7 +268,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) } /* - * Avoid consuming memory with our now useless list. + * Avoid consuming memory with our now useless rbtree. */ static int tracing_stat_release(struct inode *i, struct file *f) { From 43bd1236234cacbc18d1476a9b57e7a306efddf5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 30 May 2009 04:25:30 +0200 Subject: [PATCH 752/900] tracing/stat: remove inappropriate safe walk on list register_stat_tracer() uses list_for_each_entry_safe to check whether a tracer is already present in the list.
But we don't delete anything from the list here, so we don't need the safe version. [ Impact: cleanup list use in stat tracing ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 17f20ebdad2..c00643733f4 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -316,7 +316,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct stat_session *session, *node, *tmp; + struct stat_session *session, *node; int ret; if (!trace) @@ -327,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? */ mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) { mutex_unlock(&all_stat_sessions_mutex); return -EINVAL; From 58f892e022e88438183c48661dcdc6a2997dab99 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 26 May 2009 21:48:07 +0000 Subject: [PATCH 753/900] x86: Print real IOAPIC version for x86-64 Fix the fact that the IOAPIC version number in the x86_64 code path always gets assigned to 0, instead of the correct value.
Before the patch: (from "dmesg" output): ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 0, address 0xfec00000, GSI 0-23 <--- After the patch: ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23 <--- History: io_apic_get_version() was compiled out of the x86_64 code path in the commit f2c2cca3acef8b253a36381d9b469ad4fb08563a: Author: Andi Kleen Date: Tue Sep 26 10:52:37 2006 +0200 [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com At the time, the IOAPIC version number was deliberately not printed in the x86_64 code path. However, after the x86 and x86_64 files were merged, the net result is that the IOAPIC version is printed incorrectly in the x86_64 code path. The patch below provides a fix. I have tested it with acpi, and with acpi=off, and did not see any problems. 
Signed-off-by: Naga Chumbalkar Acked-by: Yinghai Lu LKML-Reference: <20090416014230.4885.94926.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar ************************* --- arch/x86/kernel/acpi/boot.c | 5 +---- arch/x86/kernel/apic/io_apic.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 844e5e25213..631086159c5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac7f3b6ad58..f712f8ff403 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4012,6 +4012,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -4024,7 +4025,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { From 3d58829b0510244596079c1d2f1762c53aef2e97 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:47 +0200 Subject: [PATCH 754/900] x86, apic: Restore irqs on fail paths lapic_resume forgets to restore interrupts on fail paths. Fix that. Signed-off-by: Jiri Slaby Acked-by: Cyrill Gorcunov LKML-Reference: <1243497289-18591-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar Cc: H. 
Peter Anvin --- arch/x86/kernel/apic/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b0fd26442c4..e82488d3f0b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2027,7 +2027,7 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) @@ -2038,14 +2038,15 @@ static int lapic_resume(struct sys_device *dev) ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); @@ -2097,10 +2098,10 @@ static int lapic_resume(struct sys_device *dev) restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } - +restore: local_irq_restore(flags); - return 0; + return ret; } /* From 60e59f68824102c87e64c5f51c4e57c0b8a61e46 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 24 May 2009 20:34:10 +0000 Subject: [PATCH 755/900] powerpc/pmac: Update PowerMac 32-bit defconfig This mostly adds back AppleTouch support and adds CONFIG_HIGHMEM by default. 
Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/configs/pmac32_defconfig | 278 ++++++++++++++++++-------- 1 file changed, 195 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 5339bb44cce..ea8870a3448 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.28-rc3 -# Tue Nov 11 19:36:51 2008 +# Linux kernel version: 2.6.30-rc7 +# Mon May 25 14:53:25 2009 # # CONFIG_PPC64 is not set @@ -14,6 +14,7 @@ CONFIG_6xx=y # CONFIG_40x is not set # CONFIG_44x is not set # CONFIG_E200 is not set +CONFIG_PPC_BOOK3S=y CONFIG_PPC_FPU=y CONFIG_ALTIVEC=y CONFIG_PPC_STD_MMU=y @@ -43,7 +44,7 @@ CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_PPC=y CONFIG_EARLY_PRINTK=y CONFIG_GENERIC_NVRAM=y -CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_PPC_OF=y CONFIG_OF=y @@ -52,12 +53,14 @@ CONFIG_OF=y CONFIG_AUDIT_ARCH=y CONFIG_GENERIC_BUG=y CONFIG_SYS_SUPPORTS_APM_EMULATION=y +CONFIG_DTC=y # CONFIG_DEFAULT_UIMAGE is not set CONFIG_HIBERNATE_32=y CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y # CONFIG_PPC_DCR_NATIVE is not set # CONFIG_PPC_DCR_MMIO is not set +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # @@ -72,14 +75,24 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set + +# +# RCU Subsystem +# +CONFIG_CLASSIC_RCU=y +# CONFIG_TREE_RCU is not set +# CONFIG_PREEMPT_RCU is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_CGROUPS is not set # CONFIG_GROUP_SCHED is not set +# CONFIG_CGROUPS is not 
set CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y # CONFIG_RELAY is not set @@ -88,23 +101,27 @@ CONFIG_NAMESPACES=y # CONFIG_IPC_NS is not set # CONFIG_USER_NS is not set # CONFIG_PID_NS is not set +# CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -114,10 +131,12 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y # CONFIG_MARKERS is not set CONFIG_OPROFILE=y CONFIG_HAVE_OPROFILE=y @@ -127,10 +146,10 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +# CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y -# CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 CONFIG_MODULES=y # CONFIG_MODULE_FORCE_LOAD is not set @@ -138,11 +157,8 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set -CONFIG_KMOD=y CONFIG_BLOCK=y CONFIG_LBD=y -# CONFIG_BLK_DEV_IO_TRACE is not set -CONFIG_LSF=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set @@ -158,14 +174,11 @@ CONFIG_DEFAULT_AS=y # CONFIG_DEFAULT_CFQ is not set # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="anticipatory" -CONFIG_CLASSIC_RCU=y CONFIG_FREEZER=y # # Platform support # -CONFIG_PPC_MULTIPLATFORM=y 
-CONFIG_CLASSIC32=y # CONFIG_PPC_CHRP is not set # CONFIG_MPC5121_ADS is not set # CONFIG_MPC5121_GENERIC is not set @@ -178,7 +191,9 @@ CONFIG_PPC_PMAC=y # CONFIG_PPC_83xx is not set # CONFIG_PPC_86xx is not set # CONFIG_EMBEDDED6xx is not set +# CONFIG_AMIGAONE is not set CONFIG_PPC_NATIVE=y +CONFIG_PPC_OF_BOOT_TRAMPOLINE=y # CONFIG_IPIC is not set CONFIG_MPIC=y # CONFIG_MPIC_WEIRD is not set @@ -212,11 +227,12 @@ CONFIG_CPU_FREQ_PMAC=y CONFIG_PPC601_SYNC_FIX=y # CONFIG_TAU is not set # CONFIG_FSL_ULI1575 is not set +# CONFIG_SIMPLE_GPIO is not set # # Kernel options # -# CONFIG_HIGHMEM is not set +CONFIG_HIGHMEM=y CONFIG_TICK_ONESHOT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y @@ -239,6 +255,7 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_ARCH_HAS_WALK_MEMORY=y CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y # CONFIG_KEXEC is not set +# CONFIG_CRASH_DUMP is not set CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_ARCH_POPULATES_NODE_MAP=y CONFIG_SELECT_MEMORY_MODEL=y @@ -250,12 +267,17 @@ CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 # CONFIG_MIGRATION is not set -# CONFIG_RESOURCES_64BIT is not set # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y +CONFIG_PPC_4K_PAGES=y +# CONFIG_PPC_16K_PAGES is not set +# CONFIG_PPC_64K_PAGES is not set +# CONFIG_PPC_256K_PAGES is not set CONFIG_FORCE_MAX_ZONEORDER=11 CONFIG_PROC_DEVICETREE=y # CONFIG_CMDLINE_BOOL is not set @@ -288,6 +310,8 @@ CONFIG_ARCH_SUPPORTS_MSI=y # CONFIG_PCI_MSI is not set # CONFIG_PCI_LEGACY is not set # CONFIG_PCI_DEBUG is not set +# CONFIG_PCI_STUB is not set +# CONFIG_PCI_IOV is not set CONFIG_PCCARD=m # CONFIG_PCMCIA_DEBUG is not set CONFIG_PCMCIA=m @@ -397,6 +421,8 @@ CONFIG_NETFILTER_XTABLES=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m # CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set # CONFIG_NETFILTER_XT_TARGET_DSCP is not set +CONFIG_NETFILTER_XT_TARGET_HL=m +# 
CONFIG_NETFILTER_XT_TARGET_LED is not set CONFIG_NETFILTER_XT_TARGET_MARK=m CONFIG_NETFILTER_XT_TARGET_NFLOG=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m @@ -405,6 +431,7 @@ CONFIG_NETFILTER_XT_TARGET_RATEEST=m CONFIG_NETFILTER_XT_TARGET_TRACE=m CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m +# CONFIG_NETFILTER_XT_MATCH_CLUSTER is not set CONFIG_NETFILTER_XT_MATCH_COMMENT=m # CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m @@ -415,6 +442,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m # CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_LIMIT=m @@ -478,17 +506,15 @@ CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_IP_DCCP=m CONFIG_INET_DCCP_DIAG=m -CONFIG_IP_DCCP_ACKVEC=y # # DCCP CCIDs Configuration (EXPERIMENTAL) # -CONFIG_IP_DCCP_CCID2=m # CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=m +CONFIG_IP_DCCP_CCID3=y # CONFIG_IP_DCCP_CCID3_DEBUG is not set CONFIG_IP_DCCP_CCID3_RTO=100 -CONFIG_IP_DCCP_TFRC_LIB=m +CONFIG_IP_DCCP_TFRC_LIB=y # # DCCP Kernel Hacking @@ -508,13 +534,16 @@ CONFIG_IP_DCCP_TFRC_LIB=m # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set # CONFIG_NET_SCHED is not set CONFIG_NET_CLS_ROUTE=y +# CONFIG_DCB is not set # # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_DROP_MONITOR is not set # CONFIG_HAMRADIO is not set # CONFIG_CAN is not set CONFIG_IRDA=m @@ -577,8 +606,6 @@ CONFIG_BT_HIDP=m # # Bluetooth device drivers # -CONFIG_BT_HCIUSB=m -# CONFIG_BT_HCIUSB_SCO is not set # CONFIG_BT_HCIBTUSB is not set # CONFIG_BT_HCIUART is not set CONFIG_BT_HCIBCM203X=m @@ -590,31 +617,27 @@ CONFIG_BT_HCIBFUSB=m # CONFIG_BT_HCIBTUART is not set # CONFIG_BT_HCIVHCI is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is 
not set CONFIG_WIRELESS=y CONFIG_CFG80211=m -CONFIG_NL80211=y +# CONFIG_CFG80211_REG_DEBUG is not set CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y +# CONFIG_LIB80211 is not set CONFIG_MAC80211=m # # Rate control algorithm selection # -CONFIG_MAC80211_RC_PID=y -# CONFIG_MAC80211_RC_MINSTREL is not set -CONFIG_MAC80211_RC_DEFAULT_PID=y -# CONFIG_MAC80211_RC_DEFAULT_MINSTREL is not set -CONFIG_MAC80211_RC_DEFAULT="pid" +CONFIG_MAC80211_RC_MINSTREL=y +# CONFIG_MAC80211_RC_DEFAULT_PID is not set +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel" # CONFIG_MAC80211_MESH is not set CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set # CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_IEEE80211=m -# CONFIG_IEEE80211_DEBUG is not set -CONFIG_IEEE80211_CRYPT_WEP=m -CONFIG_IEEE80211_CRYPT_CCMP=m -CONFIG_IEEE80211_CRYPT_TKIP=m +# CONFIG_WIMAX is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -662,17 +685,27 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 # CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_PHANTOM is not set -# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set +# CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set +# CONFIG_ISL29003 is not set +# CONFIG_C2PORT is not set + +# +# EEPROM support +# +# CONFIG_EEPROM_AT24 is not set +# CONFIG_EEPROM_LEGACY is not set +# CONFIG_EEPROM_93CX6 is not set CONFIG_HAVE_IDE=y CONFIG_IDE=y # # Please see Documentation/ide/ide.txt for help/info on IDE drives # +CONFIG_IDE_XFER_MODE=y CONFIG_IDE_TIMINGS=y CONFIG_IDE_ATAPI=y # CONFIG_BLK_DEV_IDE_SATA is not set @@ -684,7 +717,6 @@ CONFIG_BLK_DEV_IDECS=m CONFIG_BLK_DEV_IDECD=y CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS=y # CONFIG_BLK_DEV_IDETAPE is not set -CONFIG_BLK_DEV_IDESCSI=y # CONFIG_IDE_TASK_IOCTL is not set CONFIG_IDE_PROC_FS=y @@ -714,6 +746,7 @@ CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_JMICRON is not set # 
CONFIG_BLK_DEV_SC1200 is not set # CONFIG_BLK_DEV_PIIX is not set +# CONFIG_BLK_DEV_IT8172 is not set # CONFIG_BLK_DEV_IT8213 is not set # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set @@ -728,7 +761,6 @@ CONFIG_BLK_DEV_SL82C105=y # CONFIG_BLK_DEV_TC86C001 is not set CONFIG_BLK_DEV_IDE_PMAC=y CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y -CONFIG_BLK_DEV_IDEDMA_PMAC=y CONFIG_BLK_DEV_IDEDMA=y # @@ -772,6 +804,7 @@ CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_SRP_ATTRS is not set CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set +# CONFIG_SCSI_CXGB3_ISCSI is not set # CONFIG_BLK_DEV_3W_XXXX_RAID is not set # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set @@ -791,8 +824,12 @@ CONFIG_SCSI_AIC7XXX_OLD=m # CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set # CONFIG_MEGARAID_SAS is not set +# CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set +# CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -822,6 +859,7 @@ CONFIG_SCSI_MAC53C94=y # CONFIG_SCSI_SRP is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set # CONFIG_ATA is not set CONFIG_MD=y CONFIG_BLK_DEV_MD=m @@ -881,6 +919,7 @@ CONFIG_THERM_ADT746X=m # CONFIG_ANSLCD is not set CONFIG_PMAC_RACKMETER=m CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_DUMMY=m # CONFIG_BONDING is not set # CONFIG_MACVLAN is not set @@ -898,6 +937,8 @@ CONFIG_BMAC=y CONFIG_SUNGEM=y # CONFIG_CASSINI is not set # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set # CONFIG_NET_TULIP is not set # CONFIG_HP100 is not set # CONFIG_IBM_NEW_EMAC_ZMII is not set @@ -913,7 +954,6 @@ CONFIG_PCNET32=y # CONFIG_ADAPTEC_STARFIRE is not set # CONFIG_B44 is not set # CONFIG_FORCEDETH is not set -# CONFIG_EEPRO100 is not set # 
CONFIG_E100 is not set # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set @@ -923,6 +963,7 @@ CONFIG_PCNET32=y # CONFIG_R6040 is not set # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set +# CONFIG_SMSC9420 is not set # CONFIG_SUNDANCE is not set # CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set @@ -935,6 +976,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -945,18 +987,20 @@ CONFIG_NETDEV_1000=y # CONFIG_VIA_VELOCITY is not set # CONFIG_TIGON3 is not set # CONFIG_BNX2 is not set -# CONFIG_MV643XX_ETH is not set # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set +CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_CHELSIO_T3 is not set # CONFIG_ENIC is not set # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -966,6 +1010,7 @@ CONFIG_NETDEV_10000=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set # CONFIG_TR is not set # @@ -974,20 +1019,11 @@ CONFIG_NETDEV_10000=y # CONFIG_WLAN_PRE80211 is not set CONFIG_WLAN_80211=y # CONFIG_PCMCIA_RAYCS is not set -# CONFIG_IPW2100 is not set -# CONFIG_IPW2200 is not set # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -CONFIG_HERMES=m -CONFIG_APPLE_AIRPORT=m -# CONFIG_PLX_HERMES is not set -# CONFIG_TMD_HERMES is not set -# CONFIG_NORTEL_HERMES is not set -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m -# CONFIG_PCMCIA_SPECTRUM is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set CONFIG_PRISM54=m @@ -997,15 +1033,17 @@ 
CONFIG_PRISM54=m # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set CONFIG_P54_COMMON=m # CONFIG_P54_USB is not set # CONFIG_P54_PCI is not set +CONFIG_P54_LEDS=y # CONFIG_ATH5K is not set # CONFIG_ATH9K is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_AR9170_USB is not set +# CONFIG_IPW2100 is not set +# CONFIG_IPW2200 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set CONFIG_B43=m CONFIG_B43_PCI_AUTOSELECT=y @@ -1025,6 +1063,19 @@ CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y # CONFIG_B43LEGACY_PIO_MODE is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +CONFIG_HERMES=m +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_APPLE_AIRPORT=m +# CONFIG_PLX_HERMES is not set +# CONFIG_TMD_HERMES is not set +# CONFIG_NORTEL_HERMES is not set +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +# CONFIG_PCMCIA_SPECTRUM is not set + +# +# Enable WiMAX (Networking options) to see the WiMAX drivers +# # # USB Network Adapters @@ -1036,6 +1087,7 @@ CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m CONFIG_USB_NET_CDCETHER=m +# CONFIG_USB_NET_CDC_EEM is not set # CONFIG_USB_NET_DM9601 is not set # CONFIG_USB_NET_SMSC95XX is not set # CONFIG_USB_NET_GL620A is not set @@ -1099,7 +1151,7 @@ CONFIG_INPUT_KEYBOARD=y CONFIG_INPUT_MOUSE=y # CONFIG_MOUSE_PS2 is not set # CONFIG_MOUSE_SERIAL is not set -# CONFIG_MOUSE_APPLETOUCH is not set +CONFIG_MOUSE_APPLETOUCH=y # CONFIG_MOUSE_BCM5974 is not set # CONFIG_MOUSE_VSXXXAA is not set # CONFIG_INPUT_JOYSTICK is not set @@ -1150,10 +1202,13 @@ CONFIG_SERIAL_PMACZILOG_TTYS=y # CONFIG_SERIAL_JSM is not set # CONFIG_SERIAL_OF_PLATFORM is not set CONFIG_UNIX98_PTYS=y +# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_HVC_UDBG is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=m +# 
CONFIG_HW_RANDOM_TIMERIOMEM is not set CONFIG_NVRAM=y CONFIG_GEN_RTC=y # CONFIG_GEN_RTC_X is not set @@ -1232,12 +1287,9 @@ CONFIG_I2C_POWERMAC=y # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set -# CONFIG_EEPROM_AT24 is not set -# CONFIG_EEPROM_LEGACY is not set # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1259,11 +1311,11 @@ CONFIG_BATTERY_PMU=y # CONFIG_THERMAL is not set # CONFIG_THERMAL_HWMON is not set # CONFIG_WATCHDOG is not set +CONFIG_SSB_POSSIBLE=y # # Sonics Silicon Backplane # -CONFIG_SSB_POSSIBLE=y CONFIG_SSB=m CONFIG_SSB_SPROM=y CONFIG_SSB_PCIHOST_POSSIBLE=y @@ -1281,18 +1333,13 @@ CONFIG_SSB_DRIVER_PCICORE=y # CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_TWL4030_CORE is not set # CONFIG_MFD_TMIO is not set # CONFIG_PMIC_DA903X is not set # CONFIG_MFD_WM8400 is not set # CONFIG_MFD_WM8350_I2C is not set - -# -# Voltage and Current regulators -# +# CONFIG_MFD_PCF50633 is not set # CONFIG_REGULATOR is not set -# CONFIG_REGULATOR_FIXED_VOLTAGE is not set -# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set -# CONFIG_REGULATOR_BQ24022 is not set # # Multimedia devices @@ -1390,6 +1437,7 @@ CONFIG_FB_ATY_BACKLIGHT=y # CONFIG_FB_KYRO is not set CONFIG_FB_3DFX=y # CONFIG_FB_3DFX_ACCEL is not set +CONFIG_FB_3DFX_I2C=y # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_VT8623 is not set # CONFIG_FB_TRIDENT is not set @@ -1399,12 +1447,14 @@ CONFIG_FB_3DFX=y # CONFIG_FB_IBM_GXT4500 is not set # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_LCD_CLASS_DEVICE=m # CONFIG_LCD_ILI9320 is not set # CONFIG_LCD_PLATFORM is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_CORGI is not set 
+CONFIG_BACKLIGHT_GENERIC=y # # Display device support @@ -1444,11 +1494,13 @@ CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m CONFIG_SND_PCM_OSS_PLUGINS=y CONFIG_SND_SEQUENCER_OSS=y +# CONFIG_SND_HRTIMER is not set # CONFIG_SND_DYNAMIC_MINORS is not set CONFIG_SND_SUPPORT_OLD_API=y CONFIG_SND_VERBOSE_PROCFS=y # CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y CONFIG_SND_DRIVERS=y CONFIG_SND_DUMMY=m # CONFIG_SND_VIRMIDI is not set @@ -1486,6 +1538,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1551,28 +1605,31 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y -CONFIG_HID_BRIGHT=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y -CONFIG_HID_DELL=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set CONFIG_HID_MICROSOFT=y CONFIG_HID_MONTEREY=y +CONFIG_HID_NTRIG=y CONFIG_HID_PANTHERLORD=y # CONFIG_PANTHERLORD_FF is not set CONFIG_HID_PETALYNX=y CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y +# CONFIG_GREENASIA_FF is not set +CONFIG_HID_TOPSEED=y # CONFIG_THRUSTMASTER_FF is not set # CONFIG_ZEROPLUS_FF is not set CONFIG_USB_SUPPORT=y @@ -1603,6 +1660,7 @@ CONFIG_USB_EHCI_HCD=m CONFIG_USB_EHCI_ROOT_HUB_TT=y # CONFIG_USB_EHCI_TT_NEWSCHED is not set # CONFIG_USB_EHCI_HCD_PPC_OF is not set +# CONFIG_USB_OXU210HP_HCD is not set # CONFIG_USB_ISP116X_HCD is not set # CONFIG_USB_ISP1760_HCD is not set CONFIG_USB_OHCI_HCD=y @@ -1625,24 +1683,23 @@ CONFIG_USB_PRINTER=m # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk 
support' +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# may also be needed; see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=m # CONFIG_USB_STORAGE_DEBUG is not set # CONFIG_USB_STORAGE_DATAFAB is not set # CONFIG_USB_STORAGE_FREECOM is not set # CONFIG_USB_STORAGE_ISD200 is not set -# CONFIG_USB_STORAGE_DPCM is not set # CONFIG_USB_STORAGE_USBAT is not set # CONFIG_USB_STORAGE_SDDR09 is not set # CONFIG_USB_STORAGE_SDDR55 is not set # CONFIG_USB_STORAGE_JUMPSHOT is not set # CONFIG_USB_STORAGE_ALAUDA is not set -CONFIG_USB_STORAGE_ONETOUCH=y +CONFIG_USB_STORAGE_ONETOUCH=m # CONFIG_USB_STORAGE_KARMA is not set # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set # CONFIG_USB_LIBUSUAL is not set @@ -1665,7 +1722,7 @@ CONFIG_USB_EZUSB=y # CONFIG_USB_SERIAL_CH341 is not set # CONFIG_USB_SERIAL_WHITEHEAT is not set # CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set -# CONFIG_USB_SERIAL_CP2101 is not set +# CONFIG_USB_SERIAL_CP210X is not set # CONFIG_USB_SERIAL_CYPRESS_M8 is not set # CONFIG_USB_SERIAL_EMPEG is not set # CONFIG_USB_SERIAL_FTDI_SIO is not set @@ -1701,15 +1758,19 @@ CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y # CONFIG_USB_SERIAL_NAVMAN is not set # CONFIG_USB_SERIAL_PL2303 is not set # CONFIG_USB_SERIAL_OTI6858 is not set +# CONFIG_USB_SERIAL_QUALCOMM is not set # CONFIG_USB_SERIAL_SPCP8X5 is not set # CONFIG_USB_SERIAL_HP4X is not set # CONFIG_USB_SERIAL_SAFE is not set +# CONFIG_USB_SERIAL_SIEMENS_MPI is not set # CONFIG_USB_SERIAL_SIERRAWIRELESS is not set +# CONFIG_USB_SERIAL_SYMBOL is not set # CONFIG_USB_SERIAL_TI is not set # CONFIG_USB_SERIAL_CYBERJACK is not set # CONFIG_USB_SERIAL_XIRCOM is not set # CONFIG_USB_SERIAL_OPTION is not set # CONFIG_USB_SERIAL_OMNINET is not set +# CONFIG_USB_SERIAL_OPTICON is not set # CONFIG_USB_SERIAL_DEBUG is not set # @@ -1726,7 +1787,6 @@ CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # 
CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set CONFIG_USB_APPLEDISPLAY=m @@ -1738,6 +1798,11 @@ CONFIG_USB_APPLEDISPLAY=m # CONFIG_USB_ISIGHTFW is not set # CONFIG_USB_VST is not set # CONFIG_USB_GADGET is not set + +# +# OTG and related infrastructure +# +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1748,7 +1813,9 @@ CONFIG_LEDS_CLASS=y # LED drivers # # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1759,11 +1826,16 @@ CONFIG_LEDS_TRIGGER_IDE_DISK=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set CONFIG_LEDS_TRIGGER_DEFAULT_ON=y + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set # CONFIG_EDAC is not set # CONFIG_RTC_CLASS is not set # CONFIG_DMADEVICES is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set @@ -1774,6 +1846,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y # CONFIG_EXT3_FS_SECURITY is not set @@ -1783,7 +1856,9 @@ CONFIG_EXT4_FS_XATTR=y # CONFIG_EXT4_FS_POSIX_ACL is not set # CONFIG_EXT4_FS_SECURITY is not set CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set CONFIG_JBD2=y +# CONFIG_JBD2_DEBUG is not set CONFIG_FS_MBCACHE=y # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set @@ -1792,6 +1867,7 @@ CONFIG_FILE_LOCKING=y # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set +# CONFIG_BTRFS_FS is not set CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y @@ -1800,6 +1876,11 @@ CONFIG_INOTIFY_USER=y CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m +# +# Caches +# +# 
CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1831,10 +1912,7 @@ CONFIG_TMPFS=y # CONFIG_TMPFS_POSIX_ACL is not set # CONFIG_HUGETLB_PAGE is not set # CONFIG_CONFIGFS_FS is not set - -# -# Miscellaneous filesystems -# +CONFIG_MISC_FILESYSTEMS=y # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set CONFIG_HFS_FS=m @@ -1843,6 +1921,7 @@ CONFIG_HFSPLUS_FS=m # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set # CONFIG_CRAMFS is not set +# CONFIG_SQUASHFS is not set # CONFIG_VXFS_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_OMFS_FS is not set @@ -1851,6 +1930,7 @@ CONFIG_HFSPLUS_FS=m # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1868,7 +1948,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set CONFIG_SMB_FS=m @@ -1940,11 +2019,13 @@ CONFIG_NLS_ISO8859_1=m # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=m # CONFIG_DLM is not set +CONFIG_BINARY_PRINTF=y # # Library routines # CONFIG_BITREVERSE=y +CONFIG_GENERIC_FIND_LAST_BIT=y CONFIG_CRC_CCITT=y CONFIG_CRC16=y CONFIG_CRC_T10DIF=y @@ -1954,15 +2035,18 @@ CONFIG_CRC32=y CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_TEXTSEARCH=y CONFIG_TEXTSEARCH_KMP=m CONFIG_TEXTSEARCH_BM=m CONFIG_TEXTSEARCH_FSM=m -CONFIG_PLIST=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y CONFIG_HAVE_LMB=y +CONFIG_NLATTR=y # # Kernel hacking @@ -1973,13 +2057,16 @@ CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set -# CONFIG_DEBUG_FS is not set +CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y # 
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y CONFIG_SCHEDSTATS=y # CONFIG_TIMER_STATS is not set @@ -1994,6 +2081,7 @@ CONFIG_SCHEDSTATS=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set @@ -2001,6 +2089,7 @@ CONFIG_DEBUG_BUGVERBOSE=y CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set # CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_RCU_CPU_STALL_DETECTOR is not set @@ -2009,7 +2098,14 @@ CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_FAULT_INJECTION is not set CONFIG_LATENCYTOP=y CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2017,12 +2113,19 @@ CONFIG_HAVE_FUNCTION_TRACER=y # CONFIG_FUNCTION_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_DYNAMIC_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set +CONFIG_PRINT_STACK_DEPTH=64 # CONFIG_DEBUG_STACKOVERFLOW is not set # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_CODE_PATCHING_SELFTEST is not set @@ -2033,6 +2136,7 @@ 
CONFIG_XMON_DEFAULT=y CONFIG_XMON_DISASSEMBLY=y CONFIG_DEBUGGER=y CONFIG_IRQSTACKS=y +# CONFIG_VIRQ_DEBUG is not set # CONFIG_BDI_SWITCH is not set CONFIG_BOOTX_TEXT=y # CONFIG_PPC_EARLY_DEBUG is not set @@ -2051,13 +2155,20 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_FIPS is not set CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2127,6 +2238,7 @@ CONFIG_CRYPTO_TWOFISH_COMMON=m # Compression # CONFIG_CRYPTO_DEFLATE=m +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # From 2af15d6a44b871ad4c2a651302374cde8f335480 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 13:37:24 -0400 Subject: [PATCH 756/900] ftrace: add kernel command line function filtering When using ftrace=function on the command line to trace functions on boot up, one can not filter out functions that are commonly called. This patch adds two new ftrace command line commands. ftrace_notrace=function-list ftrace_filter=function-list Where function-list is a comma separated list of functions to filter. The ftrace_notrace will make the functions listed not be included in the function tracing, and ftrace_filter will only trace the functions listed. These two act the same as the debugfs/tracing/set_ftrace_notrace and debugfs/tracing/set_ftrace_filter respectively. The simple glob expressions that are allowed by the filter files can also be used by the command line interface. ftrace_notrace=rcu*,*lock,*spin* Will not trace any function that starts with rcu, ends with lock, or has the word spin in it. 
Note, if the self tests are enabled, they may interfere with the filtering set by the command lines. Signed-off-by: Steven Rostedt --- Documentation/kernel-parameters.txt | 17 ++++++++++-- kernel/trace/ftrace.c | 42 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9243dd84f4d..fcd3bfbe74e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -751,12 +751,25 @@ and is between 256 and 4096 characters. It is defined in the file ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. ftrace=[tracer] - [ftrace] will set and start the specified tracer + [FTRACE] will set and start the specified tracer as early as possible in order to facilitate early boot debugging. ftrace_dump_on_oops - [ftrace] will dump the trace buffers on oops. + [FTRACE] will dump the trace buffers on oops. + + ftrace_filter=[function-list] + [FTRACE] Limit the functions traced by the function + tracer at boot up. function-list is a comma separated + list of functions. This list can be changed at run + time by the set_ftrace_filter file in the debugfs + tracing directory. + + ftrace_notrace=[function-list] + [FTRACE] Do not trace the functions specified in + function-list. This list can be changed at run time + by the set_ftrace_notrace file in the debugfs + tracing directory. gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 140699a9a8a..2074e5b7766 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,7 @@ #include #include +#include #include "trace_output.h" #include "trace_stat.h" @@ -2369,6 +2370,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) ftrace_set_regex(buf, len, reset, 0); } +/* + * command line interface to allow users to set filters on boot up. 
+ */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +static int __init set_ftrace_notrace(char *str) +{ + strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ + strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +static void __init set_ftrace_early_filter(char *buf, int enable) +{ + char *func; + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + set_ftrace_early_filter(ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + set_ftrace_early_filter(ftrace_notrace_buf, 0); +} + static int ftrace_regex_release(struct inode *inode, struct file *file, int enable) { @@ -2829,6 +2869,8 @@ void __init ftrace_init(void) if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); + set_ftrace_early_filters(); + return; failed: ftrace_disabled = 1; From 5e0a093910876882f91f1d4b8a1635a099e6c7ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 15:50:13 -0400 Subject: [PATCH 757/900] tracing: fix config options to not show when automatically selected There are two options that are selected by all tracers, but we want to have those options available when no tracer is selected. These are The event tracer and sched switch tracer. The are enabled by all tracers, but if a tracer is not selected we want the options to appear. All tracers including them select TRACING. Thus what we would like to do is: config EVENT_TRACER bool "prompt" depends on TRACING select TRACING But that gives us a bug in the kbuild system since we just created a circular dependency. We only want the prompt to show when TRACING is off. 
This patch adds GENERIC_TRACER that all tracers will select instead of TRACING. The two options (sched switch and event tracer) will select TRACING directly and depend on !GENERIC_TRACER. This solves the circular dependency. [ Impact: hide options that are selected by default ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a508b9d2adb..6e55cc3ac49 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER select MARKERS bool +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hidding of the automatic options options. + config TRACING bool select DEBUG_FS @@ -66,6 +73,10 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING +config GENERIC_TRACER + bool + select TRACING + # # Minimum requirements an architecture has to meet for us to # be able to offer generic tracing facilities: @@ -95,7 +106,7 @@ config FUNCTION_TRACER depends on HAVE_FUNCTION_TRACER select FRAME_POINTER select KALLSYMS - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. 
This is done @@ -126,7 +137,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME select TRACE_IRQFLAGS - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in irqs-off critical @@ -147,7 +158,7 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in preemption off critical @@ -166,7 +177,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace @@ -174,7 +185,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE help @@ -183,6 +194,7 @@ config SCHED_TRACER config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" + depends on !GENERIC_TRACER select TRACING select CONTEXT_SWITCH_TRACER help @@ -191,6 +203,7 @@ config ENABLE_CONTEXT_SWITCH_TRACER config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" + depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel @@ -204,14 +217,14 @@ config ENABLE_EVENT_TRACING config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS - select TRACING + select GENERIC_TRACER select KALLSYMS help Basic tracer to catch the syscall entry and exit events. 
config BOOT_TRACER bool "Trace boot initcalls" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer helps developers to optimize boot times: it records @@ -228,7 +241,7 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool - select TRACING + select GENERIC_TRACER choice prompt "Branch Profiling" @@ -308,7 +321,7 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" depends on X86 - select TRACING + select GENERIC_TRACER help This tracer helps developers to analyze and optimize the kernels power management decisions, specifically the C-state and P-state @@ -342,14 +355,14 @@ config STACK_TRACER config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER bool "Trace hw branches" - select TRACING + select GENERIC_TRACER help This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" - select TRACING + select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected @@ -369,7 +382,7 @@ config KMEMTRACE config WORKQUEUE_TRACER bool "Trace workqueues" - select TRACING + select GENERIC_TRACER help The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the @@ -385,7 +398,7 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS select TRACEPOINTS - select TRACING + select GENERIC_TRACER select STACKTRACE help Say Y here if you want to be able to trace the block layer actions @@ -446,7 +459,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on GENERIC_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. 
On bootup @@ -457,7 +470,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI - select TRACING + select GENERIC_TRACER help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap From 897f17a65389a26509bd0c79a9812d1c9ea8ea6f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 16:31:21 -0400 Subject: [PATCH 758/900] tracing: combine the default tracers into one config Both event tracer and sched switch plugin are selected by default by all generic tracers. But if no generic tracer is enabled, their options appear. But either one of them will select the other, thus it only makes sense to have the default tracers be selected by one option. [ Impact: clean up kconfig menu ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6e55cc3ac49..4a13e5a01ce 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -192,27 +192,14 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config ENABLE_CONTEXT_SWITCH_TRACER - bool "Trace process context switches" - depends on !GENERIC_TRACER - select TRACING - select CONTEXT_SWITCH_TRACER - help - This tracer gets called from the context switch and records - all switching of tasks. - -config ENABLE_EVENT_TRACING - bool "Trace various events in the kernel" +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel allowing the user to pick and choose which trace point they - want to trace. - - Note, all tracers enable event tracing. 
This option is - only a convenience to enable event tracing when no other - tracers are selected. + want to trace. It also includes the sched_switch tracer plugin. config FTRACE_SYSCALLS bool "Trace syscalls" From 6e25db44a7ad7eb380f4ec774ec00a8fcddea112 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 May 2009 11:24:59 +0800 Subject: [PATCH 759/900] tracing/events: fix a typo in __string() format output "tsize" should be "\tsize". Also remove the space before "__str_loc". Before: # cat tracing/events/irq/irq_handler_entry/format ... field:int irq; offset:12; size:4; field: __str_loc name; offset:16;tsize:2; ... After: # cat tracing/events/irq/irq_handler_entry/format ... field:int irq; offset:12; size:4; field:__str_loc name; offset:16; size:2; ... [ Impact: standardize __string field description in events format file ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b4ec83ae711..9276ec4f34d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -209,8 +209,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef __string #define __string(item, src) \ - ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t" \ - "offset:%u;tsize:%u;\n", \ + ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), \ __str_loc_##item), \ (unsigned int)sizeof(field.__str_loc_##item)); \ From a9c1c3abe1160a5632e48c929b02b740556bf423 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 1 Jun 2009 15:35:13 +0800 Subject: [PATCH 760/900] tracing/events: put TP_fast_assign into braces Currently TP_fast_assign has a limitation that we can't define local variables in it. 
Here's one use case when we introduce __dynamic_array(): TP_fast_assign( type *p = __get_dynamic_array(item); foo(p); bar(p); ), [ Impact: allow defining local variables in TP_fast_assign ] Signed-off-by: Li Zefan LKML-Reference: <4A2384B1.90100@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 9276ec4f34d..ee926822244 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -471,7 +471,7 @@ static void ftrace_raw_event_##call(proto) \ return; \ entry = ring_buffer_event_data(event); \ \ - assign; \ + { assign; } \ \ if (!filter_current_check_discard(event_call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ From 7fcb7c472f455d1711eb5a7633204dba8800a6d6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 1 Jun 2009 15:35:46 +0800 Subject: [PATCH 761/900] tracing/events: introduce __dynamic_array() __string() is limited: - it's a char array, but we may want to define array with other types - a source string should be available, but we may just know the string size We introduce __dynamic_array() to break those limitations, and __string() becomes a wrapper of it. As a side effect, now __get_str() can be used in TP_fast_assign but not only TP_print. Take XFS for example, we have the string length in the dirent, but the string itself is not NULL-terminated, so __dynamic_array() can be used: TRACE_EVENT(xfs_dir2, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( __field(int, namelen) __dynamic_array(char, name, args->namelen + 1) ... ), TP_fast_assign( char *name = __get_str(name); if (args->namelen) memcpy(name, args->name, args->namelen); name[args->namelen] = '\0'; __entry->namelen = args->namelen; ), TP_printk("name %.*s namelen %d", __entry->namelen ? 
__get_str(name) : NULL __entry->namelen) ); [ Impact: allow defining dynamic size arrays ] Signed-off-by: Li Zefan LKML-Reference: <4A2384D2.3080403@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 126 ++++++++++++++++++++--------- kernel/trace/trace_events_filter.c | 6 +- 2 files changed, 93 insertions(+), 39 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index ee926822244..b5478dab579 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -18,14 +18,17 @@ #include -#undef __array -#define __array(type, item, len) type item[len]; - #undef __field #define __field(type, item) type item; +#undef __array +#define __array(type, item, len) type item[len]; + +#undef __dynamic_array +#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; + #undef __string -#define __string(item, src) unsigned short __str_loc_##item; +#define __string(item, src) __dynamic_array(char, item, -1) #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -35,7 +38,7 @@ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ - char __str_data[0]; \ + char __data[0]; \ }; \ static struct ftrace_event_call event_##name @@ -47,30 +50,31 @@ * * Include the following: * - * struct ftrace_str_offsets_ { - * int ; - * int ; + * struct ftrace_data_offsets_ { + * int ; + * int ; * [...] * }; * - * The __string() macro will create each int , this is to - * keep the offset of each string from the beggining of the event - * once we perform the strlen() of the src strings. - * + * The __dynamic_array() macro will create each int , this is + * to keep the offset of each array from the beginning of the event. 
*/ -#undef __array -#define __array(type, item, len) - #undef __field #define __field(type, item); +#undef __array +#define __array(type, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item, len) int item; + #undef __string -#define __string(item, src) int item; +#define __string(item, src) __dynamic_array(char, item, -1) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ - struct ftrace_str_offsets_##call { \ + struct ftrace_data_offsets_##call { \ tstruct; \ }; @@ -119,8 +123,12 @@ #undef TP_printk #define TP_printk(fmt, args...) fmt "\n", args +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + __entry->__data_loc_##field) + #undef __get_str -#define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) +#define __get_str(field) (char *)__get_dynamic_array(field) #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ @@ -207,16 +215,19 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ if (!ret) \ return 0; -#undef __string -#define __string(item, src) \ - ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t" \ +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), \ - __str_loc_##item), \ - (unsigned int)sizeof(field.__str_loc_##item)); \ + __data_loc_##item), \ + (unsigned int)sizeof(field.__data_loc_##item)); \ if (!ret) \ return 0; +#undef __string +#define __string(item, src) __dynamic_array(char, item, -1) + #undef __entry #define __entry REC @@ -260,11 +271,14 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\ + offsetof(typeof(field), __data_loc_##item), \ + sizeof(field.__data_loc_##item), 0); + #undef __string 
-#define __string(item, src) \ - ret = trace_define_field(event_call, "__str_loc", #item, \ - offsetof(typeof(field), __str_loc_##item), \ - sizeof(field.__str_loc_##item), 0); +#define __string(item, src) __dynamic_array(char, item, -1) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ @@ -288,6 +302,43 @@ ftrace_define_fields_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * remember the offset of each array from the beginning of the event. + */ + +#undef __entry +#define __entry entry + +#undef __field +#define __field(type, item) + +#undef __array +#define __array(type, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + __data_offsets->item = __data_size + \ + offsetof(typeof(*entry), __data); \ + __data_size += (len) * sizeof(type); + +#undef __string +#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1) \ + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static inline int ftrace_get_offsets_##call( \ + struct ftrace_data_offsets_##call *__data_offsets, proto) \ +{ \ + int __data_size = 0; \ + struct ftrace_raw_##call __maybe_unused *entry; \ + \ + tstruct; \ + \ + return __data_size; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 4 of the trace events. 
* @@ -432,15 +483,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef __array #define __array(type, item, len) +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + __entry->__data_loc_##item = __data_offsets.item; + #undef __string -#define __string(item, src) \ - __str_offsets.item = __str_size + \ - offsetof(typeof(*entry), __str_data); \ - __str_size += strlen(src) + 1; +#define __string(item, src) __dynamic_array(char, item, -1) \ #undef __assign_str #define __assign_str(dst, src) \ - __entry->__str_loc_##dst = __str_offsets.dst; \ strcpy(__get_str(dst), src); #undef TRACE_EVENT @@ -451,26 +502,29 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ - struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_event_call *event_call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ - int __str_size = 0; \ + int __data_size; \ int pc; \ \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ - tstruct; \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call) + __str_size,\ + sizeof(*entry) + __data_size, \ irq_flags, pc); \ if (!event) \ return; \ entry = ring_buffer_event_data(event); \ \ + \ + tstruct \ + \ { assign; } \ \ if (!filter_current_check_discard(event_call, entry, event)) \ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a7430b16d24..db6e54bdb59 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -478,12 +478,12 @@ enum { static int is_string_field(const char *type) { + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) 
return FILTER_STATIC_STRING; - if (!strcmp(type, "__str_loc")) - return FILTER_DYN_STRING; - return 0; } From ec081ddc3d90aab35bc0de19a358b964978837cf Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 1 Jun 2009 15:53:35 +0100 Subject: [PATCH 762/900] tracing: add exports to use __print_symbolic and __print_flags from a module A patch to allow the use of __print_symbolic and __print_flags from a module. This allows the current GFS2 tracing patch to build. Signed-off-by: Steven Whitehouse LKML-Reference: <1243868015.29604.542.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c12d95db2f5..0fe3b223f7e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -17,6 +17,7 @@ static DECLARE_RWSEM(trace_event_mutex); DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); +EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -250,6 +251,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +EXPORT_SYMBOL(ftrace_print_flags_seq); const char * ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, @@ -275,6 +277,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, return p->buffer; } +EXPORT_SYMBOL(ftrace_print_symbols_seq); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) From 1d080d6c3141623c92caaebe20e847cb99ccbb60 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 12:20:40 -0400 Subject: [PATCH 763/900] tracing: remove redundant SOFTIRQ from softirq event traces After converting the softirq tracer to use the flags options, this caused a regression with the name. Since the flag was used directly it was printed out (i.e. HRTIMER_SOFTIRQ). This patch only shows the softirq name without the SOFTIRQ part. 
[ Impact: fix regression of output from softirq events ] Signed-off-by: Steven Rostedt --- include/trace/events/irq.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 683fb36a994..b0c7ede55eb 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,18 +7,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -#define softirq_name(sirq) { sirq, #sirq } -#define show_softirq_name(val) \ - __print_symbolic(val, \ - softirq_name(HI_SOFTIRQ), \ - softirq_name(TIMER_SOFTIRQ), \ - softirq_name(NET_TX_SOFTIRQ), \ - softirq_name(NET_RX_SOFTIRQ), \ - softirq_name(BLOCK_SOFTIRQ), \ - softirq_name(TASKLET_SOFTIRQ), \ - softirq_name(SCHED_SOFTIRQ), \ - softirq_name(HRTIMER_SOFTIRQ), \ - softirq_name(RCU_SOFTIRQ)) +#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } +#define show_softirq_name(val) \ + __print_symbolic(val, \ + softirq_name(HI), \ + softirq_name(TIMER), \ + softirq_name(NET_TX), \ + softirq_name(NET_RX), \ + softirq_name(BLOCK), \ + softirq_name(TASKLET), \ + softirq_name(SCHED), \ + softirq_name(HRTIMER), \ + softirq_name(RCU)) /** * irq_handler_entry - called immediately before the irq action handler From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 15:16:05 -0400 Subject: [PATCH 764/900] tracing: make trace pipe recognize latency format flag The trace_pipe did not recognize the latency format flag and would produce different output than the trace file. The problem was partly due that the trace flags in the iterator was not set as well as the trace_pipe zeros out part of the iterator (including the flags) to be able to use the same routines as the trace file. trace_flags of the iterator should not cause any problems when not zeroed out by for trace_pipe. 
Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bbf40f624fc..5c093ffc655 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -51,6 +51,7 @@ struct trace_iterator { int cpu_file; struct mutex mutex; struct ring_buffer_iter *buffer_iter[NR_CPUS]; + unsigned long iter_flags; /* The below is zeroed out in pipe_read */ struct trace_seq seq; @@ -58,7 +59,6 @@ struct trace_iterator { int cpu; u64 ts; - unsigned long iter_flags; loff_t pos; long idx; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3a8a87d7e9..cae34c69752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); From 0f6ce3de4ef6ff940308087c49760d068851c1a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 21:51:28 -0400 Subject: [PATCH 765/900] ftrace: do not profile functions when disabled A race was found that if one were to enable and disable the function profiler repeatedly, then the system can panic. This was because a profiled function may be preempted just before disabling interrupts. While the profiler is disabled and then reenabled, the preempted function could start again, and access the hash as it is being initialized. This just adds a check in the irq disabled part to check if the profiler is enabled, and if it is not then it will just exit. When the system is disabled, the profile_enabled variable is cleared before calling the unregistering of the function profiler. 
This unregistering calls stop machine which also acts as a synchronize schedule. [ Impact: fix panic in enabling/disabling function profiler ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2074e5b7766..d6973dfadb3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -599,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; rec = ftrace_find_profiled_func(stat, ip); @@ -630,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; calltime = trace->rettime - trace->calltime; @@ -724,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ unregister_ftrace_profiler(); } } From 1f23920dbf1377fa9e4aef4f3d20c34a06a71a35 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Thu, 7 May 2009 19:49:45 -0500 Subject: [PATCH 766/900] xfs: fix double unlock in xfs_swap_extents() Regression from commit ef8f7fc, which rearranged the code in xfs_swap_extents() leading to double unlock of xfs inode ilock. That resulted in xfs_fsr deadlocking itself on platforms, which don't handle double unlock of rw_semaphore nicely. It caused the count to go negative, which represents the write holder, without really having one. ia64 is one of the platforms where deadlock was easily reproduced and the fix was tested. 
Signed-off-by: Eric Sandeen Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e6d839bddbf..7465f9ee125 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -347,13 +347,15 @@ xfs_swap_extents( error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); out: kmem_free(tempifp); return error; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + out_trans_cancel: xfs_trans_cancel(tp, 0); goto out_unlock; From e6da7c9fed111ba1243297ee6eda8e24ae11c384 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 23 May 2009 14:30:12 -0500 Subject: [PATCH 767/900] xfs: fix overflow in xfs_growfs_data_private In the case where growing a filesystem would leave the last AG too small, the fixup code has an overflow in the calculation of the new size with one fewer ag, because "nagcount" is a 32 bit number. 
If the new filesystem has > 2^32 blocks in it this causes a problem resulting in an EINVAL return from growfs: # xfs_io -f -c "truncate 19998630180864" fsfile # mkfs.xfs -f -bsize=4096 -dagsize=76288719b,size=3905982455b fsfile # mount -o loop fsfile /mnt # xfs_growfs /mnt meta-data=/dev/loop0 isize=256 agcount=52, agsize=76288719 blks = sectsz=512 attr=2 data = bsize=4096 blocks=3905982455, imaxpct=5 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 log =internal bsize=4096 blocks=32768, version=2 = sectsz=512 sunit=0 blks, lazy-count=0 realtime =none extsz=4096 blocks=0, rtextents=0 xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Invalid argument Reported-by: richard.ems@cape-horn-eng.com Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8379e3bca26..cbd451bb484 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -160,7 +160,7 @@ xfs_growfs_data_private( nagcount = new + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; - nb = nagcount * mp->m_sb.sb_agblocks; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return XFS_ERROR(EINVAL); } From 1b17d766463d51904cb242f194a780737e5f73ef Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Mon, 1 Jun 2009 13:13:24 -0500 Subject: [PATCH 768/900] xfs: prevent deadlock in xfs_qm_shake() It's possible to recurse into filesystem from the memory allocation, which deadlocks in xfs_qm_shake(). Add check for __GFP_FS, and bail out if it is not set. 
Signed-off-by: Felix Blyakher Signed-off-by: Hedi Berriche Reviewed-by: Christoph Hellwig Reviewed-by: Andi Kleen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/kmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index af6843c7ee4..179cbd630f6 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); static inline int kmem_shake_allow(gfp_t gfp_mask) { - return (gfp_mask & __GFP_WAIT) != 0; + return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); } #endif /* __XFS_SUPPORT_KMEM_H__ */ From 5a9a8e32ebe269c71d8d3e78f9435fe7729f38e9 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Tue, 2 Jun 2009 00:19:52 -0700 Subject: [PATCH 769/900] forcedeth: add phy_power_down parameter, leave phy powered up by default (v2) Add a phy_power_down parameter to forcedeth: set to 1 to power down the phy and disable the link when an interface goes down; set to 0 to always leave the phy powered up. The phy power state persists across reboots; Windows, some BIOSes, and older versions of Linux don't bother to power up the phy again, forcing users to remove all power to get the interface working (see http://bugzilla.kernel.org/show_bug.cgi?id=13072). Leaving the phy powered on is the safest default behavior. Users accustomed to seeing the link state reflect the interface state and/or wanting to minimize power consumption can set phy_power_down=1 if compatibility with other OSes is not an issue. Signed-off-by: Ed Swierk Signed-off-by: David S. 
Miller --- drivers/net/forcedeth.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index f9a846b1b92..9f6a68fb7b4 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -897,6 +897,12 @@ enum { }; static int phy_cross = NV_CROSSOVER_DETECTION_DISABLED; +/* + * Power down phy when interface is down (persists through reboot; + * older Linux and other OSes may not power it up again) + */ +static int phy_power_down = 0; + static inline struct fe_priv *get_nvpriv(struct net_device *dev) { return netdev_priv(dev); @@ -1485,7 +1491,10 @@ static int phy_init(struct net_device *dev) /* restart auto negotiation, power down phy */ mii_control = mii_rw(dev, np->phyaddr, MII_BMCR, MII_READ); - mii_control |= (BMCR_ANRESTART | BMCR_ANENABLE | BMCR_PDOWN); + mii_control |= (BMCR_ANRESTART | BMCR_ANENABLE); + if (phy_power_down) { + mii_control |= BMCR_PDOWN; + } if (mii_rw(dev, np->phyaddr, MII_BMCR, mii_control)) { return PHY_ERROR; } @@ -5513,7 +5522,7 @@ static int nv_close(struct net_device *dev) nv_drain_rxtx(dev); - if (np->wolenabled) { + if (np->wolenabled || !phy_power_down) { writel(NVREG_PFF_ALWAYS|NVREG_PFF_MYADDR, base + NvRegPacketFilterFlags); nv_start_rx(dev); } else { @@ -6367,6 +6376,8 @@ module_param(dma_64bit, int, 0); MODULE_PARM_DESC(dma_64bit, "High DMA is enabled by setting to 1 and disabled by setting to 0."); module_param(phy_cross, int, 0); MODULE_PARM_DESC(phy_cross, "Phy crossover detection for Realtek 8201 phy is enabled by setting to 1 and disabled by setting to 0."); +module_param(phy_power_down, int, 0); +MODULE_PARM_DESC(phy_power_down, "Power down phy and disable link when interface is down (1), or leave phy powered up (0)."); MODULE_AUTHOR("Manfred Spraul "); MODULE_DESCRIPTION("Reverse Engineered nForce ethernet driver"); From ea30e11970a96cfe5e32c03a29332554573b4a10 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 2 Jun 2009 01:29:58 -0700 
Subject: [PATCH 770/900] e1000: add missing length check to e1000 receive routine Patch to fix bad length checking in e1000. E1000 by default does two things: 1) Spans rx descriptors for packets that don't fit into 1 skb on receive 2) Strips the crc from a frame by subtracting 4 bytes from the length prior to doing an skb_put Since the e1000 driver isn't written to support receiving packets that span multiple rx buffers, it checks the End of Packet bit of every frame, and discards it if it's not set. This places us in a situation where, if we have a spanning packet, the first part is discarded, but the second part is not (since it is the end of packet, and it passes the EOP bit test). If the second part of the frame is small (4 bytes or less), we subtract 4 from it to remove its crc, underflow the length, and wind up in skb_over_panic, when we try to skb_put a huge number of bytes into the skb. This amounts to a remote DOS attack through careful selection of frame size in relation to interface MTU. The fix for this is already in the e1000e driver, as well as the e1000 sourceforge driver, but no one ever pushed it to e1000. This is lifted straight from e1000e, and prevents small frames from causing the underflow described above. Signed-off-by: Neil Horman Tested-by: Andy Gospodarek Signed-off-by: David S.
Miller --- drivers/net/e1000/e1000_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index b1419e21b46..fffb006b7d9 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -4027,8 +4027,9 @@ static bool e1000_clean_rx_irq(struct e1000_adapter *adapter, PCI_DMA_FROMDEVICE); length = le16_to_cpu(rx_desc->length); - - if (unlikely(!(status & E1000_RXD_STAT_EOP))) { + /* !EOP means multiple descriptors were used to store a single + * packet, also make sure the frame isn't just CRC only */ + if (unlikely(!(status & E1000_RXD_STAT_EOP) || (length <= 4))) { /* All receives must fit into a single buffer */ E1000_DBG("%s: Receive packet consumed multiple" " buffers\n", netdev->name); From 12186be7d2e1106cede1cc728526e3d7998cbe94 Mon Sep 17 00:00:00 2001 From: Minoru Usui Date: Tue, 2 Jun 2009 02:17:34 -0700 Subject: [PATCH 771/900] net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup This patch fixes a bug which unconfigured struct tcf_proto keeps chaining in tc_ctl_tfilter(), and avoids kernel panic in cls_cgroup_classify() when we use cls_cgroup. When we execute 'tc filter add', tcf_proto is allocated, initialized by classifier's init(), and chained. After it's chained, tc_ctl_tfilter() calls classifier's change(). When classifier's change() fails, tc_ctl_tfilter() does not free and keeps tcf_proto. In addition, cls_cgroup is initialized in change() not in init(). It accesses unconfigured struct tcf_proto which is chained before change(), then hits Oops. Signed-off-by: Minoru Usui Signed-off-by: Jarek Poplawski Signed-off-by: Jamal Hadi Salim Tested-by: Minoru Usui Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0759f32e9dc..09cdcdfe7e9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -135,6 +135,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) unsigned long cl; unsigned long fh; int err; + int tp_created = 0; if (net != &init_net) return -EINVAL; @@ -266,10 +267,7 @@ replay: goto errout; } - spin_lock_bh(root_lock); - tp->next = *back; - *back = tp; - spin_unlock_bh(root_lock); + tp_created = 1; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) goto errout; @@ -296,8 +294,11 @@ replay: switch (n->nlmsg_type) { case RTM_NEWTFILTER: err = -EEXIST; - if (n->nlmsg_flags & NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) { + if (tp_created) + tcf_destroy(tp); goto errout; + } break; case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); @@ -314,8 +315,18 @@ replay: } err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); - if (err == 0) + if (err == 0) { + if (tp_created) { + spin_lock_bh(root_lock); + tp->next = *back; + *back = tp; + spin_unlock_bh(root_lock); + } tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + } else { + if (tp_created) + tcf_destroy(tp); + } errout: if (cl) From 2e507d849f1834d3fe9aae71b7aabf4c2a3827b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 18:24:20 +0200 Subject: [PATCH 772/900] dma-debug: add variables and checks for driver filter This patch adds the state variables for the driver filter and a function to check if the filter is enabled and matches to the current device. The check is built into the err_printk function. 
Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index cdd205d6bf7..c01f64780ca 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -99,6 +99,15 @@ static struct dentry *show_num_errors_dent __read_mostly; static struct dentry *num_free_entries_dent __read_mostly; static struct dentry *min_free_entries_dent __read_mostly; +/* per-driver filter related state */ + +#define NAME_MAX_LEN 64 + +static char current_driver_name[NAME_MAX_LEN] __read_mostly; +static struct device_driver *current_driver __read_mostly; + +static DEFINE_RWLOCK(driver_name_lock); + static const char *type2name[4] = { "single", "page", "scather-gather", "coherent" }; @@ -128,9 +137,47 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) #endif } +static bool driver_filter(struct device *dev) +{ + /* driver filter off */ + if (likely(!current_driver_name[0])) + return true; + + /* driver filter on and initialized */ + if (current_driver && dev->driver == current_driver) + return true; + + /* driver filter on but not yet initialized */ + if (!current_driver && current_driver_name[0]) { + struct device_driver *drv = get_driver(dev->driver); + unsigned long flags; + bool ret = false; + + if (!drv) + return false; + + /* lock to protect against change of current_driver_name */ + read_lock_irqsave(&driver_name_lock, flags); + + if (drv->name && + strncmp(current_driver_name, drv->name, 63) == 0) { + current_driver = drv; + ret = true; + } + + read_unlock_irqrestore(&driver_name_lock, flags); + put_driver(drv); + + return ret; + } + + return false; +} + #define err_printk(dev, entry, format, arg...) 
do { \ error_count += 1; \ - if (show_all_errors || show_num_errors > 0) { \ + if (driver_filter(dev) && \ + (show_all_errors || show_num_errors > 0)) { \ WARN(1, "%s %s: " format, \ dev_driver_string(dev), \ dev_name(dev) , ## arg); \ From 8a6fc708b9bb48a79a385bdc2be0959ee2ab788d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 21:23:13 +0200 Subject: [PATCH 773/900] dma-debug: add debugfs file for driver filter This patch adds the dma-api/driver_filter file to debugfs. The root user can write a driver name into this file to see only dma-api errors for that particular driver in the kernel log. Writing an empty string to that file disables the driver filter. Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index c01f64780ca..c6330b1a7be 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -23,9 +23,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -98,6 +100,7 @@ static struct dentry *show_all_errors_dent __read_mostly; static struct dentry *show_num_errors_dent __read_mostly; static struct dentry *num_free_entries_dent __read_mostly; static struct dentry *min_free_entries_dent __read_mostly; +static struct dentry *filter_dent __read_mostly; /* per-driver filter related state */ @@ -160,7 +163,8 @@ static bool driver_filter(struct device *dev) read_lock_irqsave(&driver_name_lock, flags); if (drv->name && - strncmp(current_driver_name, drv->name, 63) == 0) { + strncmp(current_driver_name, drv->name, + NAME_MAX_LEN-1) == 0) { current_driver = drv; ret = true; } @@ -454,6 +458,97 @@ out_err: return -ENOMEM; } +static ssize_t filter_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + unsigned long flags; + char buf[NAME_MAX_LEN + 1]; + int len; + + if (!current_driver_name[0]) + return 0; + + /* + * We can't copy to 
userspace directly because current_driver_name can + * only be read under the driver_name_lock with irqs disabled. So + * create a temporary copy first. + */ + read_lock_irqsave(&driver_name_lock, flags); + len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); + read_unlock_irqrestore(&driver_name_lock, flags); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t filter_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + unsigned long flags; + char buf[NAME_MAX_LEN]; + size_t len = NAME_MAX_LEN - 1; + int i; + + /* + * We can't copy from userspace directly. Access to + * current_driver_name is protected with a write_lock with irqs + * disabled. Since copy_from_user can fault and may sleep we + * need to copy to temporary buffer first + */ + len = min(count, len); + if (copy_from_user(buf, userbuf, len)) + return -EFAULT; + + buf[len] = 0; + + write_lock_irqsave(&driver_name_lock, flags); + + /* Now handle the string we got from userspace very carefully. + * The rules are: + * - only use the first token we got + * - token delimiter is everything looking like a space + * character (' ', '\n', '\t' ...) + * + */ + if (!isalnum(buf[0])) { + /* + If the first character userspace gave us is not + * alphanumerical then assume the filter should be + * switched off. + */ + if (current_driver_name[0]) + printk(KERN_INFO "DMA-API: switching off dma-debug " + "driver filter\n"); + current_driver_name[0] = 0; + current_driver = NULL; + goto out_unlock; + } + + /* + * Now parse out the first token and use it as the name for the + * driver to filter for. 
+ */ + for (i = 0; i < NAME_MAX_LEN; ++i) { + current_driver_name[i] = buf[i]; + if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) + break; + } + current_driver_name[i] = 0; + current_driver = NULL; + + printk(KERN_INFO "DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + +out_unlock: + write_unlock_irqrestore(&driver_name_lock, flags); + + return count; +} + +const struct file_operations filter_fops = { + .read = filter_read, + .write = filter_write, +}; + static int dma_debug_fs_init(void) { dma_debug_dent = debugfs_create_dir("dma-api", NULL); @@ -497,6 +592,11 @@ static int dma_debug_fs_init(void) if (!min_free_entries_dent) goto out_err; + filter_dent = debugfs_create_file("driver_filter", 0644, + dma_debug_dent, NULL, &filter_fops); + if (!filter_dent) + goto out_err; + return 0; out_err: From 1745de5e5639457513fe43440f2800e23c3cbc7d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 21:49:51 +0200 Subject: [PATCH 774/900] dma-debug: add dma_debug_driver kernel command line This patch add the dma_debug_driver= boot parameter to enable the driver filter for early boot. Signed-off-by: Joerg Roedel --- Documentation/kernel-parameters.txt | 7 +++++++ lib/dma-debug.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..b3f1314588c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -646,6 +646,13 @@ and is between 256 and 4096 characters. It is defined in the file DMA-API debugging code disables itself because the architectural default is too low. + dma_debug_driver= + With this option the DMA-API debugging driver + filter feature can be enabled at boot time. Just + pass the driver to filter for as the parameter. + The filter can be disabled or changed to another + driver later using sysfs. 
+ dscc4.setup= [NET] dtc3181e= [HW,SCSI] diff --git a/lib/dma-debug.c b/lib/dma-debug.c index c6330b1a7be..d0618aa13b4 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1109,3 +1109,21 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL(debug_dma_sync_sg_for_device); +static int __init dma_debug_driver_setup(char *str) +{ + int i; + + for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { + current_driver_name[i] = *str; + if (*str == 0) + break; + } + + if (current_driver_name[0]) + printk(KERN_INFO "DMA-API: enable driver filter for " + "driver [%s]\n", current_driver_name); + + + return 1; +} +__setup("dma_debug_driver=", dma_debug_driver_setup); From 016ea6874a6d58df85b54f56997d26df13c307b2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 21:57:23 +0200 Subject: [PATCH 775/900] dma-debug: add documentation for the driver filter This patch adds the driver filter feature to the dma-debug documentation. Signed-off-by: Joerg Roedel --- Documentation/DMA-API.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index d9aa43d78bc..25fb8bcf32a 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -704,12 +704,24 @@ this directory the following files can currently be found: The current number of free dma_debug_entries in the allocator. + dma-api/driver_filter + You can write a name of a driver into this file + to limit the debug output to requests from that + particular driver. Write an empty string to + that file to disable the filter and see + all errors again. + If you have this code compiled into your kernel it will be enabled by default. If you want to boot without the bookkeeping anyway you can provide 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. Notice that you can not enable it again at runtime. You have to reboot to do so.
+If you want to see debug messages only for a special device driver you can +specify the dma_debug_driver= parameter. This will enable the +driver filter at boot time. The debug code will only print errors for that +driver afterwards. This filter can be disabled or changed later using debugfs. + When the code disables itself at runtime this is most likely because it ran out of dma_debug_entries. These entries are preallocated at boot. The number of preallocated entries is defined per architecture. If it is too low for you From 179c498ae2998461fe436437a74dc29036fc7dcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 12:03:19 -0400 Subject: [PATCH 776/900] function-graph: only allocate init tasks if it was not already done When the function graph tracer is enabled, it calls the initialization needed for the init tasks that would be called on all created tasks. The problem is that this is called every time the function graph tracer is enabled, and the ret_stack is allocated for the idle tasks each time. Thus, the old ret_stack is lost and a memory leak is created. This is also dangerous because if an interrupt happened on another CPU with the init task and the ret_stack is replaced, we then lose all the return pointers for the interrupt, and a crash would take place. 
[ Impact: fix memory leak and possible crash due to race ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ed080406c..ebff62ef40b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2643,8 +2643,10 @@ static int start_graph_tracing(void) return -ENOMEM; /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) - ftrace_graph_init_task(idle_task(cpu)); + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_task(idle_task(cpu)); + } do { ret = alloc_retstack_tasklist(ret_stack_list); From d3ae33efb8e2f277f9007eb060c9d0b91ab38ae1 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 2 Jun 2009 12:34:31 +0100 Subject: [PATCH 777/900] pata_netcell: LBA48 force identify bits correct This matches Bartlomiej's patch for ide_pci_generic: c339dfdd65b52bfd947ab29d1210314a2f6d622d In the libata case netcell has its own mini driver. I suspect this fix is actually only needed for some firmware revs but it does no harm either way. 
Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/ata/pata_netcell.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/ata/pata_netcell.c b/drivers/ata/pata_netcell.c index bdb236957cb..9a698097134 100644 --- a/drivers/ata/pata_netcell.c +++ b/drivers/ata/pata_netcell.c @@ -20,13 +20,24 @@ /* No PIO or DMA methods needed for this device */ +static unsigned int netcell_read_id(struct ata_device *adev, + struct ata_taskfile *tf, u16 *id) +{ + unsigned int err_mask = ata_do_dev_read_id(adev, tf, id); + /* Firmware forgets to mark words 85-87 valid */ + if (err_mask == 0) + id[ATA_ID_CSF_DEFAULT] |= 0x0400; + return err_mask; +} + static struct scsi_host_template netcell_sht = { ATA_BMDMA_SHT(DRV_NAME), }; static struct ata_port_operations netcell_ops = { .inherits = &ata_bmdma_port_ops, - .cable_detect = ata_cable_80wire, + .cable_detect = ata_cable_80wire, + .read_id = netcell_read_id, }; From 05ad709d04799125ed85dd816fdb558258102172 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 2 Jun 2009 16:58:10 +0100 Subject: [PATCH 778/900] parport: quickfix the proc registration bug Ideally we should have a directory of drivers and a link to the 'active' driver. For now just show the first device which is effectively the existing semantics without a warning. This is an update on the original buggy patch that I then forgot to resubmit. Confusingly it was proposed by Red Hat, written by Etched Pixels fixed and submitted by Intel ... 
Resolves-Bug: http://bugzilla.kernel.org/show_bug.cgi?id=9749 Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/parport/share.c | 13 ++++++++++--- include/linux/parport.h | 4 ++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/parport/share.c b/drivers/parport/share.c index 0ebca450ed2..dffa5d4fb29 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -614,7 +614,10 @@ parport_register_device(struct parport *port, const char *name, * pardevice fields. -arca */ port->ops->init_state(tmp, tmp->state); - parport_device_proc_register(tmp); + if (!test_and_set_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags)) { + port->proc_device = tmp; + parport_device_proc_register(tmp); + } return tmp; out_free_all: @@ -646,10 +649,14 @@ void parport_unregister_device(struct pardevice *dev) } #endif - parport_device_proc_unregister(dev); - port = dev->port->physport; + if (port->proc_device == dev) { + port->proc_device = NULL; + clear_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags); + parport_device_proc_unregister(dev); + } + if (port->cad == dev) { printk(KERN_DEBUG "%s: %s forgot to release port\n", port->name, dev->name); diff --git a/include/linux/parport.h b/include/linux/parport.h index e1f83c5065c..38a423ed3c0 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -324,6 +324,10 @@ struct parport { int spintime; atomic_t ref_count; + unsigned long devflags; +#define PARPORT_DEVPROC_REGISTERED 0 + struct pardevice *proc_device; /* Currently register proc device */ + struct list_head full_list; struct parport *slaves[3]; }; From 82310a3272d5a2a7652f5649ad8a55f58c8f74d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 12:26:07 -0400 Subject: [PATCH 779/900] function-graph: enable the stack after initialization of other variables The function graph tracer checks if the task_struct has ret_stack defined to know if it is OK or not to use it. 
The initialization is done for all tasks by one process, but the idle tasks use the same initialization used by new tasks. If an interrupt happens on an idle task that just had the ret_stack created, but before the rest of the initialization took place, then we can corrupt the return address of the functions. This patch moves the setting of the task_struct's ret_stack to after the other variables have been initialized. [ Impact: prevent kernel panic on idle task when starting function graph ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- kernel/trace/trace_functions_graph.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ebff62ef40b..20e066065eb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2739,15 +2739,20 @@ void unregister_ftrace_graph(void) void ftrace_graph_init_task(struct task_struct *t) { if (atomic_read(&ftrace_graph_active)) { - t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); - if (!t->ret_stack) + if (!ret_stack) return; t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; } else t->ret_stack = NULL; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..baeb5fe3610 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) if (!current->ret_stack) return -EBUSY; + /* + * We must make sure the ret_stack is tested before we read + * anything else. 
+ */ + smp_rmb(); + /* The return trace stack is full */ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { atomic_inc(&current->trace_overrun); From 26c01624a2a40f8a4ddf6449b65c9b1c418d0e72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 14:01:19 -0400 Subject: [PATCH 780/900] function-graph: add memory barriers for accessing task's ret_stack The code that handles the tasks ret_stack allocation for every task assumes that only an interrupt can cause issues (even though interrupts are disabled). In reality, the code is allocating the ret_stack for tasks that may be running on other CPUs and there are not efficient memory barriers to handle this case. [ Impact: prevent crash due to using of uninitialized ret_stack variables ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 20e066065eb..1664d3f33d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2580,12 +2580,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - t->curr_ret_stack = -1; - /* Make sure IRQs see the -1 first: */ - barrier(); - t->ret_stack = ret_stack_list[start++]; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; } } while_each_thread(g, t); From f7e8b616ed1cc6f790b82324bce8a2a60295e5c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 16:39:48 -0400 Subject: [PATCH 781/900] function-graph: move initialization of new tasks up in fork When the function graph tracer is enabled, all new tasks must allocate a ret_stack to place the return address of functions. This is because the function graph tracer will replace the real return address with a call to the tracing of the exit function.
This initialization happens in fork, but it happens too late. If fork fails, then it will call free_task and that calls the freeing of this ret_stack. But before initialization happens, the new (failed) task points to its parents ret_stack. If a fork failure happens during the function trace, it would be catastrophic for the parent. Also, there's no need to call ftrace_graph_exit_task from fork, since it is called by free_task which fork calls on failure. [ Impact: prevent crash during failed fork running function graph tracer ] Signed-off-by: Steven Rostedt --- kernel/fork.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..c4b1e35c430 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + ftrace_graph_init_task(p); + rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING @@ -1131,8 +1133,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - ftrace_graph_init_task(p); - p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1141,7 +1141,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_graph; + goto bad_fork_free_pid; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? 
child_tidptr : NULL; @@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_graph; + goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { @@ -1268,8 +1268,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); return p; -bad_fork_free_graph: - ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); From 84047e360af0394ac5861d433f26bbcf30f77dd1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 16:51:55 -0400 Subject: [PATCH 782/900] function-graph: always initialize task ret_stack On creating a new task while running the function graph tracer, if we fail to allocate the ret_stack, and then fail the fork, the code will free the parent ret_stack. This is because the child duplicated the parent and currently points to the parent's ret_stack. This patch always initializes the task's ret_stack to NULL.
[ Impact: prevent crash of parent on low memory during fork ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1664d3f33d3..bb081f37cac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2738,6 +2738,9 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + if (atomic_read(&ftrace_graph_active)) { struct ftrace_ret_stack *ret_stack; @@ -2753,8 +2756,7 @@ void ftrace_graph_init_task(struct task_struct *t) /* make curr_ret_stack visable before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; - } else - t->ret_stack = NULL; + } } void ftrace_graph_exit_task(struct task_struct *t) From eb5f4ca9536ba297c98721ecbbdf41ec5b987bd5 Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Mon, 1 Jun 2009 09:19:37 +0100 Subject: [PATCH 783/900] [ARM] 5534/1: kmalloc must return a cache line aligned buffer Define ARCH_KMALLOC_MINALIGN in asm/cache.h At the request of Russell also move ARCH_SLAB_MINALIGN to this file. Signed-off-by: Martin Fuzzey Signed-off-by: Russell King --- arch/arm/include/asm/cache.h | 16 ++++++++++++++++ arch/arm/include/asm/page.h | 7 ------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h index cb7a9e97fd7..feaa75f0013 100644 --- a/arch/arm/include/asm/cache.h +++ b/arch/arm/include/asm/cache.h @@ -7,4 +7,20 @@ #define L1_CACHE_SHIFT 5 #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) +/* + * Memory returned by kmalloc() may be used for DMA, so we must make + * sure that all such allocations are cache aligned. Otherwise, + * unrelated code may cause parts of the buffer to be read into the + * cache before the transfer is done, causing old data to be seen by + * the CPU. 
+ */ +#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES + +/* + * With EABI on ARMv5 and above we must have 64-bit aligned slab pointers. + */ +#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5) +#define ARCH_SLAB_MINALIGN 8 +#endif + #endif diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index e6eb8a67b80..7b522770f29 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -202,13 +202,6 @@ typedef struct page *pgtable_t; (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -/* - * With EABI on ARMv5 and above we must have 64-bit aligned slab pointers. - */ -#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5) -#define ARCH_SLAB_MINALIGN 8 -#endif - #include #endif From 1946d6ef9d7bd4ba97094fe6eb68a9b877bde6b7 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 1 Jun 2009 12:50:33 +0100 Subject: [PATCH 784/900] [ARM] ARMv7 errata: only apply fixes when running on applicable CPU Currently, whenever an erratum workaround is enabled, it will be applied whether or not the erratum is relevant for the CPU. This patch changes this - we check the variant and revision fields in the main ID register to determine which errata to apply. We also avoid re-applying erratum 460075 if it has already been applied. Applying this fix in non-secure mode results in the kernel failing to boot (or even do anything.) This fixes booting on some ARMv7 based platforms which otherwise silently fail.
Acked-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/mm/proc-v7.S | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 3397f1e64d7..a08d9d2380d 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -184,23 +184,37 @@ __v7_setup: stmia r12, {r0-r5, r7, r9, r11, lr} bl v7_flush_dcache_all ldmia r12, {r0-r5, r7, r9, r11, lr} + + mrc p15, 0, r0, c0, c0, 0 @ read main ID register + and r10, r0, #0xff000000 @ ARM? + teq r10, #0x41000000 + bne 2f + and r5, r0, #0x00f00000 @ variant + and r6, r0, #0x0000000f @ revision + orr r0, r6, r5, lsr #20-4 @ combine variant and revision + #ifdef CONFIG_ARM_ERRATA_430973 - mrc p15, 0, r10, c1, c0, 1 @ read aux control register - orr r10, r10, #(1 << 6) @ set IBE to 1 - mcr p15, 0, r10, c1, c0, 1 @ write aux control register + teq r5, #0x00100000 @ only present in r1p* + mrceq p15, 0, r10, c1, c0, 1 @ read aux control register + orreq r10, r10, #(1 << 6) @ set IBE to 1 + mcreq p15, 0, r10, c1, c0, 1 @ write aux control register #endif #ifdef CONFIG_ARM_ERRATA_458693 - mrc p15, 0, r10, c1, c0, 1 @ read aux control register - orr r10, r10, #(1 << 5) @ set L1NEON to 1 - orr r10, r10, #(1 << 9) @ set PLDNOP to 1 - mcr p15, 0, r10, c1, c0, 1 @ write aux control register + teq r0, #0x20 @ only present in r2p0 + mrceq p15, 0, r10, c1, c0, 1 @ read aux control register + orreq r10, r10, #(1 << 5) @ set L1NEON to 1 + orreq r10, r10, #(1 << 9) @ set PLDNOP to 1 + mcreq p15, 0, r10, c1, c0, 1 @ write aux control register #endif #ifdef CONFIG_ARM_ERRATA_460075 - mrc p15, 1, r10, c9, c0, 2 @ read L2 cache aux ctrl register - orr r10, r10, #(1 << 22) @ set the Write Allocate disable bit - mcr p15, 1, r10, c9, c0, 2 @ write the L2 cache aux ctrl register + teq r0, #0x20 @ only present in r2p0 + mrceq p15, 1, r10, c9, c0, 2 @ read L2 cache aux ctrl register + tsteq r10, #1 << 22 + orreq r10, r10, #(1 << 22) @ set the Write 
Allocate disable bit + mcreq p15, 1, r10, c9, c0, 2 @ write the L2 cache aux ctrl register #endif - mov r10, #0 + +2: mov r10, #0 #ifdef HARVARD_CACHE mcr p15, 0, r10, c7, c5, 0 @ I+BTB cache invalidate #endif From 9fa7eb283c5cdc2b0f4a8cfe6387ed82e5e9a3d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 2 Jun 2009 20:07:25 -0700 Subject: [PATCH 785/900] Linux 2.6.30-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 739fd34a72a..610d1c332c4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 30 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* From 6799687a53a28536fd027ccb644833f66a778925 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 2 Jun 2009 08:23:58 +0200 Subject: [PATCH 786/900] x86, boot: add new generated files to the appropriate .gitignore files git status complains of untracked (generated) files in arch/x86/boot.. # Untracked files: # (use "git add ..." to include in what will be committed) # # ../../arch/x86/boot/compressed/mkpiggy # ../../arch/x86/boot/compressed/piggy.S # ../../arch/x86/boot/compressed/vmlinux.lds # ../../arch/x86/boot/voffset.h # ../../arch/x86/boot/zoffset.h ..so adjust .gitignore files accordingly. Signed-off-by: Mike Galbraith Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/.gitignore | 2 ++ arch/x86/boot/compressed/.gitignore | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 172cf8a98bd..851fe936d24 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -3,6 +3,8 @@ bzImage cpustr.h mkcpustr offsets.h +voffset.h +zoffset.h setup setup.bin setup.elf diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 63eff3b04d0..4a46fab7162 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,3 +1,6 @@ relocs vmlinux.bin.all vmlinux.relocs +vmlinux.lds +mkpiggy +piggy.S From 367d04c4ec02dad34d80452e32e3370db7fb6fee Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:48 +0200 Subject: [PATCH 787/900] amd_iommu: fix lock imbalance In alloc_coherent there is an omitted unlock on the path where mapping fails. Add the unlock. [ Impact: fix lock imbalance in alloc_coherent ] Signed-off-by: Jiri Slaby Cc: Joerg Roedel Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d6898833c36..9f89bb645b3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1541,8 +1541,10 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) + if (*dma_addr == bad_dma_address) { + spin_unlock_irqrestore(&domain->lock, flags); goto out_free; + } iommu_completion_wait(iommu); From e76afc4e7816a0a5300073098cdac93a994eb5ca Mon Sep 17 00:00:00 2001 From: Eric Lammerts Date: Tue, 19 May 2009 20:53:20 -0400 Subject: [PATCH 788/900] fix oops when using console=ttymxcN with N > 0 Signed-off-by: Eric Lammerts Signed-off-by: Sascha Hauer --- drivers/serial/imx.c | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c index 9f460b175c5..5f0be40dfda 100644 --- a/drivers/serial/imx.c +++ b/drivers/serial/imx.c @@ -1031,6 +1031,8 @@ imx_console_setup(struct console *co, char *options) if (co->index == -1 || co->index >= ARRAY_SIZE(imx_ports)) co->index = 0; sport = imx_ports[co->index]; + if(sport == NULL) + return -ENODEV; if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); From 6b4bfb87b638a4f114dfb6f72f4ac1be88a4ebe4 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 26 May 2009 22:31:46 +0530 Subject: [PATCH 789/900] mx[23]: don't put clock lookups in __initdata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the __initdata annotation for the clock lookups, since they will be needed when loading modules which use clk_get(). Tested-by: Agustín Ferrín Pozuelo Signed-off-by: Rabin Vincent Signed-off-by: Sascha Hauer --- arch/arm/mach-mx2/clock_imx21.c | 2 +- arch/arm/mach-mx2/clock_imx27.c | 2 +- arch/arm/mach-mx3/clock-imx35.c | 2 +- arch/arm/mach-mx3/clock.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-mx2/clock_imx21.c b/arch/arm/mach-mx2/clock_imx21.c index 999d013e06e..e4b08ca804e 100644 --- a/arch/arm/mach-mx2/clock_imx21.c +++ b/arch/arm/mach-mx2/clock_imx21.c @@ -890,7 +890,7 @@ static struct clk clko_clk = { .con_id = n, \ .clk = &c, \ }, -static struct clk_lookup lookups[] __initdata = { +static struct clk_lookup lookups[] = { /* It's unlikely that any driver wants one of them directly: _REGISTER_CLOCK(NULL, "ckih", ckih_clk) _REGISTER_CLOCK(NULL, "ckil", ckil_clk) diff --git a/arch/arm/mach-mx2/clock_imx27.c b/arch/arm/mach-mx2/clock_imx27.c index 3f7280c490f..2c971442f3f 100644 --- a/arch/arm/mach-mx2/clock_imx27.c +++ b/arch/arm/mach-mx2/clock_imx27.c @@ -621,7 +621,7 @@ DEFINE_CLOCK1(csi_clk, 0, 0, 0, parent, &csi_clk1, &per4_clk); .clk = &c, \ }, -static struct clk_lookup 
lookups[] __initdata = { +static struct clk_lookup lookups[] = { _REGISTER_CLOCK("imx-uart.0", NULL, uart1_clk) _REGISTER_CLOCK("imx-uart.1", NULL, uart2_clk) _REGISTER_CLOCK("imx-uart.2", NULL, uart3_clk) diff --git a/arch/arm/mach-mx3/clock-imx35.c b/arch/arm/mach-mx3/clock-imx35.c index 53a112d4e04..3c1e06f56dd 100644 --- a/arch/arm/mach-mx3/clock-imx35.c +++ b/arch/arm/mach-mx3/clock-imx35.c @@ -404,7 +404,7 @@ DEFINE_CLOCK(gpu2d_clk, 0, CCM_CGR3, 4, NULL, NULL); .clk = &c, \ }, -static struct clk_lookup lookups[] __initdata = { +static struct clk_lookup lookups[] = { _REGISTER_CLOCK(NULL, "asrc", asrc_clk) _REGISTER_CLOCK(NULL, "ata", ata_clk) _REGISTER_CLOCK(NULL, "audmux", audmux_clk) diff --git a/arch/arm/mach-mx3/clock.c b/arch/arm/mach-mx3/clock.c index 9957a11533a..a68fcf981ed 100644 --- a/arch/arm/mach-mx3/clock.c +++ b/arch/arm/mach-mx3/clock.c @@ -516,7 +516,7 @@ DEFINE_CLOCK(ipg_clk, 0, NULL, 0, ipg_get_rate, NULL, &ahb_clk); .clk = &c, \ }, -static struct clk_lookup lookups[] __initdata = { +static struct clk_lookup lookups[] = { _REGISTER_CLOCK(NULL, "emi", emi_clk) _REGISTER_CLOCK(NULL, "cspi", cspi1_clk) _REGISTER_CLOCK(NULL, "cspi", cspi2_clk) From 0e2595cdfd7df9f1128f7185152601ae5417483b Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 20 May 2009 08:10:57 -0500 Subject: [PATCH 790/900] x86: Fix UV BAU activation descriptor init The UV tlb shootdown code has a serious initialization error. An array of structures [32*8] is initialized as if it were [32]. The array is indexed by (cpu number on the blade)*8, so the short initialization works for up to 4 cpus on a blade. But above that, we provide an invalid opcode to the hub's broadcast assist unit. This patch changes the allocation of the array to use its symbolic dimensions for better clarity. And initializes all 32*8 entries. Shortened 'UV_ACTIVATION_DESCRIPTOR_SIZE' to 'UV_ADP_SIZE' per Ingo's recommendation. Tested on the UV simulator. 
Signed-off-by: Cliff Wickman Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 2 +- arch/x86/kernel/tlb_uv.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 9b0e61bf7a8..bddd44f2f0a 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -37,7 +37,7 @@ #define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 #define UV_NET_ENDPOINT_INTD 0x38 diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c33761e6..16f0fd4f18e 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode) struct bau_desc *adp; struct bau_desc *ad2; - adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + */ + adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ @@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode) (n << UV_DESC_BASE_PNODE_SHIFT | m)); } - for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 nodes. 
+ */ + for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; /* From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Tue, 2 Jun 2009 17:06:54 -0700 Subject: [PATCH 791/900] ring-buffer: fix bug in ring_buffer_discard_commit There's a bug in ring_buffer_discard_commit. The wrong pointer is being compared in order to check if the event can be freed from the buffer rather than discarded (i.e. marked as PAD). I noticed this when I was working on duration filtering. The bug is not deadly - it just results in lots of wasted space in the buffer. All filtered events are left in the buffer and marked as discarded, rather than being removed from the buffer to make space for other events. Unfortunately, when I fixed this bug, I got errors doing a filtered function trace. Multiple TIME_EXTEND events pile up in the buffer, and trigger the following loop overage warning in rb_iter_peek(): again: ... if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; I'm not sure what the best way is to fix this. I don't know if I should extend the loop threshold, or if I should make the test more complex (ignore TIME_EXTEND events), or just get rid of this loop check completely. Note that if I implement a workaround for this, then I see another problem from rb_advance_iter(). I haven't tracked that one down yet. In general, it seems like the case of removing filtered events has not been working properly, and so some assumptions about buffer invariant conditions need to be revisited. Here's the patch for the simple fix: Compare correct pointer for checking if an event can be freed rather than left as discarded in the buffer.
Signed-off-by: Tim Bird LKML-Reference: <4A25BE9E.5090909@am.sony.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b24d49604..94530236869 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, bpage = cpu_buffer->tail_page; - if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { /* * This is on the tail page. It is possible that * a write could come in and move the tail page From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 23:00:53 -0400 Subject: [PATCH 792/900] ring-buffer: try to discard unneeded timestamps There are times that a race may happen that we add a timestamp in a nested write. This timestamp would just contain a zero delta and serves no purpose. Now that we have a way to discard events, this patch will try to discard the timestamp instead of just wasting the space in the ring buffer. 
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 67 +++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 94530236869..50926601a28 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. 
+ */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + return 1; + } + + /* could not discard */ + return 0; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) @@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* let the caller know this was the commit */ ret = 1; } else { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - ret = 0; + /* Try to discard the event */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; + } } *delta = 0; @@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; int cpu; /* The event is discarded regardless */ @@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. 
- */ - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) - goto out; - } + if (!rb_try_to_discard(cpu_buffer, event)) + goto out; /* * The commit is still visible by the reader, so we From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 09:30:10 -0400 Subject: [PATCH 793/900] ring-buffer: discard timestamps that are at the start of the buffer Every buffer page in the ring buffer includes its own time stamp. When an event is recorded to the ring buffer with a delta time greater than what can be held in the event header, a time stamp event is created. If the created timestamp falls over to the next buffer page, it is redundant because the buffer page holds a full time stamp. This patch will try to discard the time stamp when it falls to the start of the next page. This change also fixes an issue with discarding events. If most events are discarded, timestamps will start to creep into the ring buffer. If we do not discard the timestamps then they can fill up the ring buffer over time and waste space. This change will keep time stamps from filling up over another page. If something is recorded in the buffer page, and the rest is filtered, then the time stamps can only fill up to the end of the page.
[ Impact: prevent time stamps from filling ring buffer ] Reported-by: Tim Bird Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50926601a28..7102d7a2fad 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = *delta >> TS_SHIFT; } else { cpu_buffer->commit_page->page->time_stamp = *ts; - event->time_delta = 0; - event->array[0] = 0; + /* try to discard, since we do not need this */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } } cpu_buffer->write_stamp = *ts; /* let the caller know this was the commit */ @@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - if (RB_WARN_ON(buffer, - iter->head_page == cpu_buffer->commit_page)) + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; @@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) /* * We repeat when a timestamp is encountered. It is possible * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. 
The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * as one timestamp is about to be written, or from discarded + * commits. The most that we can have is the number on a single page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * We repeat when a timestamp is encountered. + * We can get multiple timestamps by nested interrupts or also + * if filtering is on (discarding commits). Since discarding + * commits can be frequent we can get a lot of timestamps. + * But we limit them by not adding timestamps if they begin + * at the start of a page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:28 +0800 Subject: [PATCH 794/900] tracing/trace_stack: fix the number of entries in the header The last entry in the stack_dump_trace is ULONG_MAX, which is not a valid entry, but max_stack_trace.nr_entries has accounted for it. So when printing the header, we should decrease it by one. Before fix, print as following, for example: Depth Size Location (53 entries) <--- should be 52 ----- ---- -------- 0) 3264 108 update_wall_time+0x4d5/0x9a0 ... 51) 80 80 syscall_call+0x7/0xb ^^^ it's correct. 
Signed-off-by: walimis LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1796f00524e..2d7aebd71db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + max_stack_trace.nr_entries - 1); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:29 +0800 Subject: [PATCH 795/900] tracing/events: fix output format of kernel stack According to "events/ftrace/kernel_stack/format", output format of kernel stack should use "=>" instead of "<=". The second problem is that we shouldn't skip the first entry in the stack, although it seems to be duplicated when used in the "function" tracer, but events also use it. If we skip the first one, we will drop the topmost entry of the stack. The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should drop it, otherwise it will print a NULL name line. 
before fix: sh-1072 [000] 26.957239: sched_process_fork: parent sh:1072 child sh:1073 sh-1072 [000] 26.957262: <= syscall_call <= sh-1072 [000] 26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120] sh-1072 [000] 26.957752: <= preempt_schedule <= wake_up_new_task <= do_fork <= sys_clone <= syscall_call <= After fix: sh-1075 [000] 39.791848: sched_process_fork: parent sh:1075 child sh:1076 sh-1075 [000] 39.791871: => sys_clone => syscall_call sh-1075 [000] 39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120] sh-1075 [000] 39.792722: => schedule => preempt_schedule => wake_up_new_task => do_fork => sys_clone => syscall_call Signed-off-by: walimis LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0fe3b223f7e..64596a57160 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); + if (!trace_seq_puts(s, "\n")) + goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i]) + if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) break; - if (i) { - if (!trace_seq_puts(s, " <= ")) - goto partial; + if (!trace_seq_puts(s, " => ")) + goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) - goto partial; - } + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; if (!trace_seq_puts(s, "\n")) goto partial; } From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:30 +0800 Subject: [PATCH 796/900] tracing/events: fix output format of user stack According to "events/ftrace/user_stack/format", fix the output of user stack. 
before fix: sh-1073 [000] 31.137561: <- <0804e33c> <- <080835c1> after fix: sh-1072 [000] 37.039329: => => <0804e33c> => <080835c1> Signed-off-by: walimis LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64596a57160..8dadbbbd2d5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ip == ULONG_MAX || !ret) break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); + if (ret) + ret = trace_seq_puts(s, " => "); if (!ip) { if (ret) ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "\n"); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); + ret = trace_seq_puts(s, "\n"); } if (mm) @@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_userip_objs(field, s, flags)) + if (!trace_seq_putc(s, '\n')) goto partial; - if (!trace_seq_putc(s, '\n')) + if (!seq_print_userip_objs(field, s, flags)) goto partial; return TRACE_TYPE_HANDLED; From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Jun 2009 14:52:03 +0100 Subject: [PATCH 797/900] tracing: fix multiple use of __print_flags and __print_symbolic Here is an updated patch to include the extra call to trace_seq_init() as requested. This is vs. the latest -tip tree and fixes the use of multiple __print_flags and __print_symbolic in a single tracer. 
Also tested to ensure its working now: mount.gfs2-2534 [000] 235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR mount.gfs2-2534 [000] 235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI mount.gfs2-2534 [000] 235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX glock_workqueue-2529 [000] 235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI glock_workqueue-2529 [000] 235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I Signed-off-by: Steven Whitehouse LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 2 ++ kernel/trace/trace_output.c | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b5478dab579..40ede4db4d8 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -104,6 +104,7 @@ * field = (typeof(field))entry; * * p = get_cpu_var(ftrace_event_seq); + * trace_seq_init(p); * ret = trace_seq_printf(s, "\n"); * put_cpu(); * if (!ret) @@ -167,6 +168,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ field = (typeof(field))entry; \ \ p = &get_cpu_var(ftrace_event_seq); \ + trace_seq_init(p); \ ret = trace_seq_printf(s, #call ": " print); \ put_cpu(); \ if (!ret) \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8dadbbbd2d5..8afeea412e7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; + const char *ret = p->buffer + p->len; int i; - trace_seq_init(p); - for (i = 0; flag_array[i].name && flags; i++) { mask = flag_array[i].mask; @@ -249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_flags_seq); @@ -258,8 +257,7 @@ 
ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - - trace_seq_init(p); + const char *ret = p->buffer + p->len; for (i = 0; symbol_array[i].name; i++) { @@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_symbols_seq); From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 11:10:44 -0400 Subject: [PATCH 798/900] tracing: add annotation to what type of stack trace is recorded The current method of printing out a stack trace is to add a new line and print out the trace: yum-updatesd-3120 [002] 573.691303: => do_softirq => irq_exit => smp_apic_timer_interrupt => apic_timer_interrupt This looks a bit awkward, and if we have both stack and user stack traces running, it would be nice to have a title to tell them apart, although it is easy to tell by the output. 
This patch adds an annotation to the start of the stack traces: init-1 [003] 929.304979: <stack trace> => user_path_at => vfs_fstatat => vfs_stat => sys_newstat => system_call_fastpath cat-3459 [002] 1016.824040: <user stack trace> => <0000003aae6c0250> => <00007ffff4b06ae4> => <69636172742f6775> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8afeea412e7..425725c1622 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_puts(s, "<stack trace>\n")) goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) @@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_putc(s, '\n')) + if (!trace_seq_puts(s, "<user stack trace>\n")) goto partial; if (!seq_print_userip_objs(field, s, flags)) From c499b0672f8df9379764965c5ec124751699d7c4 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Fri, 3 Apr 2009 14:41:56 +0200 Subject: [PATCH 799/900] mxcmmc: decrease minimum frequency to make MMC cards work This is a temporary workaround until the MMC stack can be fixed.
Signed-off-by: Sascha Hauer Signed-off-by: Pierre Ossman --- drivers/mmc/host/mxcmmc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index b4a615c55f2..5950102113f 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -724,7 +724,9 @@ static int mxcmci_probe(struct platform_device *pdev) goto out_clk_put; } - mmc->f_min = clk_get_rate(host->clk) >> 7; + mmc->f_min = clk_get_rate(host->clk) >> 16; + if (mmc->f_min < 400000) + mmc->f_min = 400000; mmc->f_max = clk_get_rate(host->clk) >> 1; /* recommended in data sheet */ From 85b843227a9b8c1a27ebd354a80c89aef067f2ca Mon Sep 17 00:00:00 2001 From: Anand Gadiyar Date: Wed, 15 Apr 2009 17:44:58 +0530 Subject: [PATCH 800/900] omap_hsmmc: Trivial fix for a typo in comment Signed-off-by: Anand Gadiyar Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index e62a22a7f00..c40cb96255a 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -680,7 +680,7 @@ static void mmc_omap_dma_cb(int lch, u16 ch_status, void *data) host->dma_ch = -1; /* * DMA Callback: run in interrupt context. - * mutex_unlock will through a kernel warning if used. + * mutex_unlock will throw a kernel warning if used. */ up(&host->sem); } From 18489fa2ba4c170d96ffc1a41f7b9002dcb983b7 Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Thu, 16 Apr 2009 22:00:36 +0200 Subject: [PATCH 801/900] mxcmmc : Reset the SDHC hardware if software timeout occurs. When a software timeout occurs in polling mode hardware was left in an indeterminate state causing subsequent operations to block. 
Signed-off-by: Martin Fuzzey --- drivers/mmc/host/mxcmmc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index 5950102113f..dcc9cdb2a4d 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -140,6 +140,8 @@ struct mxcmci_host { struct work_struct datawork; }; +static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios); + static inline int mxcmci_use_dma(struct mxcmci_host *host) { return host->do_dma; @@ -345,8 +347,11 @@ static int mxcmci_poll_status(struct mxcmci_host *host, u32 mask) stat = readl(host->base + MMC_REG_STATUS); if (stat & STATUS_ERR_MASK) return stat; - if (time_after(jiffies, timeout)) + if (time_after(jiffies, timeout)) { + mxcmci_softreset(host); + mxcmci_set_clk_rate(host, host->clock); return STATUS_TIME_OUT_READ; + } if (stat & mask) return 0; cpu_relax(); From 656217d28480f63313a488f6973980f9fbb921a1 Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Thu, 16 Apr 2009 22:00:41 +0200 Subject: [PATCH 802/900] mxcmmc: Fix missing return value checking in DMA setup code. 
Signed-off-by: Martin Fuzzey --- drivers/mmc/host/mxcmmc.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index dcc9cdb2a4d..f4cbe473670 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -162,7 +162,7 @@ static void mxcmci_softreset(struct mxcmci_host *host) writew(0xff, host->base + MMC_REG_RES_TO); } -static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) +static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) { unsigned int nob = data->blocks; unsigned int blksz = data->blksz; @@ -170,6 +170,7 @@ static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) #ifdef HAS_DMA struct scatterlist *sg; int i; + int ret; #endif if (data->flags & MMC_DATA_STREAM) nob = 0xffff; @@ -185,7 +186,7 @@ static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) for_each_sg(data->sg, sg, data->sg_len, i) { if (sg->offset & 3 || sg->length & 3) { host->do_dma = 0; - return; + return 0; } } @@ -194,23 +195,30 @@ static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) host->dma_nents = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len, host->dma_dir); - imx_dma_setup_sg(host->dma, data->sg, host->dma_nents, datasize, - host->res->start + MMC_REG_BUFFER_ACCESS, - DMA_MODE_READ); + ret = imx_dma_setup_sg(host->dma, data->sg, host->dma_nents, + datasize, + host->res->start + MMC_REG_BUFFER_ACCESS, + DMA_MODE_READ); } else { host->dma_dir = DMA_TO_DEVICE; host->dma_nents = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len, host->dma_dir); - imx_dma_setup_sg(host->dma, data->sg, host->dma_nents, datasize, - host->res->start + MMC_REG_BUFFER_ACCESS, - DMA_MODE_WRITE); + ret = imx_dma_setup_sg(host->dma, data->sg, host->dma_nents, + datasize, + host->res->start + MMC_REG_BUFFER_ACCESS, + DMA_MODE_WRITE); } + if (ret) { + 
dev_err(mmc_dev(host->mmc), "failed to setup DMA : %d\n", ret); + return ret; + } wmb(); imx_dma_enable(host->dma); #endif /* HAS_DMA */ + return 0; } static int mxcmci_start_cmd(struct mxcmci_host *host, struct mmc_command *cmd, @@ -536,6 +544,7 @@ static void mxcmci_request(struct mmc_host *mmc, struct mmc_request *req) { struct mxcmci_host *host = mmc_priv(mmc); unsigned int cmdat = host->cmdat; + int error; WARN_ON(host->req != NULL); @@ -545,7 +554,12 @@ static void mxcmci_request(struct mmc_host *mmc, struct mmc_request *req) host->do_dma = 1; #endif if (req->data) { - mxcmci_setup_data(host, req->data); + error = mxcmci_setup_data(host, req->data); + if (error) { + req->cmd->error = error; + goto out; + } + cmdat |= CMD_DAT_CONT_DATA_ENABLE; @@ -553,7 +567,9 @@ static void mxcmci_request(struct mmc_host *mmc, struct mmc_request *req) cmdat |= CMD_DAT_CONT_WRITE; } - if (mxcmci_start_cmd(host, req->cmd, cmdat)) + error = mxcmci_start_cmd(host, req->cmd, cmdat); +out: + if (error) mxcmci_finish_request(host, req); } From 703aaced2b9c9a98285f265f3444c2f89d9d4d19 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sat, 9 May 2009 01:03:52 -0400 Subject: [PATCH 803/900] mvsdio: allow automatic loading when modular Signed-off-by: Nicolas Pitre Tested-by: Martin Michlmayr Signed-off-by: Pierre Ossman --- drivers/mmc/host/mvsdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index c643d0fe118..1783043a26a 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -882,3 +882,4 @@ module_param(nodma, int, 0); MODULE_AUTHOR("Maen Suleiman, Nicolas Pitre"); MODULE_DESCRIPTION("Marvell MMC,SD,SDIO Host Controller driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:mvsdio"); From 992697e9b342115dcf052ffa41d418cb4fe1a841 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 8 May 2009 08:52:49 -0500 Subject: [PATCH 804/900] sdhci-of: Add fsl,esdhc as a valid compatible to bind against We plan to 
use fsl,esdhc going forward as the base compatible so update the driver to bind against it. Signed-off-by: Kumar Gala Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci-of.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-of.c b/drivers/mmc/host/sdhci-of.c index 3ff4ac3abe8..09cc597c631 100644 --- a/drivers/mmc/host/sdhci-of.c +++ b/drivers/mmc/host/sdhci-of.c @@ -277,6 +277,7 @@ static int __devexit sdhci_of_remove(struct of_device *ofdev) static const struct of_device_id sdhci_of_match[] = { { .compatible = "fsl,mpc8379-esdhc", .data = &sdhci_esdhc, }, { .compatible = "fsl,mpc8536-esdhc", .data = &sdhci_esdhc, }, + { .compatible = "fsl,esdhc", .data = &sdhci_esdhc, }, { .compatible = "generic-sdhci", }, {}, }; From e749c6f21fd7dc618f61dd178b4ee739c3cb1c31 Mon Sep 17 00:00:00 2001 From: Ben Nizette Date: Thu, 16 Apr 2009 15:55:21 +1000 Subject: [PATCH 805/900] mmc/omap: Use disable_irq_nosync() from within irq handlers. disable_irq() should wait for all running handlers to complete before returning. As such, if it's used to disable an interrupt from that interrupt's handler it will deadlock. This replaces the dangerous instances with the _nosync() variant which doesn't have this problem. 
Signed-off-by: Ben Nizette Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index bfa25c01c87..dceb5ee3bda 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -822,7 +822,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) del_timer(&host->cmd_abort_timer); host->abort = 1; OMAP_MMC_WRITE(host, IE, 0); - disable_irq(host->irq); + disable_irq_nosync(host->irq); schedule_work(&host->cmd_abort_work); return IRQ_HANDLED; } From 9ca6944cbfad11f2368cf10292e7f3eb036386c2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 14 May 2009 21:28:05 -0400 Subject: [PATCH 806/900] mvsdio: ignore high speed timing requests from the core Empirical evidence shows that this is causing far more problems than it solves when this mode is enabled in the host hardware. Amongst those cards that are known to be non-functional when this bit is set are: A-Data "Speedy" 2GB SD card Kodak 512MB SD card Ativa 1GB MicroSD card Marvell 8688 (WIFI/Bluetooth) SDIO card Since those cards do work on other host controllers which do honor the hs timing, the issue must be with this particular host hardware. Signed-off-by: Nicolas Pitre Signed-off-by: Pierre Ossman --- drivers/mmc/host/mvsdio.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 1783043a26a..9d3cfa9909c 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -620,9 +620,18 @@ static void mvsd_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) if (ios->bus_width == MMC_BUS_WIDTH_4) ctrl_reg |= MVSD_HOST_CTRL_DATA_WIDTH_4_BITS; + /* + * The HI_SPEED_EN bit is causing trouble with many (but not all) + * high speed SD, SDHC and SDIO cards. Not enabling that bit + * makes all cards work.
So let's just ignore that bit for now + * and revisit this issue if problems for not enabling this bit + * are ever reported. + */ +#if 0 if (ios->timing == MMC_TIMING_MMC_HS || ios->timing == MMC_TIMING_SD_HS) ctrl_reg |= MVSD_HOST_CTRL_HI_SPEED_EN; +#endif host->ctrl = ctrl_reg; mvsd_write(MVSD_HOST_CTRL, ctrl_reg); From a6d297f008e124d0bb4312369191b012c10a1a4e Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 26 May 2009 22:35:34 -0400 Subject: [PATCH 807/900] mvsdio: fix config failure with some high speed SDHC cards Especially with Sandisk SDHC cards, the second SWITCH command was failing with a timeout and the card was not recognized at all. However if the system was busy, or debugging was enabled, or a udelay(100) was inserted before the second SWITCH command in the core code, then the timing was so that the card started to work. With some unusual block sizes, the data FIFO status doesn't indicate a "empty" state right away when the data transfer is done. Queuing another data transfer in that condition results in a transfer timeout. The empty FIFO bit eventually get set by itself in less than 50 usecs when it is not set right away. So let's just poll for that bit before configuring the controller with a new data transfer. Signed-off-by: Nicolas Pitre Signed-off-by: Pierre Ossman --- drivers/mmc/host/mvsdio.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 9d3cfa9909c..b56d72ff06e 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -64,6 +64,31 @@ static int mvsd_setup_data(struct mvsd_host *host, struct mmc_data *data) unsigned int tmout; int tmout_index; + /* + * Hardware weirdness. The FIFO_EMPTY bit of the HW_STATE + * register is sometimes not set before a while when some + * "unusual" data block sizes are used (such as with the SWITCH + * command), even despite the fact that the XFER_DONE interrupt + * was raised. 
And if another data transfer starts before + * this bit comes to good sense (which eventually happens by + * itself) then the new transfer simply fails with a timeout. + */ + if (!(mvsd_read(MVSD_HW_STATE) & (1 << 13))) { + unsigned long t = jiffies + HZ; + unsigned int hw_state, count = 0; + do { + if (time_after(jiffies, t)) { + dev_warn(host->dev, "FIFO_EMPTY bit missing\n"); + break; + } + hw_state = mvsd_read(MVSD_HW_STATE); + count++; + } while (!(hw_state & (1 << 13))); + dev_dbg(host->dev, "*** wait for FIFO_EMPTY bit " + "(hw=0x%04x, count=%d, jiffies=%ld)\n", + hw_state, count, jiffies - (t - HZ)); + } + /* If timeout=0 then maximum timeout index is used. */ tmout = DIV_ROUND_UP(data->timeout_ns, host->ns_per_clk); tmout += data->timeout_clks; From fbf6a5fcbcc2248f1e676f7a0a7d49cd4b535d2a Mon Sep 17 00:00:00 2001 From: Dave Liu Date: Wed, 6 May 2009 18:40:07 +0800 Subject: [PATCH 808/900] sdhci-of: Fix the wrong accessor to HOSTVER register The Freescale eSDHC controller uses a special byte order for the HOST version register, which is not the same as that of the other registers. The address of HOSTVER in the spec is 0xFE, and we need to use in_be16(0xFE) to access it, not in_be16(0xFC).
Signed-off-by: Dave Liu Acked-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci-of.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-of.c b/drivers/mmc/host/sdhci-of.c index 09cc597c631..128c614d11a 100644 --- a/drivers/mmc/host/sdhci-of.c +++ b/drivers/mmc/host/sdhci-of.c @@ -55,7 +55,13 @@ static u32 esdhc_readl(struct sdhci_host *host, int reg) static u16 esdhc_readw(struct sdhci_host *host, int reg) { - return in_be16(host->ioaddr + (reg ^ 0x2)); + u16 ret; + + if (unlikely(reg == SDHCI_HOST_VERSION)) + ret = in_be16(host->ioaddr + reg); + else + ret = in_be16(host->ioaddr + (reg ^ 0x2)); + return ret; } static u8 esdhc_readb(struct sdhci_host *host, int reg) From b8da7de56ca0ad34726478a50d138a29a9ff76cb Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 2 Jun 2009 16:50:35 +1000 Subject: [PATCH 809/900] drm: fix irq naming for kms drivers. allocating devname in the i915 driver was a hack originally and I forgot to figure out how to do this properly back then. So this is the cleaner version that just picks devname or driver name in the irq code. It removes the devname allocs from the i915 driver. 
Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_irq.c | 8 +++++++- drivers/gpu/drm/i915/i915_dma.c | 12 ++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index 93e677a481f..fc8e5acd9d9 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -196,6 +196,7 @@ int drm_irq_install(struct drm_device *dev) { int ret = 0; unsigned long sh_flags = 0; + char *irqname; if (!drm_core_check_feature(dev, DRIVER_HAVE_IRQ)) return -EINVAL; @@ -227,8 +228,13 @@ int drm_irq_install(struct drm_device *dev) if (drm_core_check_feature(dev, DRIVER_IRQ_SHARED)) sh_flags = IRQF_SHARED; + if (dev->devname) + irqname = dev->devname; + else + irqname = dev->driver->name; + ret = request_irq(drm_dev_to_irq(dev), dev->driver->irq_handler, - sh_flags, dev->devname, dev); + sh_flags, irqname, dev); if (ret < 0) { mutex_lock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 53d54455262..0ccb63ee50e 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -987,12 +987,6 @@ static int i915_load_modeset_init(struct drm_device *dev) int fb_bar = IS_I9XX(dev) ? 2 : 0; int ret = 0; - dev->devname = kstrdup(DRIVER_NAME, GFP_KERNEL); - if (!dev->devname) { - ret = -ENOMEM; - goto out; - } - dev->mode_config.fb_base = drm_get_resource_start(dev, fb_bar) & 0xff000000; @@ -1006,7 +1000,7 @@ static int i915_load_modeset_init(struct drm_device *dev) ret = i915_probe_agp(dev, &agp_size, &prealloc_size); if (ret) - goto kfree_devname; + goto out; /* Basic memrange allocator for stolen space (aka vram) */ drm_mm_init(&dev_priv->vram, 0, prealloc_size); @@ -1024,7 +1018,7 @@ static int i915_load_modeset_init(struct drm_device *dev) ret = i915_gem_init_ringbuffer(dev); if (ret) - goto kfree_devname; + goto out; /* Allow hardware batchbuffers unless told otherwise. 
*/ @@ -1056,8 +1050,6 @@ static int i915_load_modeset_init(struct drm_device *dev) destroy_ringbuffer: i915_gem_cleanup_ringbuffer(dev); -kfree_devname: - kfree(dev->devname); out: return ret; } From 9863871bd1bbf218b921af5e0bc48ca4f6ea9f12 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 4 Jun 2009 07:08:13 +1000 Subject: [PATCH 810/900] drm/radeon: fix ring free alignment calculations fd.o bz#21849 We were aligning to +16 dwords, instead of to the next 16dword boundary in the ring. Fix the calculation to go to the next 16dword boundary when space checking. Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_cp.c | 4 ++-- drivers/gpu/drm/radeon/radeon_drv.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_cp.c b/drivers/gpu/drm/radeon/radeon_cp.c index 77a7a4d8465..aff90bb9648 100644 --- a/drivers/gpu/drm/radeon/radeon_cp.c +++ b/drivers/gpu/drm/radeon/radeon_cp.c @@ -2185,9 +2185,9 @@ void radeon_commit_ring(drm_radeon_private_t *dev_priv) /* check if the ring is padded out to 16-dword alignment */ - tail_aligned = dev_priv->ring.tail & 0xf; + tail_aligned = dev_priv->ring.tail & (RADEON_RING_ALIGN-1); if (tail_aligned) { - int num_p2 = 16 - tail_aligned; + int num_p2 = RADEON_RING_ALIGN - tail_aligned; ring = dev_priv->ring.start; /* pad with some CP_PACKET2 */ diff --git a/drivers/gpu/drm/radeon/radeon_drv.h b/drivers/gpu/drm/radeon/radeon_drv.h index 8071d965f14..0c6bfc1de15 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.h +++ b/drivers/gpu/drm/radeon/radeon_drv.h @@ -1964,11 +1964,14 @@ do { \ #define RING_LOCALS int write, _nr, _align_nr; unsigned int mask; u32 *ring; +#define RADEON_RING_ALIGN 16 + #define BEGIN_RING( n ) do { \ if ( RADEON_VERBOSE ) { \ DRM_INFO( "BEGIN_RING( %d )\n", (n)); \ } \ - _align_nr = (n + 0xf) & ~0xf; \ + _align_nr = RADEON_RING_ALIGN - ((dev_priv->ring.tail + n) & (RADEON_RING_ALIGN-1)); \ + _align_nr += n; \ if (dev_priv->ring.space <= (_align_nr * sizeof(u32))) 
{ \ COMMIT_RING(); \ radeon_wait_ring( dev_priv, _align_nr * sizeof(u32)); \ From 75185c929ed241f5cf1aa28999b8012181e2c7cb Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Sat, 30 May 2009 20:42:25 -0700 Subject: [PATCH 811/900] drm: add newlines to text sysfs files The contents of various simple text files in sysfs should end with a newline to make them easier to read from the console. Signed-off-by: Keith Packard Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c index 8f9372921f8..182bdf99cf7 100644 --- a/drivers/gpu/drm/drm_sysfs.c +++ b/drivers/gpu/drm/drm_sysfs.c @@ -147,7 +147,7 @@ static ssize_t status_show(struct device *device, enum drm_connector_status status; status = connector->funcs->detect(connector); - return snprintf(buf, PAGE_SIZE, "%s", + return snprintf(buf, PAGE_SIZE, "%s\n", drm_get_connector_status_name(status)); } @@ -166,7 +166,7 @@ static ssize_t dpms_show(struct device *device, if (ret) return 0; - return snprintf(buf, PAGE_SIZE, "%s", + return snprintf(buf, PAGE_SIZE, "%s\n", drm_get_dpms_name((int)dpms_status)); } @@ -176,7 +176,7 @@ static ssize_t enabled_show(struct device *device, { struct drm_connector *connector = to_drm_connector(device); - return snprintf(buf, PAGE_SIZE, connector->encoder ? "enabled" : + return snprintf(buf, PAGE_SIZE, "%s\n", connector->encoder ? "enabled" : "disabled"); } From e36ebaf49274ffa78f17b62bcae4c92c33b5b391 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Sat, 30 May 2009 20:42:26 -0700 Subject: [PATCH 812/900] drm: set permissions on edid file to 0444 Without initializing the sysfs attributes for the edid file, it was created with mode 0, making it difficult for applications to use. 
Signed-off-by: Keith Packard Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c index 182bdf99cf7..9987ab88083 100644 --- a/drivers/gpu/drm/drm_sysfs.c +++ b/drivers/gpu/drm/drm_sysfs.c @@ -317,6 +317,7 @@ static struct device_attribute connector_attrs_opt1[] = { static struct bin_attribute edid_attr = { .attr.name = "edid", + .attr.mode = 0444, .size = 128, .read = edid_show, }; From c9fb15f60eb517c958dec64dca9357bf62bf2201 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Sat, 30 May 2009 20:42:28 -0700 Subject: [PATCH 813/900] drm: Hook up DPMS property handling in drm_crtc.c. Add drm_helper_connector_dpms. Making the drm_crtc.c code recognize the DPMS property and invoke the connector->dpms function doesn't remove any capability from the driver while reducing code duplication. That just highlighted the problem with the existing DPMS functions which could turn off the connector, but failed to turn off any relevant crtcs. The new drm_helper_connector_dpms function manages all of that, using the drm_helper-specific crtc and encoder dpms functions, automatically computing the appropriate DPMS level for each object in the system. This fixes the current troubles in the i915 driver which left PLLs, pipes and planes running while in DPMS_OFF mode or even while they were unused. 
Signed-off-by: Keith Packard Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_crtc.c | 7 +- drivers/gpu/drm/drm_crtc_helper.c | 109 +++++++++++++++++++++++++++++- drivers/gpu/drm/i915/intel_crt.c | 6 +- drivers/gpu/drm/i915/intel_dvo.c | 1 + drivers/gpu/drm/i915/intel_hdmi.c | 1 + drivers/gpu/drm/i915/intel_lvds.c | 6 +- drivers/gpu/drm/i915/intel_sdvo.c | 1 + drivers/gpu/drm/i915/intel_tv.c | 1 + include/drm/drm_crtc.h | 3 + include/drm/drm_crtc_helper.h | 2 + 10 files changed, 124 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 94a76887173..8fab7890a36 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -2294,7 +2294,12 @@ int drm_mode_connector_property_set_ioctl(struct drm_device *dev, } } - if (connector->funcs->set_property) + /* Do DPMS ourselves */ + if (property == connector->dev->mode_config.dpms_property) { + if (connector->funcs->dpms) + (*connector->funcs->dpms)(connector, (int) out_resp->value); + ret = 0; + } else if (connector->funcs->set_property) ret = connector->funcs->set_property(connector, property, out_resp->value); /* store the property value if succesful */ diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c index 45890447fee..a6f73f1e99d 100644 --- a/drivers/gpu/drm/drm_crtc_helper.c +++ b/drivers/gpu/drm/drm_crtc_helper.c @@ -198,6 +198,29 @@ static void drm_helper_add_std_modes(struct drm_device *dev, } } +/** + * drm_helper_encoder_in_use - check if a given encoder is in use + * @encoder: encoder to check + * + * LOCKING: + * Caller must hold mode config lock. + * + * Walk @encoders's DRM device's mode_config and see if it's in use. + * + * RETURNS: + * True if @encoder is part of the mode_config, false otherwise. 
+ */ +bool drm_helper_encoder_in_use(struct drm_encoder *encoder) +{ + struct drm_connector *connector; + struct drm_device *dev = encoder->dev; + list_for_each_entry(connector, &dev->mode_config.connector_list, head) + if (connector->encoder == encoder) + return true; + return false; +} +EXPORT_SYMBOL(drm_helper_encoder_in_use); + /** * drm_helper_crtc_in_use - check if a given CRTC is in a mode_config * @crtc: CRTC to check @@ -216,7 +239,7 @@ bool drm_helper_crtc_in_use(struct drm_crtc *crtc) struct drm_device *dev = crtc->dev; /* FIXME: Locking around list access? */ list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) - if (encoder->crtc == crtc) + if (encoder->crtc == crtc && drm_helper_encoder_in_use(encoder)) return true; return false; } @@ -240,7 +263,7 @@ void drm_helper_disable_unused_functions(struct drm_device *dev) list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) { encoder_funcs = encoder->helper_private; - if (!encoder->crtc) + if (!drm_helper_encoder_in_use(encoder)) (*encoder_funcs->dpms)(encoder, DRM_MODE_DPMS_OFF); } @@ -935,6 +958,88 @@ bool drm_helper_initial_config(struct drm_device *dev) } EXPORT_SYMBOL(drm_helper_initial_config); +static int drm_helper_choose_encoder_dpms(struct drm_encoder *encoder) +{ + int dpms = DRM_MODE_DPMS_OFF; + struct drm_connector *connector; + struct drm_device *dev = encoder->dev; + + list_for_each_entry(connector, &dev->mode_config.connector_list, head) + if (connector->encoder == encoder) + if (connector->dpms < dpms) + dpms = connector->dpms; + return dpms; +} + +static int drm_helper_choose_crtc_dpms(struct drm_crtc *crtc) +{ + int dpms = DRM_MODE_DPMS_OFF; + struct drm_connector *connector; + struct drm_device *dev = crtc->dev; + + list_for_each_entry(connector, &dev->mode_config.connector_list, head) + if (connector->encoder && connector->encoder->crtc == crtc) + if (connector->dpms < dpms) + dpms = connector->dpms; + return dpms; +} + +/** + * drm_helper_connector_dpms + * 
@connector affected connector + * @mode DPMS mode + * + * Calls the low-level connector DPMS function, then + * calls appropriate encoder and crtc DPMS functions as well + */ +void drm_helper_connector_dpms(struct drm_connector *connector, int mode) +{ + struct drm_encoder *encoder = connector->encoder; + struct drm_crtc *crtc = encoder ? encoder->crtc : NULL; + int old_dpms; + + if (mode == connector->dpms) + return; + + old_dpms = connector->dpms; + connector->dpms = mode; + + /* from off to on, do crtc then encoder */ + if (mode < old_dpms) { + if (crtc) { + struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; + if (crtc_funcs->dpms) + (*crtc_funcs->dpms) (crtc, + drm_helper_choose_crtc_dpms(crtc)); + } + if (encoder) { + struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private; + if (encoder_funcs->dpms) + (*encoder_funcs->dpms) (encoder, + drm_helper_choose_encoder_dpms(encoder)); + } + } + + /* from on to off, do encoder then crtc */ + if (mode > old_dpms) { + if (encoder) { + struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private; + if (encoder_funcs->dpms) + (*encoder_funcs->dpms) (encoder, + drm_helper_choose_encoder_dpms(encoder)); + } + if (crtc) { + struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; + if (crtc_funcs->dpms) + (*crtc_funcs->dpms) (crtc, + drm_helper_choose_crtc_dpms(crtc)); + } + } + + return; +} +EXPORT_SYMBOL(drm_helper_connector_dpms); + /** * drm_hotplug_stage_two * @dev DRM device diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 640f5158eff..79acc4f4c1f 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -381,11 +381,6 @@ static int intel_crt_set_property(struct drm_connector *connector, struct drm_property *property, uint64_t value) { - struct drm_device *dev = connector->dev; - - if (property == dev->mode_config.dpms_property && connector->encoder) - intel_crt_dpms(connector->encoder, 
(uint32_t)(value & 0xf)); - return 0; } @@ -402,6 +397,7 @@ static const struct drm_encoder_helper_funcs intel_crt_helper_funcs = { }; static const struct drm_connector_funcs intel_crt_connector_funcs = { + .dpms = drm_helper_connector_dpms, .detect = intel_crt_detect, .fill_modes = drm_helper_probe_single_connector_modes, .destroy = intel_crt_destroy, diff --git a/drivers/gpu/drm/i915/intel_dvo.c b/drivers/gpu/drm/i915/intel_dvo.c index 8b8d6e65cd3..1ee3007d6ec 100644 --- a/drivers/gpu/drm/i915/intel_dvo.c +++ b/drivers/gpu/drm/i915/intel_dvo.c @@ -316,6 +316,7 @@ static const struct drm_encoder_helper_funcs intel_dvo_helper_funcs = { }; static const struct drm_connector_funcs intel_dvo_connector_funcs = { + .dpms = drm_helper_connector_dpms, .save = intel_dvo_save, .restore = intel_dvo_restore, .detect = intel_dvo_detect, diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c index d0983bb93a1..7d6bdd70532 100644 --- a/drivers/gpu/drm/i915/intel_hdmi.c +++ b/drivers/gpu/drm/i915/intel_hdmi.c @@ -219,6 +219,7 @@ static const struct drm_encoder_helper_funcs intel_hdmi_helper_funcs = { }; static const struct drm_connector_funcs intel_hdmi_connector_funcs = { + .dpms = drm_helper_connector_dpms, .save = intel_hdmi_save, .restore = intel_hdmi_restore, .detect = intel_hdmi_detect, diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 53731f0ffcb..c92a64ac854 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -343,11 +343,6 @@ static int intel_lvds_set_property(struct drm_connector *connector, struct drm_property *property, uint64_t value) { - struct drm_device *dev = connector->dev; - - if (property == dev->mode_config.dpms_property && connector->encoder) - intel_lvds_dpms(connector->encoder, (uint32_t)(value & 0xf)); - return 0; } @@ -366,6 +361,7 @@ static const struct drm_connector_helper_funcs intel_lvds_connector_helper_funcs }; static const struct 
drm_connector_funcs intel_lvds_connector_funcs = { + .dpms = drm_helper_connector_dpms, .save = intel_lvds_save, .restore = intel_lvds_restore, .detect = intel_lvds_detect, diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index f3ef6bfd8ff..3093b4d4a4d 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -1616,6 +1616,7 @@ static const struct drm_encoder_helper_funcs intel_sdvo_helper_funcs = { }; static const struct drm_connector_funcs intel_sdvo_connector_funcs = { + .dpms = drm_helper_connector_dpms, .save = intel_sdvo_save, .restore = intel_sdvo_restore, .detect = intel_sdvo_detect, diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c index d2c32983242..98ac0546b7b 100644 --- a/drivers/gpu/drm/i915/intel_tv.c +++ b/drivers/gpu/drm/i915/intel_tv.c @@ -1626,6 +1626,7 @@ static const struct drm_encoder_helper_funcs intel_tv_helper_funcs = { }; static const struct drm_connector_funcs intel_tv_connector_funcs = { + .dpms = drm_helper_connector_dpms, .save = intel_tv_save, .restore = intel_tv_restore, .detect = intel_tv_detect, diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 3c1924c010e..7300fb86676 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -471,6 +471,9 @@ struct drm_connector { u32 property_ids[DRM_CONNECTOR_MAX_PROPERTY]; uint64_t property_values[DRM_CONNECTOR_MAX_PROPERTY]; + /* requested DPMS state */ + int dpms; + void *helper_private; uint32_t encoder_ids[DRM_CONNECTOR_MAX_ENCODER]; diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h index ec073d8288d..6769ff6c1bc 100644 --- a/include/drm/drm_crtc_helper.h +++ b/include/drm/drm_crtc_helper.h @@ -99,6 +99,8 @@ extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, struct drm_framebuffer *old_fb); extern bool drm_helper_crtc_in_use(struct drm_crtc *crtc); +extern void drm_helper_connector_dpms(struct drm_connector *connector, int mode); + extern 
int drm_helper_mode_fill_fb_struct(struct drm_framebuffer *fb, struct drm_mode_fb_cmd *mode_cmd); From 93c05f222413e3a16e8785f252db4726693abd71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 4 Jun 2009 09:41:19 +1000 Subject: [PATCH 814/900] drm/i915: intel_lvds.c fix section mismatch intel_no_lvds[] does not require __initdata as it is used only by void intel_lvds_init(struct drm_device *dev). Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Dave Airlie --- drivers/gpu/drm/i915/intel_lvds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index c92a64ac854..53cccfa58b9 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -387,7 +387,7 @@ static int __init intel_no_lvds_dmi_callback(const struct dmi_system_id *id) } /* These systems claim to have LVDS, but really don't */ -static const struct dmi_system_id __initdata intel_no_lvds[] = { +static const struct dmi_system_id intel_no_lvds[] = { { .callback = intel_no_lvds_dmi_callback, .ident = "Apple Mac Mini (Core series)", From 6c51d1cfa0a370b48a157163340190cf5fd2346b Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Tue, 26 May 2009 10:35:52 +1000 Subject: [PATCH 815/900] drm: don't associate _DRM_DRIVER maps with a master A driver will use the _DRM_DRIVER map flag to indicate that it wants to be responsible for removing the map itself, bypassing the DRM's automagic cleanup code. Since the multi-master changes this has been broken, resulting in some drivers having their registers unmapped before it's finished with them. 
Signed-off-by: Ben Skeggs Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_bufs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c index 0411d912d82..80a257554b3 100644 --- a/drivers/gpu/drm/drm_bufs.c +++ b/drivers/gpu/drm/drm_bufs.c @@ -371,7 +371,8 @@ static int drm_addmap_core(struct drm_device * dev, resource_size_t offset, list->user_token = list->hash.key << PAGE_SHIFT; mutex_unlock(&dev->struct_mutex); - list->master = dev->primary->master; + if (!(map->flags & _DRM_DRIVER)) + list->master = dev->primary->master; *maplist = list; return 0; } From fc43896630a421321a19d7970bac27ac94e9d162 Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Thu, 4 Jun 2009 10:20:34 +1000 Subject: [PATCH 816/900] drm: ignore EDID with really tiny modes. Some EDIDs lie and report tiny modes that aren't possible. Ignore these modes. Signed-off-by: Adam Jackson Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_edid.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index ca9c6165671..6f6b26479d8 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -289,6 +289,11 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, struct drm_display_mode *mode; struct detailed_pixel_timing *pt = &timing->data.pixel_data; + /* ignore tiny modes */ + if (((pt->hactive_hi << 8) | pt->hactive_lo) < 64 || + ((pt->vactive_hi << 8) | pt->hactive_lo) < 64) + return NULL; + if (pt->stereo) { printk(KERN_WARNING "stereo mode not supported\n"); return NULL; From d81e77f041881d8d91c767b8c84f9068290b10c6 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 25 May 2009 16:50:10 +0000 Subject: [PATCH 817/900] [ARM] pxa/imote2: fix UCAM sensor board ADC model number Signed-off-by: Jonathan Cameron Signed-off-by: Eric Miao --- arch/arm/mach-pxa/imote2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/arch/arm/mach-pxa/imote2.c b/arch/arm/mach-pxa/imote2.c index 2121309b247..2b27336c29f 100644 --- a/arch/arm/mach-pxa/imote2.c +++ b/arch/arm/mach-pxa/imote2.c @@ -412,7 +412,7 @@ static struct platform_device imote2_flash_device = { */ static struct i2c_board_info __initdata imote2_i2c_board_info[] = { { /* UCAM sensor board */ - .type = "max1238", + .type = "max1239", .addr = 0x35, }, { /* ITS400 Sensor board only */ .type = "max1363", From 1257629b0712a0a68a24c532a05a4cd23e3f7565 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 26 May 2009 22:03:32 +0200 Subject: [PATCH 818/900] [ARM] pxa: fix pxa27x_udc default pullup GPIO Currently, pxa27x_udc tries to use GPIO 0 as D+ pullup if not explicitly configured. Default to an invalid GPIO (-1) instead. Signed-off-by: Philipp Zabel Acked-by: Robert Jarzmik Signed-off-by: Eric Miao --- arch/arm/mach-pxa/devices.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c index d245e59c51b..29970f703f3 100644 --- a/arch/arm/mach-pxa/devices.c +++ b/arch/arm/mach-pxa/devices.c @@ -72,7 +72,10 @@ void __init pxa_set_mci_info(struct pxamci_platform_data *info) } -static struct pxa2xx_udc_mach_info pxa_udc_info; +static struct pxa2xx_udc_mach_info pxa_udc_info = { + .gpio_pullup = -1, + .gpio_vbus = -1, +}; void __init pxa_set_udc_info(struct pxa2xx_udc_mach_info *info) { From f79b1b146b52765ee38bfb91bb14eb850fa98017 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 28 May 2009 00:25:05 +0800 Subject: [PATCH 819/900] PCI: use fixed-up device class when configuring device The device class may be changed after the fixup, so re-read the class value from pci_dev when configuring the device. Otherwise some devices such as JMicron SATA controller won't work. 
Reviewed-by: Matthew Wilcox Reviewed-by: Grant Grundler Tested-by: Marc Dionne Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index e3c3e081b83..f1ae2475fff 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -745,6 +745,8 @@ int pci_setup_device(struct pci_dev *dev) /* Early fixups, before probing the BARs */ pci_fixup_device(pci_fixup_early, dev); + /* device class may be changed after fixup */ + class = dev->class >> 8; switch (dev->hdr_type) { /* header type */ case PCI_HEADER_TYPE_NORMAL: /* standard header */ From 75e613cdc7bb2ba3795b1bc3ddf19476c767ba68 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 3 Jun 2009 00:13:13 -0700 Subject: [PATCH 820/900] x86/pci: fix mmconfig detection with 32bit near 4g Pascal reported and bisected a commit: | x86/PCI: don't call e820_all_mapped with -1 in the mmconfig case which broke one system. ACPI: Using IOAPIC for interrupt routing PCI: MCFG configuration 0: base f0000000 segment 0 buses 0 - 255 PCI: MCFG area at f0000000 reserved in ACPI motherboard resources PCI: Using MMCONFIG for extended config space it didn't have PCI: updated MCFG configuration 0: base f0000000 segment 0 buses 0 - 63 anymore, and tried to use 0xf0000000 - 0xffffffff for mmconfig For 32bit, mcfg_res->end could be 32bit only (if 64bit resources aren't used) So use end - 1 to pass the value in mcfg->end to avoid overflow. We don't need to worry about the e820 path, they are always 64 bit.
Reported-by: Pascal Terjan Bisected-by: Pascal Terjan Tested-by: Pascal Terjan Signed-off-by: Yinghai Lu Cc: stable@kernel.org Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 5fa10bb9604..8766b0e216c 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -375,7 +375,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res, if (!fixmem32) return AE_OK; if ((mcfg_res->start >= fixmem32->address) && - (mcfg_res->end <= (fixmem32->address + + (mcfg_res->end < (fixmem32->address + fixmem32->address_length))) { mcfg_res->flags = 1; return AE_CTRL_TERMINATE; @@ -392,7 +392,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res, return AE_OK; if ((mcfg_res->start >= address.minimum) && - (mcfg_res->end <= (address.minimum + address.address_length))) { + (mcfg_res->end < (address.minimum + address.address_length))) { mcfg_res->flags = 1; return AE_CTRL_TERMINATE; } @@ -418,7 +418,7 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) struct resource mcfg_res; mcfg_res.start = start; - mcfg_res.end = end; + mcfg_res.end = end - 1; mcfg_res.flags = 0; acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL); From 0e7ddf7eeeef5aea85412120539ab5369577faeb Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 4 Jun 2009 11:18:14 +0000 Subject: [PATCH 821/900] drm/i915: Remove a bad BUG_ON in the fence management code. This could be triggered by a gtt mapping fault on 965 that decides to remove the fence from another object that happens to be active currently. Since the other object doesn't rely on the fence reg for its execution, we don't wait for it to finish. We'll soon be not waiting on 915 most of the time as well, so just drop the BUG_ON. 
Signed-off-by: Eric Anholt --- drivers/gpu/drm/i915/i915_gem.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 670d1288146..39f5c658ef5 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2260,9 +2260,6 @@ try_again: goto try_again; } - BUG_ON(old_obj_priv->active || - (reg->obj->write_domain & I915_GEM_GPU_DOMAINS)); - /* * Zap this virtual mapping so we can set up a fence again * for this object next time we need it. From 2cc3c559fb2fe8cecca82a517bc56e88b0c1effd Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 4 Jun 2009 09:23:50 -0400 Subject: [PATCH 822/900] Btrfs: set device->total_disk_bytes when adding new device It was not being properly initialized, and so the size saved to disk was not correct. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5f01dad4b69..a6d35b0054c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1440,6 +1440,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->io_align = root->sectorsize; device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; From 2cb7878a3a4341d1faa208de962d66f0817d3e7a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Jun 2009 14:52:24 +0930 Subject: [PATCH 823/900] lguest: fix 'unhandled trap 13' with CONFIG_CC_STACKPROTECTOR We don't set up the canary; let's disable stack protector on boot.c so we can get into lguest_init, then set it up. As a side effect, switch_to_new_gdt() sets up %fs for us properly too. 
Signed-off-by: Rusty Russell Acked-by: Tejun Heo Signed-off-by: Linus Torvalds --- arch/x86/lguest/Makefile | 1 + arch/x86/lguest/boot.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile index 27f0c9ed7f6..94e0e54056a 100644 --- a/arch/x86/lguest/Makefile +++ b/arch/x86/lguest/Makefile @@ -1 +1,2 @@ obj-y := i386_head.o boot.o +CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ca7ec44bafc..33a93b41739 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -67,6 +67,7 @@ #include #include #include +#include #include /* for struct machine_ops */ /*G:010 Welcome to the Guest! @@ -1088,13 +1089,21 @@ __init void lguest_init(void) * lguest_init() where the rest of the fairly chaotic boot setup * occurs. */ + /* The stack protector is a weird thing where gcc places a canary + * value on the stack and then checks it on return. This file is + * compiled with -fno-stack-protector it, so we got this far without + * problems. The value of the canary is kept at offset 20 from the + * %gs register, so we need to set that up before calling C functions + * in other files. */ + setup_stack_canary_segment(0); + /* We could just call load_stack_canary_segment(), but we might as + * call switch_to_new_gdt() which loads the whole table and sets up + * the per-cpu segment descriptor register %fs as well. */ + switch_to_new_gdt(0); + /* As described in head_32.S, we map the first 128M of memory. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* Load the %fs segment register (the per-cpu segment register) with - * the normal data segment to get through booting. 
*/ - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); - /* The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: * it put the answer in lguest_data.reserve_mem */ From 44fb5511638938a2c37c895abc14df648ffc07e9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 4 Jun 2009 15:34:51 -0400 Subject: [PATCH 824/900] Btrfs: Fix oops and use after free during space balancing The btrfs allocator uses list_for_each to walk the available block groups when searching for free blocks. It starts off with a hint to help find the best block group for a given allocation. The hint is resolved into a block group, but we don't properly check to make sure the block group we find isn't in the middle of being freed due to filesystem shrinking or balancing. If it is being freed, the list pointers in it are bogus and can't be trusted. But, the code happily goes along and uses them in the list_for_each loop, leading to all kinds of fun. The fix used here is to check to make sure the block group we find really is on the list before we use it. list_del_init is used when removing it from the list, so we can do a proper check. The allocation clustering code has a similar bug where it will trust the block group in the current free space cluster. If our allocation flags have changed (going from single spindle dup to raid1 for example) because the drives in the FS have changed, we're not allowed to use the old block group any more. The fix used here is to check the current cluster against the current allocation flags. 
Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e2c7c738f2..35af9335506 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2622,7 +2622,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start); if (block_group && block_group_bits(block_group, data)) { down_read(&space_info->groups_sem); - goto have_block_group; + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else + goto have_block_group; } else if (block_group) { btrfs_put_block_group(block_group); } @@ -2656,6 +2667,13 @@ have_block_group: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); if (offset) { @@ -2681,10 +2699,17 @@ have_block_group: last_ptr_loop = 1; search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ goto have_block_group; } spin_unlock(&last_ptr->lock); - +refill_cluster: /* * this cluster didn't work out, free it and * start over @@ -5968,6 +5993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; struct btrfs_key key; int ret; @@ -5979,6 +6005,21 @@ int btrfs_remove_block_group(struct 
btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + path = btrfs_alloc_path(); BUG_ON(!path); @@ -5988,7 +6029,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); spin_lock(&block_group->space_info->lock); From 730c586ad5228c339949b2eb4e72b80ae167abc4 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 4 Jun 2009 15:20:39 -0700 Subject: [PATCH 825/900] drivers/char/mem.c: avoid OOM lockup during large reads from /dev/zero While running 20 parallel instances of dd as follows: #!/bin/bash for i in `seq 1 20`; do dd if=/dev/zero of=/export/hda3/dd_$i bs=1073741824 count=1 & done wait on a 16G machine, we noticed that rather than just killing the processes, the entire kernel went down. Stracing dd reveals that it first does an mmap2, which makes 1GB worth of zero page mappings. Then it performs a read on those pages from /dev/zero, and finally it performs a write. The machine died during the reads. 
Looking at the code, it was noticed that /dev/zero's read operation had been changed by 557ed1fa2620dc119adb86b34c614e152a629a80 ("remove ZERO_PAGE") from giving zero page mappings to actually zeroing the page. The zeroing of the pages causes physical pages to be allocated to the process. But, when the process exhausts all the memory that it can, the kernel cannot kill it, as it is still in the kernel mode allocating more memory. Consequently, the kernel eventually crashes. To fix this, I propose that when a fatal signal is pending during /dev/zero read operation, we simply return and let the user process die. Signed-off-by: Salman Qazi Cc: Nick Piggin Signed-off-by: Andrew Morton [ Modified error return and comment trivially. - Linus] Signed-off-by: Linus Torvalds --- drivers/char/mem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 8f05c38c2f0..65e12bca657 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -694,6 +694,9 @@ static ssize_t read_zero(struct file * file, char __user * buf, written += chunk - unwritten; if (unwritten) break; + /* Consider changing this to just 'signal_pending()' with lots of testing */ + if (fatal_signal_pending(current)) + return written ? written : -EINTR; buf += chunk; count -= chunk; cond_resched(); From 087eb437051b3de817720f9c80c440fc9e7dcce8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2009 16:29:07 -0700 Subject: [PATCH 826/900] ptrace: tracehook_report_clone: fix false positives The "trace || CLONE_PTRACE" check in tracehook_report_clone() is not right, - If the untraced task does clone(CLONE_PTRACE) the new child is not traced, we must not queue SIGSTOP. - If we forked the traced task, but the tracer exits and untraces both the forking task and the new child (after copy_process() drops tasklist_lock), we should not queue SIGSTOP too. Change the code to check task_ptrace() != 0 instead. This is still racy, but the race is harmless. 
We can race with another tracer attaching to this child, or the tracer can exit and detach in parallel. But given that we didn't do wake_up_new_task() yet, the child must have the pending SIGSTOP anyway. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Christoph Hellwig Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 11 +++++------ kernel/fork.c | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index c7aa154f4bf..eb96603d92d 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -259,14 +259,12 @@ static inline void tracehook_finish_clone(struct task_struct *child, /** * tracehook_report_clone - in parent, new child is about to start running - * @trace: return value from tracehook_prepare_clone() * @regs: parent's user register state * @clone_flags: flags from parent's system call * @pid: new child's PID in the parent's namespace * @child: new child task * - * Called after a child is set up, but before it has been started - * running. @trace is the value returned by tracehook_prepare_clone(). + * Called after a child is set up, but before it has been started running. * This is not a good place to block, because the child has not started * yet. Suspend the child here if desired, and then block in * tracehook_report_clone_complete(). This must prevent the child from @@ -276,13 +274,14 @@ static inline void tracehook_finish_clone(struct task_struct *child, * * Called with no locks held, but the child cannot run until this returns. */ -static inline void tracehook_report_clone(int trace, struct pt_regs *regs, +static inline void tracehook_report_clone(struct pt_regs *regs, unsigned long clone_flags, pid_t pid, struct task_struct *child) { - if (unlikely(trace) || unlikely(clone_flags & CLONE_PTRACE)) { + if (unlikely(task_ptrace(child))) { /* - * The child starts up with an immediate SIGSTOP.
+ * It doesn't matter who attached/attaching to this + * task, the pending SIGSTOP is right in any case. */ sigaddset(&child->pending.signal, SIGSTOP); set_tsk_thread_flag(child, TIF_SIGPENDING); diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..875ffbdd96d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,7 +1409,7 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(trace, regs, clone_flags, nr, p); + tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to From 08f67461c609ad96bf26732b590569e02e322019 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 4 Jun 2009 16:29:08 -0700 Subject: [PATCH 827/900] kbuild: fix detection of CONFIG_FRAME_WARN=0 The checking of CONFIG_FRAME_WARN in the top level Makefile forgot to actually dereference the variable thus leading to an always true check. Signed-off-by: Mike Frysinger Cc: Andi Kleen Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 610d1c332c4..10651549208 100644 --- a/Makefile +++ b/Makefile @@ -533,7 +533,7 @@ endif include $(srctree)/arch/$(SRCARCH)/Makefile -ifneq (CONFIG_FRAME_WARN,0) +ifneq ($(CONFIG_FRAME_WARN),0) KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) endif From edaba2c5334492f82d39ec35637c6dea5176a977 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2009 16:29:09 -0700 Subject: [PATCH 828/900] ptrace: revert "ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic" Commit 95a3540da9c81a5987be810e1d9a83640a366bd5 ("ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic") removed the "extra" wake_up_process() from ptrace_detach(), but as Jan pointed out this breaks the compatibility.
I believe the changelog is right and this wake_up() is wrong in many ways, but GDB assumes that ptrace(PTRACE_DETACH, child, 0, 0) always wakes up the tracee. Despite the fact this breaks SIGNAL_STOP_STOPPED/group_stop_count logic, and despite the fact this wake_up_process() can break another assumption: PTRACE_DETACH with SIGSTOP should leave the tracee in TASK_STOPPED case. Because the untraced child can dequeue SIGSTOP and call do_signal_stop() before ptrace_detach() calls wake_up_process(). Revert this change for now. We need some fixes even if we want to keep the current behaviour, but these fixes are not for 2.6.30. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Jan Kratochvil Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..42c317874cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -304,6 +304,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); + if (!child->exit_state) + wake_up_process(child); } write_unlock_irq(&tasklist_lock); From e9e10124e269a39de089c5e0d9936fae2ff889b2 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 5 Jun 2009 11:56:18 +0100 Subject: [PATCH 829/900] ivtv: Fix PCI DMA direction The ivtv stream buffers may be for receive or for send but the attached sg handle is always destined cpu->device. We flush it correctly but the allocation is wrongly done with the same type as the buffers.
See bug: http://bugzilla.kernel.org/show_bug.cgi?id=13385 (Note this doesn't close the bug - it fixes the ivtv part and in turn the logging next shows up some rather alarming DMA sg list warnings in libata) Signed-off-by: Alan Cox Acked-by: Hans Verkuil Signed-off-by: Linus Torvalds --- drivers/media/video/ivtv/ivtv-queue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/video/ivtv/ivtv-queue.c b/drivers/media/video/ivtv/ivtv-queue.c index ff7b7deded4..7fde36e6d22 100644 --- a/drivers/media/video/ivtv/ivtv-queue.c +++ b/drivers/media/video/ivtv/ivtv-queue.c @@ -230,7 +230,8 @@ int ivtv_stream_alloc(struct ivtv_stream *s) return -ENOMEM; } if (ivtv_might_use_dma(s)) { - s->sg_handle = pci_map_single(itv->pdev, s->sg_dma, sizeof(struct ivtv_sg_element), s->dma); + s->sg_handle = pci_map_single(itv->pdev, s->sg_dma, + sizeof(struct ivtv_sg_element), PCI_DMA_TODEVICE); ivtv_stream_sync_for_cpu(s); } From 27704a16c9e0fb4c6b04344c7c4c40ac16148ec0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jun 2009 10:21:52 -0700 Subject: [PATCH 830/900] Revert "drm: don't associate _DRM_DRIVER maps with a master" This reverts commit 6c51d1cfa0a370b48a157163340190cf5fd2346b, which apparently causes DRI initialization failures on Radeons. 
Reported-by: Markus Trippelsdorf Requested-by: Dave Airlie Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_bufs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c index 80a257554b3..0411d912d82 100644 --- a/drivers/gpu/drm/drm_bufs.c +++ b/drivers/gpu/drm/drm_bufs.c @@ -371,8 +371,7 @@ static int drm_addmap_core(struct drm_device * dev, resource_size_t offset, list->user_token = list->hash.key << PAGE_SHIFT; mutex_unlock(&dev->struct_mutex); - if (!(map->flags & _DRM_DRIVER)) - list->master = dev->primary->master; + list->master = dev->primary->master; *maplist = list; return 0; } From 2c701b10283b58937201004276319ef9d9051b5d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 5 Jun 2009 12:37:07 -0400 Subject: [PATCH 831/900] [CPUFREQ] powernow-k8: check space_id of _PCT registers to be FFH The powernow-k8 driver checks to see that the Performance Control/Status Registers are declared as FFH (functional fixed hardware) by the BIOS. However, this check got broken in the commit: 0e64a0c982c06a6b8f5e2a7f29eb108fdf257b2f [CPUFREQ] checkpatch cleanups for powernow-k8 Fix based on an original patch from Naga Chumbalkar. 
Signed-off-by: Naga Chumbalkar Cc: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f6b32d11235..35dc8fbe92b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -835,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer space_id; + acpi_integer control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -848,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } - space_id = data->acpi_data.control_register.space_id; - if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + control = data->acpi_data.control_register.space_id; + status = data->acpi_data.status_register.space_id; + + if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data.control_register.space_id, - space_id); + control, status); goto err_out; } From 45bc955bb1324a46c9539550cc615994e6d0a43d Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Fri, 5 Jun 2009 10:41:39 -0400 Subject: [PATCH 832/900] pata_legacy: wait for async probing The basic problem here that pata_legacy attaches the host, sees if it found any devices and detaches it if none were found. 
With async probing, it's not waiting until discovery is finished before deciding it has no devices and trying the detach leading to this warning: ata1: PATA max PIO4 cmd 0x1f0 ctl 0x3f6 irq 14 ------------[ cut here ]------------ WARNING: at drivers/ata/libata-core.c:6222 ata_host_detach+0x75/0x90() Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.30-rc7 #1 Call Trace: [] ? ata_host_detach+0x75/0x90 [] ? ata_host_detach+0x75/0x90 [] ? warn_slowpath_common+0x45/0x80 [] ? warn_slowpath_null+0xa/0x10 [] ? ata_host_detach+0x75/0x90 [] ? legacy_init+0x44e/0x87f [] ? legacy_init+0x0/0x87f [] ? _stext+0x21/0x140 [] ? proc_register+0x2f/0x190 [] ? create_proc_entry+0x5c/0xc0 [] ? register_irq_proc+0x6e/0x90 [] ? kernel_init+0x6e/0xbf [] ? kernel_init+0x0/0xbf [] ? kernel_thread_helper+0x7/0x10 ---[ end trace ef1ee36e873ae3a0 ]--- Because it detaches before the probe is complete. One way to fix it would be to put an async_synchronize_full() before looking for devices, which this patch does. A better way might be to separate libata into its own domain and only wait for that. 
Reported-by: Mikael Pettersson Signed-off-by: James Bottomley Signed-off-by: Jeff Garzik --- drivers/ata/pata_legacy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c index f72c6c5b820..6932e56d179 100644 --- a/drivers/ata/pata_legacy.c +++ b/drivers/ata/pata_legacy.c @@ -48,6 +48,7 @@ * */ +#include #include #include #include @@ -1028,6 +1029,7 @@ static __init int legacy_init_one(struct legacy_probe *probe) &legacy_sht); if (ret) goto fail; + async_synchronize_full(); ld->platform_dev = pdev; /* Nothing found means we drop the port as its probably not there */ From 5f33b3bcd7aac66a51e6bfaf35e8cff4eabafb06 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 1 Jun 2009 22:42:10 +0300 Subject: [PATCH 833/900] pata_efar: fix PIO2 underclocking Fix the PIO mode 2 using mode 0 timings -- this driver should enable the fast timing bank starting with PIO2, just like the PIIX/ICH drivers do. Also, fix/rephrase some comments while at it. 
Signed-off-by: Sergei Shtylyov Signed-off-by: Jeff Garzik --- drivers/ata/pata_efar.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/ata/pata_efar.c b/drivers/ata/pata_efar.c index 2085e0a3a05..2a6412f5d11 100644 --- a/drivers/ata/pata_efar.c +++ b/drivers/ata/pata_efar.c @@ -22,7 +22,7 @@ #include #define DRV_NAME "pata_efar" -#define DRV_VERSION "0.4.4" +#define DRV_VERSION "0.4.5" /** * efar_pre_reset - Enable bits @@ -98,18 +98,17 @@ static void efar_set_piomode (struct ata_port *ap, struct ata_device *adev) { 2, 1 }, { 2, 3 }, }; - if (pio > 2) - control |= 1; /* TIME1 enable */ + if (pio > 1) + control |= 1; /* TIME */ if (ata_pio_need_iordy(adev)) /* PIO 3/4 require IORDY */ - control |= 2; /* IE enable */ - /* Intel specifies that the PPE functionality is for disk only */ + control |= 2; /* IE */ + /* Intel specifies that the prefetch/posting is for disk only */ if (adev->class == ATA_DEV_ATA) - control |= 4; /* PPE enable */ + control |= 4; /* PPE */ pci_read_config_word(dev, idetm_port, &idetm_data); - /* Enable PPE, IE and TIME as appropriate */ - + /* Set PPE, IE, and TIME as appropriate */ if (adev->devno == 0) { idetm_data &= 0xCCF0; idetm_data |= control; @@ -129,7 +128,7 @@ static void efar_set_piomode (struct ata_port *ap, struct ata_device *adev) pci_write_config_byte(dev, 0x44, slave_data); } - idetm_data |= 0x4000; /* Ensure SITRE is enabled */ + idetm_data |= 0x4000; /* Ensure SITRE is set */ pci_write_config_word(dev, idetm_port, idetm_data); } From 9b10ae86d1616f46dabb67c663fe6a9c3a502663 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 30 May 2009 20:50:12 +0900 Subject: [PATCH 834/900] ahci: add warning messages for hp laptops with broken suspend Harddisks on HP dv[4-6] and HDX18 fail to come online after resume on earlier BIOSen. Fortunately, HP recently released BIOS updates for all machines to fix the issue. 
Detect old BIOSen, warn the user to update BIOS on boot and suspend attempts and fail suspend. Kudos to all the bug reporters. Signed-off-by: Tejun Heo Cc: kernel.org@epperson.homelinux.net Cc: emisca@gmail.com Cc: Gadi Cohen Cc: Paul Swanson Cc: s@ourada.org Cc: Trevor Davenport Cc: corruptor1972 Cc: Victoria Wilson Cc: khiraly Cc: Sean Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 08186ecbaf8..6b91c26a463 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -220,6 +220,7 @@ enum { AHCI_HFLAG_NO_HOTPLUG = (1 << 7), /* ignore PxSERR.DIAG.N */ AHCI_HFLAG_SECT255 = (1 << 8), /* max 255 sectors */ AHCI_HFLAG_YES_NCQ = (1 << 9), /* force NCQ cap on */ + AHCI_HFLAG_NO_SUSPEND = (1 << 10), /* don't suspend */ /* ap->flags bits */ @@ -2316,9 +2317,17 @@ static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg) static int ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg) { struct ata_host *host = dev_get_drvdata(&pdev->dev); + struct ahci_host_priv *hpriv = host->private_data; void __iomem *mmio = host->iomap[AHCI_PCI_BAR]; u32 ctl; + if (mesg.event & PM_EVENT_SUSPEND && + hpriv->flags & AHCI_HFLAG_NO_SUSPEND) { + dev_printk(KERN_ERR, &pdev->dev, + "BIOS update required for suspend/resume\n"); + return -EIO; + } + if (mesg.event & PM_EVENT_SLEEP) { /* AHCI spec rev1.1 section 8.3.3: * Software must disable interrupts prior to requesting a @@ -2610,6 +2619,63 @@ static bool ahci_broken_system_poweroff(struct pci_dev *pdev) return false; } +static bool ahci_broken_suspend(struct pci_dev *pdev) +{ + static const struct dmi_system_id sysids[] = { + /* + * On HP dv[4-6] and HDX18 with earlier BIOSen, link + * to the harddisk doesn't become online after + * resuming from STR. Warn and fail suspend. 
+ */ + { + .ident = "dv4", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, + "HP Pavilion dv4 Notebook PC"), + }, + .driver_data = "F.30", /* cutoff BIOS version */ + }, + { + .ident = "dv5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, + "HP Pavilion dv5 Notebook PC"), + }, + .driver_data = "F.16", /* cutoff BIOS version */ + }, + { + .ident = "dv6", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, + "HP Pavilion dv6 Notebook PC"), + }, + .driver_data = "F.21", /* cutoff BIOS version */ + }, + { + .ident = "HDX18", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, + "HP HDX18 Notebook PC"), + }, + .driver_data = "F.23", /* cutoff BIOS version */ + }, + { } /* terminate list */ + }; + const struct dmi_system_id *dmi = dmi_first_match(sysids); + const char *ver; + + if (!dmi || pdev->bus->number || pdev->devfn != PCI_DEVFN(0x1f, 2)) + return false; + + ver = dmi_get_system_info(DMI_BIOS_VERSION); + + return !ver || strcmp(ver, dmi->driver_data) < 0; +} + static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { static int printed_version; @@ -2715,6 +2781,12 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "quirky BIOS, skipping spindown on poweroff\n"); } + if (ahci_broken_suspend(pdev)) { + hpriv->flags |= AHCI_HFLAG_NO_SUSPEND; + dev_printk(KERN_WARNING, &pdev->dev, + "BIOS update required for suspend/resume\n"); + } + /* CAP.NP sometimes indicate the index of the last enabled * port, at other times, that of the last possible port, so * determining the maximum port number requires looking at From 65e31643115349fd7a81acbe75ec4a54d5df8aad Mon Sep 17 00:00:00 2001 From: Ville Syrjala Date: Tue, 19 May 2009 01:37:44 +0300 Subject: [PATCH 835/900] ata_piix: Add HP Compaq nc6000 to the broken poweroff list HP Compaq nc6000 suffers from 
the double disk spindown issue. Add it to the broken poweroff DMI list. Signed-off-by: Ville Syrjala Signed-off-by: Jeff Garzik --- drivers/ata/ata_piix.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index d51a17c0f59..1aeb7082b0c 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -1455,6 +1455,15 @@ static bool piix_broken_system_poweroff(struct pci_dev *pdev) /* PCI slot number of the controller */ .driver_data = (void *)0x1FUL, }, + { + .ident = "HP Compaq nc6000", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nc6000"), + }, + /* PCI slot number of the controller */ + .driver_data = (void *)0x1FUL, + }, { } /* terminate list */ }; From cd24f8c1e7e27a2c6051a9a338d4704a2431dbf0 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Fri, 5 Jun 2009 18:48:08 +0100 Subject: [PATCH 836/900] mtd: davinci nand: update clock naming DaVinci clock support has been updated in mainline. Update clock names accordingly. 
Signed-off-by: Kevin Hilman Acked-by: David Brownell Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse Signed-off-by: Linus Torvalds --- drivers/mtd/nand/davinci_nand.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c index 0119220de7d..02700f769b8 100644 --- a/drivers/mtd/nand/davinci_nand.c +++ b/drivers/mtd/nand/davinci_nand.c @@ -407,16 +407,17 @@ static int __init nand_davinci_probe(struct platform_device *pdev) } info->chip.ecc.mode = ecc_mode; - info->clk = clk_get(&pdev->dev, "AEMIFCLK"); + info->clk = clk_get(&pdev->dev, "aemif"); if (IS_ERR(info->clk)) { ret = PTR_ERR(info->clk); - dev_dbg(&pdev->dev, "unable to get AEMIFCLK, err %d\n", ret); + dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret); goto err_clk; } ret = clk_enable(info->clk); if (ret < 0) { - dev_dbg(&pdev->dev, "unable to enable AEMIFCLK, err %d\n", ret); + dev_dbg(&pdev->dev, "unable to enable AEMIF clock, err %d\n", + ret); goto err_clk_enable; } From a3cb900cc408977a11519bc7c760f3e499079589 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 13 May 2009 15:02:27 +0100 Subject: [PATCH 837/900] [libata] pata_ali: Use IGN_SIMPLEX Some ALi devices report simplex if they have been disabled and re-enabled, and restoring the byte does not work. Ignore it - the needed supporting logic is already present for the SATA ULi ports. 
Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/pata_ali.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c index 751b7ea4816..fc9c5d6d7d8 100644 --- a/drivers/ata/pata_ali.c +++ b/drivers/ata/pata_ali.c @@ -497,14 +497,16 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id) }; /* Revision 0x20 added DMA */ static const struct ata_port_info info_20 = { - .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48, + .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 | + ATA_FLAG_IGN_SIMPLEX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .port_ops = &ali_20_port_ops }; /* Revision 0x20 with support logic added UDMA */ static const struct ata_port_info info_20_udma = { - .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48, + .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 | + ATA_FLAG_IGN_SIMPLEX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA2, @@ -512,7 +514,8 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id) }; /* Revision 0xC2 adds UDMA66 */ static const struct ata_port_info info_c2 = { - .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48, + .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 | + ATA_FLAG_IGN_SIMPLEX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA4, @@ -520,7 +523,8 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id) }; /* Revision 0xC3 is UDMA66 for now */ static const struct ata_port_info info_c3 = { - .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48, + .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 | + ATA_FLAG_IGN_SIMPLEX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA4, @@ -528,7 +532,8 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id) }; /* Revision 0xC4 is UDMA100 */ static const struct ata_port_info info_c4 = { - .flags = ATA_FLAG_SLAVE_POSS | 
ATA_FLAG_PIO_LBA48, + .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 | + ATA_FLAG_IGN_SIMPLEX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, @@ -536,7 +541,7 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id) }; /* Revision 0xC5 is UDMA133 with LBA48 DMA */ static const struct ata_port_info info_c5 = { - .flags = ATA_FLAG_SLAVE_POSS, + .flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_IGN_SIMPLEX, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, From fe2245c905631a3a353504fc04388ce3dfaf9d9e Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 5 Jul 2009 15:50:52 -0500 Subject: [PATCH 838/900] x86: enable GART-IOMMU only after setting up protection methods The current code to set up the GART as an IOMMU enables GART translations before it removes the aperture from the kernel memory map, sets the GART PTEs to UC, sets up the guard and scratch pages, or does a wbinvd(). This leaves the possibility of cache aliasing open and can cause system crashes. Re-order the code so as to enable the GART translations only after all safeguards are in place and the TLB has been flushed. AMD has tested this patch on both Istanbul systems and 1st generation Opteron systems with AGP enabled and seen no adverse effects. Istanbul systems with HT Assist enabled sometimes see MCE errors due to cache artifacts with the unmodified code.
Signed-off-by: Mark Langsdorf Cc: Cc: Joerg Roedel Cc: akpm@linux-foundation.org Cc: jbarnes@virtuousgeek.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 1e8920d98f7..cfd9f906389 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -658,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - enable_gart_translations(); - error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); @@ -816,6 +814,14 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); /* * Try to workaround a bug (thanks to BenH): From 460bcf57b128ce1c0dd553d905fedc097f9955c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 07:37:56 -0400 Subject: [PATCH 839/900] Fix nobh_truncate_page() to not pass stack garbage to get_block() The nobh_truncate_page() function is used by ext2, exofs, and jfs. Of these three, only ext2 and jfs's get_block() function pays attention to bh->b_size --- which is normally always the filesystem blocksize except when the get_block() function is called by either mpage_readpage(), mpage_readpages(), or the direct I/O routines in fs/direct_io.c. Unfortunately, nobh_truncate_page() does not initialize map_bh before calling the filesystem-supplied get_block() function. So ext2 and jfs will try to calculate the number of blocks to map by taking stack garbage and shifting it left by inode->i_blkbits. 
This should be *mostly* harmless (except the filesystem will do some unneeded work) unless the stack garbage is less than filesystem's blocksize, in which case maxblocks will be zero, and the attempt to find out whether or not the filesystem has a hole at a given logical block will fail, and the page cache entry might not get zeroed out. Also if the stack garbage in map_bh.b_state happens to have the BH_Mapped bit set, there could be an attempt to call readpage() on a non-existent page, which could cause nobh_truncate_page() to return an error when it should not. Fix this by initializing map_bh.b_state and map_bh.b_size. Fortunately, it's probably fairly unlikely that ext2 and jfs users mount with nobh these days. Signed-off-by: "Theodore Ts'o" Cc: Dave Kleikamp Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..49106127a4a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2736,6 +2736,8 @@ has_buffers: pos += blocksize; } + map_bh.b_size = blocksize; + map_bh.b_state = 0; err = get_block(inode, iblock, &map_bh, 0); if (err) goto unlock; From 72a43d63cb51057393edfbcfc4596066205ad15d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 13 May 2009 19:13:40 +0100 Subject: [PATCH 840/900] ext3/4 with synchronous writes gets wedged by Postfix OK, that's probably the easiest way to do that, as much as I don't like it... Since iget() et al. will not accept I_FREEING (will wait to go away and restart), and since we'd better have serialization between new/free on fs data structures anyway, we can afford simply skipping I_FREEING et al. in insert_inode_locked(). We do that from new_inode, so it won't race with free_inode in any interesting ways and it won't race with iget (of any origin; nfsd or in case of fs corruption a lookup) since both still will wait for I_LOCK.
Reviewed-by: "Theodore Ts'o" Acked-by: Jan Kara Tested-by: David Watson Signed-off-by: Al Viro --- fs/inode.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 0571983755d..a4876e56195 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1053,13 +1053,22 @@ int insert_inode_locked(struct inode *inode) struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - struct inode *old; inode->i_state |= I_LOCK|I_NEW; while (1) { + struct hlist_node *node; + struct inode *old = NULL; spin_lock(&inode_lock); - old = find_inode_fast(sb, head, ino); - if (likely(!old)) { + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + continue; + break; + } + if (likely(!node)) { hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); return 0; @@ -1081,14 +1090,24 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - struct inode *old; inode->i_state |= I_LOCK|I_NEW; while (1) { + struct hlist_node *node; + struct inode *old = NULL; + spin_lock(&inode_lock); - old = find_inode(sb, head, test, data); - if (likely(!old)) { + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_sb != sb) + continue; + if (!test(old, data)) + continue; + if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + continue; + break; + } + if (likely(!node)) { hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); return 0; From f07502dae230a2c3b65381fd1b06e8a18b2c7525 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 6 Jun 2009 21:18:09 +0100 Subject: [PATCH 841/900] integrity: fix IMA inode leak CONFIG_IMA=y inode activity leaks iint_cache and radix_tree_node objects until the system runs out of memory. 
Nowhere is calling ima_inode_free() a.k.a. ima_iint_delete(). Fix that by calling it from destroy_inode(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/inode.c b/fs/inode.c index a4876e56195..bca0c618fdb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -219,6 +219,7 @@ static struct inode *alloc_inode(struct super_block *sb) void destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + ima_inode_free(inode); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); From 7caf6a49bb17d0377210693af5737563b31aa5ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 5 Jun 2009 12:01:35 +0200 Subject: [PATCH 842/900] dma-debug: change hash_bucket_find from first-fit to best-fit Some device drivers map the same physical address multiple times to a dma address. Without an IOMMU this results in the same dma address being put into the dma-debug hash multiple times. With a first-fit match in hash_bucket_find() this function may return the wrong dma_debug_entry. This can result in false positive warnings. This patch fixes it by changing the first-fit behavior of hash_bucket_find() into a best-fit algorithm. 
Reported-by: Torsten Kaiser Reported-by: FUJITA Tomonori Signed-off-by: Joerg Roedel Cc: lethal@linux-sh.org Cc: just.for.lkml@googlemail.com Cc: hancockrwd@gmail.com Cc: jens.axboe@oracle.com Cc: bharrosh@panasas.com Cc: FUJITA Tomonori Cc: Linus Torvalds Cc: LKML-Reference: <20090605104132.GE24836@amd.com> Signed-off-by: Ingo Molnar --- lib/dma-debug.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index cdd205d6bf7..8fcc09c91e1 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -186,15 +186,50 @@ static void put_hash_bucket(struct hash_bucket *bucket, static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket, struct dma_debug_entry *ref) { - struct dma_debug_entry *entry; + struct dma_debug_entry *entry, *ret = NULL; + int matches = 0, match_lvl, last_lvl = 0; list_for_each_entry(entry, &bucket->list, list) { - if ((entry->dev_addr == ref->dev_addr) && - (entry->dev == ref->dev)) + if ((entry->dev_addr != ref->dev_addr) || + (entry->dev != ref->dev)) + continue; + + /* + * Some drivers map the same physical address multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therfore we implement a + * best-fit algorithm here which returns the entry from + * the hash which fits best to the reference value + * instead of the first-fit. + */ + matches += 1; + match_lvl = 0; + entry->size == ref->size ? ++match_lvl : match_lvl; + entry->type == ref->type ? ++match_lvl : match_lvl; + entry->direction == ref->direction ? 
++match_lvl : match_lvl; + + if (match_lvl == 3) { + /* perfect-fit - return the result */ return entry; + } else if (match_lvl > last_lvl) { + /* + * We found an entry that fits better then the + * previous one + */ + last_lvl = match_lvl; + ret = entry; + } } - return NULL; + /* + * If we have multiple matches but no perfect-fit, just return + * NULL. + */ + ret = (matches == 1) ? ret : NULL; + + return ret; } /* From 5095f59bda6793a7b8f0856096d6893fe98e0e51 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 5 Jun 2009 23:27:17 +0530 Subject: [PATCH 843/900] x86: cpu_debug: Remove model information to reduce encoding-decoding Remove model information, encoding/decoding and reduce bookkeeping. This, besides removing a lot of code and cleaning up the code, also enables these features on many more CPUs that were enumerated before. Reported-by: Ingo Molnar Signed-off-by: Jaswinder Singh Rajput Cc: Alan Cox LKML-Reference: <1244224637.8212.6.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu_debug.h | 101 +------- arch/x86/kernel/cpu/cpu_debug.c | 401 +++++++------------------------ 2 files changed, 89 insertions(+), 413 deletions(-) diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h index 222802029fa..d96c1ee3a95 100644 --- a/arch/x86/include/asm/cpu_debug.h +++ b/arch/x86/include/asm/cpu_debug.h @@ -86,105 +86,7 @@ enum cpu_file_bit { CPU_VALUE_BIT, /* value */ }; -#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) - -/* - * DisplayFamily_DisplayModel Processor Families/Processor Number Series - * -------------------------- ------------------------------------------ - * 05_01, 05_02, 05_04 Pentium, Pentium with MMX - * - * 06_01 Pentium Pro - * 06_03, 06_05 Pentium II Xeon, Pentium II - * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III - * - * 06_09, 060D Pentium M - * - * 06_0E Core Duo, Core Solo - * - * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series, - * Core 2 Quad, Core 2 Extreme, Core 2 Duo, - 
* Pentium dual-core - * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650 - * - * 06_1C Atom - * - * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4 - * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D - * - * 0F_06 Xeon 7100, 5000 Series, Xeon MP, - * Pentium 4, Pentium D - */ - -/* Register processors bits */ -enum cpu_processor_bit { - CPU_NONE, -/* Intel */ - CPU_INTEL_PENTIUM_BIT, - CPU_INTEL_P6_BIT, - CPU_INTEL_PENTIUM_M_BIT, - CPU_INTEL_CORE_BIT, - CPU_INTEL_CORE2_BIT, - CPU_INTEL_ATOM_BIT, - CPU_INTEL_XEON_P4_BIT, - CPU_INTEL_XEON_MP_BIT, -/* AMD */ - CPU_AMD_K6_BIT, - CPU_AMD_K7_BIT, - CPU_AMD_K8_BIT, - CPU_AMD_0F_BIT, - CPU_AMD_10_BIT, - CPU_AMD_11_BIT, -}; - -#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT) -#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT) -#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT) -#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT) -#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT) -#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT) -#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT) -#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT) - -#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M) -#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2) -#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP) -#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM) -#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM) -#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM) -#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON) -#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON) -#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT) -#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON) -#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON) -#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT) -#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX) -#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE) -#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE) -#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT) -#define 
CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE) -#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT) -#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE) - -/* Select all supported Intel CPUs */ -#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE) - -#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT) -#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT) -#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT) -#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT) -#define CPU_AMD_10 (1 << CPU_AMD_10_BIT) -#define CPU_AMD_11 (1 << CPU_AMD_11_BIT) - -#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11) -#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS) -#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS) -#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS) - -/* Select all supported AMD CPUs */ -#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS) - -/* Select all supported CPUs */ -#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL) +#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) #define MAX_CPU_FILES 512 @@ -220,7 +122,6 @@ struct cpu_debug_range { unsigned min; /* Register range min */ unsigned max; /* Register range max */ unsigned flag; /* Supported flags */ - unsigned model; /* Supported models */ }; #endif /* _ASM_X86_CPU_DEBUG_H */ diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..86afe13fc31 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@ static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag); static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model); static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { { "value", CPU_REG_ALL, 1 }, }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, - { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, - 
{ 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, - { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, - { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { + { 0x00000000, 0x00000001, CPU_MC, }, + { 0x00000006, 0x00000007, CPU_MONITOR, }, + { 0x00000010, 0x00000010, CPU_TIME, }, + { 0x00000011, 0x00000013, CPU_PMC, }, + { 0x00000017, 0x00000017, CPU_PLATFORM, }, + { 0x0000001B, 0x0000001B, CPU_APIC, }, + { 0x0000002A, 0x0000002B, CPU_POWERON, }, + { 0x0000002C, 0x0000002C, CPU_FREQ, }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, }, + { 0x00000040, 0x00000047, CPU_LBRANCH, }, + { 0x00000060, 0x00000067, CPU_LBRANCH, }, + { 0x00000079, 0x00000079, CPU_BIOS, }, + { 0x00000088, 0x0000008A, CPU_CACHE, }, + { 0x0000008B, 0x0000008B, CPU_BIOS, }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, }, + { 0x000000C1, 0x000000C4, CPU_PMC, }, + { 0x000000CD, 0x000000CD, CPU_FREQ, }, + { 0x000000E7, 0x000000E8, CPU_PERF, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, - { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, - { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, + { 0x00000116, 0x0000011E, CPU_CACHE, }, + { 0x00000174, 0x00000176, CPU_SYSENTER, }, + { 0x00000179, 0x0000017B, CPU_MC, }, + { 0x00000186, 0x00000189, CPU_PMC, }, + { 0x00000198, 0x00000199, CPU_PERF, }, + { 0x0000019A, 0x0000019A, CPU_TIME, }, + { 0x0000019B, 0x0000019D, CPU_THERM, }, + { 0x000001A0, 0x000001A0, CPU_MISC, }, + { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, }, + { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, - { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, - { 0x00000060, 0x00000063, CPU_LBRANCH, 
CPU_C2_AT }, - { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, + { 0x00000200, 0x0000020F, CPU_MTRR, }, + { 0x00000250, 0x00000250, CPU_MTRR, }, + { 0x00000258, 0x00000259, CPU_MTRR, }, + { 0x00000268, 0x0000026F, CPU_MTRR, }, + { 0x00000277, 0x00000277, CPU_PAT, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, }, - { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, + { 0x00000300, 0x00000311, CPU_PMC, }, + { 0x00000345, 0x00000345, CPU_PMC, }, + { 0x00000360, 0x00000371, CPU_PMC, }, + { 0x0000038D, 0x00000390, CPU_PMC, }, + { 0x000003A0, 0x000003BE, CPU_PMC, }, + { 0x000003C0, 0x000003CD, CPU_PMC, }, + { 0x000003E0, 0x000003E1, CPU_PMC, }, + { 0x000003F0, 0x000003F2, CPU_PMC, }, - { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, - { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, - { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000400, 0x00000417, CPU_MC, }, + { 0x00000480, 0x0000048B, CPU_VMX, }, - { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, - { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, + { 0x00000600, 0x00000600, CPU_DEBUG, }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, - { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, - { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, - { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, - { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, - { 0x000001A0, 0x000001A0, CPU_MISC, 
CPU_PM_CX_AT_XE }, + { 0x000107CC, 0x000107D3, CPU_PMC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, - { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, - { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, - { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, - { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, + { 0xC0000080, 0xC0000080, CPU_FEATURES, }, + { 0xC0000081, 0xC0000084, CPU_CALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, }, + { 0xC0000103, 0xC0000103, CPU_TIME, }, - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, - { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, - { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, - { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, - { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, - { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, - - { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, - { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, - { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, - { 
0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, - { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, - { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, - { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, - - { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, - - { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, - { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, + { 0xC0010000, 0xC0010007, CPU_PMC, }, + { 0xC0010010, 0xC0010010, CPU_CONF, }, + { 0xC0010015, 0xC0010015, CPU_CONF, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, }, + { 0xC001001F, 0xC001001F, CPU_CONF, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, }, + { 0xC0010044, 0xC0010048, CPU_MC, }, + { 0xC0010050, 0xC0010056, CPU_SMM, }, + { 0xC0010058, 0xC0010058, CPU_CONF, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, }, + { 0xC0010061, 0xC0010068, CPU_SMM, }, + { 0xC0010069, 0xC001006B, CPU_SMM, }, + { 0xC0010070, 0xC0010071, CPU_SMM, }, + { 0xC0010111, 0xC0010113, CPU_SMM, }, + { 0xC0010114, 0xC0010118, CPU_SVM, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, }, + { 0xC0011022, 0xC0011023, CPU_CONF, }, }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, - { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, - { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, - { 0x000001D9, 
0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, - { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, - { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, - { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, - { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, - { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, - { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, - { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, - { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ - int flag; - - switch (model) { - case 0x0501: - case 0x0502: - case 0x0504: - flag = CPU_INTEL_PENTIUM; - break; - case 0x0601: - case 0x0603: - case 0x0605: - case 0x0607: - case 0x0608: - case 0x060A: - case 0x060B: - flag = CPU_INTEL_P6; - break; - case 
0x0609: - case 0x060D: - flag = CPU_INTEL_PENTIUM_M; - break; - case 0x060E: - flag = CPU_INTEL_CORE; - break; - case 0x060F: - case 0x0617: - flag = CPU_INTEL_CORE2; - break; - case 0x061C: - flag = CPU_INTEL_ATOM; - break; - case 0x0F00: - case 0x0F01: - case 0x0F02: - case 0x0F03: - case 0x0F04: - flag = CPU_INTEL_XEON_P4; - break; - case 0x0F06: - flag = CPU_INTEL_XEON_MP; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ - int flag; - - switch (model >> 8) { - case 0x6: - flag = CPU_AMD_K6; - break; - case 0x7: - flag = CPU_AMD_K7; - break; - case 0x8: - flag = CPU_AMD_K8; - break; - case 0xf: - flag = CPU_AMD_0F; - break; - case 0x10: - flag = CPU_AMD_10; - break; - case 0x11: - flag = CPU_AMD_11; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ - int flag; - - flag = per_cpu(cpu_model, cpu); - - switch (flag >> 16) { - case X86_VENDOR_INTEL: - flag = get_intel_modelflag(flag); - break; - case X86_VENDOR_AMD: - flag = get_amd_modelflag(flag & 0xffff); - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ - int index; - - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - index = ARRAY_SIZE(cpu_intel_range); - break; - case X86_VENDOR_AMD: - index = ARRAY_SIZE(cpu_amd_range); - break; - default: - index = 0; - break; - } - - return index; -} - static int is_typeflag_valid(unsigned cpu, unsigned flag) { - unsigned vendor, modelflag; - int i, index; + int i; /* Standard Registers should be always valid */ if (flag >= CPU_TSS) return 1; - modelflag = per_cpu(cpu_modelflag, cpu); - vendor = per_cpu(cpu_model, cpu) >> 16; - index = get_cpu_range_count(cpu); - - for (i = 0; i < index; i++) { - switch (vendor) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[i].model & modelflag) && - (cpu_intel_range[i].flag & flag)) - return 
1; - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[i].model & modelflag) && - (cpu_amd_range[i].flag & flag)) - return 1; - break; - } + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { + if (cpu_reg_range[i].flag == flag) + return 1; } /* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, int index, unsigned flag) { - unsigned modelflag; - - modelflag = per_cpu(cpu_modelflag, cpu); - *max = 0; - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[index].model & modelflag) && - (cpu_intel_range[index].flag & flag)) { - *min = cpu_intel_range[index].min; - *max = cpu_intel_range[index].max; - } - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[index].model & modelflag) && - (cpu_amd_range[index].flag & flag)) { - *min = cpu_amd_range[index].min; - *max = cpu_amd_range[index].max; - } - break; - } + if (cpu_reg_range[index].flag == flag) { + *min = cpu_reg_range[index].min; + *max = cpu_reg_range[index].max; + } else + *max = 0; return *max; } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) unsigned msr, msr_min, msr_max; struct cpu_private *priv; u32 low, high; - int i, range; + int i; if (seq) { priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) } } - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) continue; @@ -788,13 +569,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) { struct dentry *cpu_dentry = NULL; unsigned reg, reg_min, reg_max; - int i, range, err = 0; + int i, err = 0; char reg_dir[12]; u32 low, high; - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if 
(!get_cpu_range(cpu, &reg_min, &reg_max, i, cpu_base[type].flag)) continue; @@ -850,10 +629,6 @@ static int cpu_init_cpu(void) cpui = &cpu_data(cpu); if (!cpu_has(cpui, X86_FEATURE_MSR)) continue; - per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | - (cpui->x86 << 8) | - (cpui->x86_model)); - per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); sprintf(cpu_dir, "cpu%d", cpu); cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); From 521a415c9f6d4e5463807ce6d36598acabcd204f Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 7 Jun 2009 13:52:50 +0200 Subject: [PATCH 844/900] pdc202xx_old: fix 'pdc20246_dma_ops' Commit ac95beedf8bc97b24f9540d4da9952f07221c023 (ide: add struct ide_port_ops (take 2)) erroneously converted the driver's dma_timeout() and dma_lost_irq() methods to call the driver's resetproc() method regardless of whether it was defined for this specific controller while it hadn't been defined and hence called for PDC20246. So the dma_clear() method, the successor of dma_timeout(), shouldn't exist and the dma_lost_irq() method should be standard for PDC20246.
Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/pdc202xx_old.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 248a54bd238..8df26302a0b 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -328,9 +328,8 @@ static const struct ide_dma_ops pdc20246_dma_ops = { .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = pdc202xx_dma_test_irq, - .dma_lost_irq = pdc202xx_dma_lost_irq, + .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; From 669165daad2ec839df85b8c5f7bc155e76a2f404 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 7 Jun 2009 13:52:50 +0200 Subject: [PATCH 845/900] pdc202xx_old: fix resetproc() method pdc202xx_reset() calls pdc202xx_reset_host() twice, for both channels, while that function actually twiddles the single, shared software reset bit -- the net effect is a duplicated reset and horrendous 4 second delay happening not only on a channel reset but also when dma_lost_irq() and dma_clear() methods are called. Fold pdc202xx_reset_host() into pdc202xx_reset(), fix printk(), and move it before the actual reset... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/pdc202xx_old.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 8df26302a0b..b3bc96f930a 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -1,6 +1,6 @@ /* * Copyright (C) 1998-2002 Andre Hedrick - * Copyright (C) 2006-2007 MontaVista Software, Inc. + * Copyright (C) 2006-2007, 2009 MontaVista Software, Inc. * Copyright (C) 2007 Bartlomiej Zolnierkiewicz * * Portions Copyright (C) 1999 Promise Technology, Inc. 
@@ -227,28 +227,19 @@ somebody_else: return (dma_stat & 4) == 4; /* return 1 if INTR asserted */ } -static void pdc202xx_reset_host (ide_hwif_t *hwif) +static void pdc202xx_reset(ide_drive_t *drive) { + ide_hwif_t *hwif = drive->hwif; unsigned long high_16 = hwif->extra_base - 16; u8 udma_speed_flag = inb(high_16 | 0x001f); + printk(KERN_WARNING "PDC202xx: software reset...\n"); + outb(udma_speed_flag | 0x10, high_16 | 0x001f); mdelay(100); outb(udma_speed_flag & ~0x10, high_16 | 0x001f); mdelay(2000); /* 2 seconds ?! */ - printk(KERN_WARNING "PDC202XX: %s channel reset.\n", - hwif->channel ? "Secondary" : "Primary"); -} - -static void pdc202xx_reset (ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - ide_hwif_t *mate = hwif->mate; - - pdc202xx_reset_host(hwif); - pdc202xx_reset_host(mate); - ide_set_max_pio(drive); } From 4a4aca641bc4598e77b866804f47c651ec4a764d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 5 Jun 2009 12:02:38 +0200 Subject: [PATCH 846/900] x86: Add quirk for reboot stalls on a Dell Optiplex 360 The Dell Optiplex 360 hangs on reboot, just like the Optiplex 330, so the same quirk is needed. 
Signed-off-by: Jean Delvare Cc: Steve Conklin Cc: Leann Ogasawara Cc: LKML-Reference: <200906051202.38311.jdelvare@suse.de> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e0b5a..d2d1ce8170f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", From 103428e57be323c3c5545db8ad12667099bc6005 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 7 Jun 2009 16:48:40 +0400 Subject: [PATCH 847/900] x86, apic: Fix dummy apic read operation together with broken MP handling Ingo Molnar reported that read_apic is buggy nowadays: [ 0.000000] Using APIC driver default [ 0.000000] SMP: Allowing 1 CPUs, 0 hotplug CPUs [ 0.000000] Local APIC disabled by BIOS -- you can enable it with "lapic" [ 0.000000] APIC: disable apic facility [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at arch/x86/kernel/apic/apic.c:254 native_apic_read_dummy+0x2d/0x3b() [ 0.000000] Hardware name: HP OmniBook PC Indeed we still rely on apic->read operation for SMP compiled kernel. And instead of disfiguring the SMP code with #ifdef we allow to call apic->read. To capture any unexpected results we check for apic->read being called for sane reason via WARN_ON_ONCE but(!) instead of OR we should use AND logical operation (thanks Yinghai for spotting the root of the problem).
Along with that we could have a bad MP table, and we fix it in such a way that SMP is not started and there are no complaints about a BIOS bug if the apic was just disabled via the command line. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090607124840.GD4547@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 9 ++++++++- arch/x86/kernel/smpboot.c | 8 +++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e82488d3f0b..a4c9cf0bf70 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -249,7 +249,7 @@ static void native_apic_write_dummy(u32 reg, u32 v) static u32 native_apic_read_dummy(u32 reg) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); return 0; } @@ -1609,6 +1609,13 @@ void __init init_apic_mappings(void) new_apicid = read_apic_id(); if (boot_cpu_physical_apicid != new_apicid) { boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ apic_version[new_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d2e8de95815..7c80007ea5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation."
"(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; From a4046f8d299e00e9855ae292527c2d66a42670eb Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 7 Jun 2009 12:19:37 +0400 Subject: [PATCH 848/900] x86, nmi: Use predefined numbers instead of hardcoded one [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090607081937.GC4547@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c45a0a568df..c9726440993 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void) * but since they are power of two we could use a * cheaper way --cvg */ - return nmi_watchdog & 0x3; + return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); } #endif From 3aa6b186f86c5d06d6d92d14311ffed51f091f40 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Sun, 7 Jun 2009 16:23:48 +0200 Subject: [PATCH 849/900] x86: Fix non-lazy GS handling in sys_vm86() This fixes a stack corruption panic or null dereference oops due to a bad GS in resume_userspace() when returning from sys_vm86() and calling lockdep_sys_exit(). Only a problem when CONFIG_LOCKDEP and CONFIG_CC_STACKPROTECTOR enabled. Signed-off-by: Lubomir Rintel Cc: H. 
Peter Anvin LKML-Reference: <1244384628.2323.4.camel@bimbo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vm86_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..6a177694058 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.ds = 0; info->regs.pt.es = 0; info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. - */ +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif /* * The flags register is also special: we cannot trust that the user @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS "mov %2, %%gs\n\t" +#endif "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); From aeef50bc0483fa70ce0bddb686ec84a274b7f3d4 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sun, 7 Jun 2009 22:30:36 +0800 Subject: [PATCH 850/900] x86, microcode: Simplify vfree() use vfree() does its own 'NULL' check, so no need for check before calling it. In v2, remove the stray newline. 
[ Impact: cleanup ] Signed-off-by: Figo.zhang Cc: Dmitry Adamushko LKML-Reference: <1244385036.3402.11.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c8be20f1644..366baa17991 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -241,10 +241,8 @@ static int install_equiv_cpu_table(const u8 *buf) static void free_equiv_cpu_table(void) { - if (equiv_cpu_table) { - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; - } + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } static enum ucode_state @@ -279,8 +277,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else @@ -292,8 +289,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (new_mc) { if (!leftover) { - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", From aa853f85d9ed593672d0f24a98c72a2518cb63e6 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sat, 6 Jun 2009 10:17:57 +0100 Subject: [PATCH 851/900] [ARM] 5543/1: arm: serial amba: add missing declaration in serial.h This header is sometimes included in the uncompress stage to get register values, but no <linux/amba/bus.h> can be included there. So declare "struct amba_device" here before using it in a prototype.
Signed-off-by: Alessandro Rubini Acked-by: Andrea Gallo Signed-off-by: Russell King --- include/linux/amba/serial.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index 48ee32a18ac..64a982ea5d5 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -159,6 +159,7 @@ #define UART01x_FR_MODEM_ANY (UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS) #ifndef __ASSEMBLY__ +struct amba_device; /* in uncompress this is included but amba/bus.h is not */ struct amba_pl010_data { void (*set_mctrl)(struct amba_device *dev, void __iomem *base, unsigned int mctrl); }; From a4c0364be3f43d3e17fe19270f8b3d64881606e6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 6 Jun 2009 12:34:39 +0300 Subject: [PATCH 852/900] KVM: Explicitly initialize cpus_hardware_enabled Under CONFIG_MAXSMP, cpus_hardware_enabled is allocated from the heap and not statically initialized. This causes a crash on reboot when kvm thinks vmx is enabled on random nonexistent cpus and accesses nonexistent percpu lists. Fix by explicitly clearing the variable. Cc: stable@kernel.org Reported-and-tested-by: Yinghai Lu Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1ecbe2391c8..4293528200b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2305,6 +2305,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, r = -ENOMEM; goto out_free_0; } + cpumask_clear(cpus_hardware_enabled); r = kvm_arch_hardware_setup(); if (r < 0) From 312325094785566a0e42a88c1bf6e7eb54c5d70e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 15:07:08 +0200 Subject: [PATCH 853/900] dma-debug: comment style fixes Last patch series introduced some new comments which do not fit the Kernel comment style guidelines. Fix it with this patch.
Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 77053d9ef51..b8a61ff0854 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -542,7 +542,8 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, write_lock_irqsave(&driver_name_lock, flags); - /* Now handle the string we got from userspace very carefully. + /* + * Now handle the string we got from userspace very carefully. * The rules are: * - only use the first token we got * - token delimiter is everything looking like a space @@ -551,7 +552,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, */ if (!isalnum(buf[0])) { /* - If the first character userspace gave us is not + * If the first character userspace gave us is not * alphanumerical then assume the filter should be * switched off. */ From c17e2cf7376a2010b8b114fdeebd4e340a5e9cb2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 15:19:29 +0200 Subject: [PATCH 854/900] dma-debug: code style fixes This patch changes the recent updates to dma-debug to conform with coding style guidelines of Linux and the -tip tree. 
Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index b8a61ff0854..9561825c14a 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -501,8 +501,8 @@ out_err: static ssize_t filter_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { - unsigned long flags; char buf[NAME_MAX_LEN + 1]; + unsigned long flags; int len; if (!current_driver_name[0]) @@ -523,9 +523,9 @@ static ssize_t filter_read(struct file *file, char __user *user_buf, static ssize_t filter_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { - unsigned long flags; char buf[NAME_MAX_LEN]; - size_t len = NAME_MAX_LEN - 1; + unsigned long flags; + size_t len; int i; /* @@ -534,7 +534,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, * disabled. Since copy_from_user can fault and may sleep we * need to copy to temporary buffer first */ - len = min(count, len); + len = min(count, NAME_MAX_LEN - 1); if (copy_from_user(buf, userbuf, len)) return -EFAULT; @@ -1040,18 +1040,19 @@ EXPORT_SYMBOL(debug_dma_map_sg); static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s) { - struct dma_debug_entry *entry; + struct dma_debug_entry *entry, ref; struct hash_bucket *bucket; unsigned long flags; - int mapped_ents = 0; - struct dma_debug_entry ref; + int mapped_ents; - ref.dev = dev; + ref.dev = dev; ref.dev_addr = sg_dma_address(s); - ref.size = sg_dma_len(s), + ref.size = sg_dma_len(s), + + bucket = get_hash_bucket(&ref, &flags); + entry = hash_bucket_find(bucket, &ref); + mapped_ents = 0; - bucket = get_hash_bucket(&ref, &flags); - entry = hash_bucket_find(bucket, &ref); if (entry) mapped_ents = entry->sg_mapped_ents; put_hash_bucket(bucket, &flags); From e7ed70eedccc78e79ce6da2155e9caf90aff4003 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 15:39:24 +0200 Subject: [PATCH 
855/900] dma-debug: use pr_* instead of printk(KERN_* ...) The pr_* macros are shorter than the old printk(KERN_ ...) variant. Change the dma-debug code to use the new macros and save a few unnecessary line breaks. If lines don't break the source code can also be grepped more easily. Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 9561825c14a..24c4a2c5d61 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -139,7 +139,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) { #ifdef CONFIG_STACKTRACE if (entry) { - printk(KERN_WARNING "Mapped at:\n"); + pr_warning("Mapped at:\n"); print_stack_trace(&entry->stacktrace, 0); } #endif @@ -377,8 +377,7 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_lock_irqsave(&free_entries_lock, flags); if (list_empty(&free_entries)) { - printk(KERN_ERR "DMA-API: debugging out of memory " - "- disabling\n"); + pr_err("DMA-API: debugging out of memory - disabling\n"); global_disable = true; goto out; } @@ -483,8 +482,7 @@ static int prealloc_memory(u32 num_entries) num_free_entries = num_entries; min_free_entries = num_entries; - printk(KERN_INFO "DMA-API: preallocated %d debug entries\n", - num_entries); + pr_info("DMA-API: preallocated %d debug entries\n", num_entries); return 0; @@ -534,7 +532,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, * disabled. Since copy_from_user can fault and may sleep we * need to copy to temporary buffer first */ - len = min(count, NAME_MAX_LEN - 1); + len = min(count, (size_t)(NAME_MAX_LEN - 1)); if (copy_from_user(buf, userbuf, len)) return -EFAULT; @@ -557,8 +555,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, * switched off. 
*/ if (current_driver_name[0]) - printk(KERN_INFO "DMA-API: switching off dma-debug " - "driver filter\n"); + pr_info("DMA-API: switching off dma-debug driver filter\n"); current_driver_name[0] = 0; current_driver = NULL; goto out_unlock; @@ -576,8 +573,8 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, current_driver_name[i] = 0; current_driver = NULL; - printk(KERN_INFO "DMA-API: enable driver filter for driver [%s]\n", - current_driver_name); + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); out_unlock: write_unlock_irqrestore(&driver_name_lock, flags); @@ -594,7 +591,7 @@ static int dma_debug_fs_init(void) { dma_debug_dent = debugfs_create_dir("dma-api", NULL); if (!dma_debug_dent) { - printk(KERN_ERR "DMA-API: can not create debugfs directory\n"); + pr_err("DMA-API: can not create debugfs directory\n"); return -ENOMEM; } @@ -693,7 +690,7 @@ void dma_debug_add_bus(struct bus_type *bus) nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); if (nb == NULL) { - printk(KERN_ERR "dma_debug_add_bus: out of memory\n"); + pr_err("dma_debug_add_bus: out of memory\n"); return; } @@ -718,8 +715,7 @@ void dma_debug_init(u32 num_entries) } if (dma_debug_fs_init() != 0) { - printk(KERN_ERR "DMA-API: error creating debugfs entries " - "- disabling\n"); + pr_err("DMA-API: error creating debugfs entries - disabling\n"); global_disable = true; return; @@ -729,8 +725,7 @@ void dma_debug_init(u32 num_entries) num_entries = req_entries; if (prealloc_memory(num_entries) != 0) { - printk(KERN_ERR "DMA-API: debugging out of memory error " - "- disabled\n"); + pr_err("DMA-API: debugging out of memory error - disabled\n"); global_disable = true; return; @@ -738,7 +733,7 @@ void dma_debug_init(u32 num_entries) nr_total_entries = num_free_entries; - printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); + pr_info("DMA-API: debugging enabled by kernel config\n"); } static __init int dma_debug_cmdline(char *str) 
@@ -747,8 +742,7 @@ static __init int dma_debug_cmdline(char *str) return -EINVAL; if (strncmp(str, "off", 3) == 0) { - printk(KERN_INFO "DMA-API: debugging disabled on kernel " - "command line\n"); + pr_info("DMA-API: debugging disabled on kernel command line\n"); global_disable = true; } @@ -1239,8 +1233,8 @@ static int __init dma_debug_driver_setup(char *str) } if (current_driver_name[0]) - printk(KERN_INFO "DMA-API: enable driver filter for " - "driver [%s]\n", current_driver_name); + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); return 1; From be81c6ea23b8b471141734ef4bc005f5127aaf43 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 15:46:19 +0200 Subject: [PATCH 856/900] dma-debug: disable/enable irqs only once in device_dma_allocations There is no need to disable/enable irqs on each loop iteration. Just disable irqs for the whole time the loop runs. Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 24c4a2c5d61..27b369da52c 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -649,15 +649,19 @@ static int device_dma_allocations(struct device *dev) unsigned long flags; int count = 0, i; + local_irq_save(flags); + for (i = 0; i < HASH_SIZE; ++i) { - spin_lock_irqsave(&dma_entry_hash[i].lock, flags); + spin_lock(&dma_entry_hash[i].lock); list_for_each_entry(entry, &dma_entry_hash[i].list, list) { if (entry->dev == dev) count += 1; } - spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); + spin_unlock(&dma_entry_hash[i].lock); } + local_irq_restore(flags); + return count; } From 0bf841281e58d0b3cc9fe9dc4383df7694bde6bd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 15:53:46 +0200 Subject: [PATCH 857/900] dma-debug: simplify logic in driver_filter() This patch makes the driver_filter function more readable by reorganizing the code. 
The removal of a code block to an upper indentation level makes hard-to-read line-wraps unnecessary. Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 27b369da52c..ad65fc0317d 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -147,6 +147,10 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) static bool driver_filter(struct device *dev) { + struct device_driver *drv; + unsigned long flags; + bool ret; + /* driver filter off */ if (likely(!current_driver_name[0])) return true; @@ -155,32 +159,28 @@ static bool driver_filter(struct device *dev) if (current_driver && dev->driver == current_driver) return true; + if (current_driver || !current_driver_name[0]) + return false; + /* driver filter on but not yet initialized */ - if (!current_driver && current_driver_name[0]) { - struct device_driver *drv = get_driver(dev->driver); - unsigned long flags; - bool ret = false; + drv = get_driver(dev->driver); + if (!drv) + return false; - if (!drv) - return false; + /* lock to protect against change of current_driver_name */ + read_lock_irqsave(&driver_name_lock, flags); - /* lock to protect against change of current_driver_name */ - read_lock_irqsave(&driver_name_lock, flags); - - if (drv->name && - strncmp(current_driver_name, drv->name, - NAME_MAX_LEN-1) == 0) { - current_driver = drv; - ret = true; - } - - read_unlock_irqrestore(&driver_name_lock, flags); - put_driver(drv); - - return ret; + ret = false; + if (drv->name && + strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { + current_driver = drv; + ret = true; } - return false; + read_unlock_irqrestore(&driver_name_lock, flags); + put_driver(drv); + + return ret; } #define err_printk(dev, entry, format, arg...)
do { \ From e36b80b658d471be5a8a40f00e2c7614524b86a2 Mon Sep 17 00:00:00 2001 From: Matthieu Castet Date: Fri, 22 May 2009 22:25:04 +0200 Subject: [PATCH 858/900] SSB: BCM47xx: Export ssb_watchdog_timer_set this patch export ssb_watchdog_timer_set to allow to use it in a Linux watchdog driver. Signed-off-by: Matthieu CASTET Acked-by : Michael Buesch Signed-off-by: Ralf Baechle --- drivers/ssb/embedded.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ssb/embedded.c b/drivers/ssb/embedded.c index 7dc3a6b4139..a0e0d246b59 100644 --- a/drivers/ssb/embedded.c +++ b/drivers/ssb/embedded.c @@ -29,6 +29,7 @@ int ssb_watchdog_timer_set(struct ssb_bus *bus, u32 ticks) } return -ENODEV; } +EXPORT_SYMBOL(ssb_watchdog_timer_set); u32 ssb_gpio_in(struct ssb_bus *bus, u32 mask) { From e082f188f774544bc2c2edf51176157503c98fe4 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 2 Jun 2009 19:05:28 +0100 Subject: [PATCH 859/900] MIPS: Sibyte: Honor CONFIG_CMDLINE Original patch by Imre Kaloz . 
Signed-off-by: Ralf Baechle --- arch/mips/sibyte/cfe/setup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/mips/sibyte/cfe/setup.c b/arch/mips/sibyte/cfe/setup.c index 3de30f79db3..eb5396cf81b 100644 --- a/arch/mips/sibyte/cfe/setup.c +++ b/arch/mips/sibyte/cfe/setup.c @@ -288,13 +288,7 @@ void __init prom_init(void) */ cfe_cons_handle = cfe_getstdhandle(CFE_STDHANDLE_CONSOLE); if (cfe_getenv("LINUX_CMDLINE", arcs_cmdline, CL_SIZE) < 0) { - if (argc < 0) { - /* - * It's OK for direct boot to not provide a - * command line - */ - strcpy(arcs_cmdline, "root=/dev/ram0 "); - } else { + if (argc >= 0) { /* The loader should have set the command line */ /* too early for panic to do any good */ printk("LINUX_CMDLINE not defined in cfe."); From c9d89d97f0d174b9154820dd5c6726d1c794cd99 Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Tue, 2 Jun 2009 23:15:10 +0900 Subject: [PATCH 860/900] MIPS: Kconfig: Remove "Support for" from Cavium system type Signed-off-by: Yoichi Yuasa Acked-by: David Daney Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 09b1287a92c..28119e641f1 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -593,7 +593,7 @@ config WR_PPMC board, which is based on GT64120 bridge chip. config CAVIUM_OCTEON_SIMULATOR - bool "Support for the Cavium Networks Octeon Simulator" + bool "Cavium Networks Octeon Simulator" select CEVT_R4K select 64BIT_PHYS_ADDR select DMA_COHERENT @@ -607,7 +607,7 @@ config CAVIUM_OCTEON_SIMULATOR hardware. 
config CAVIUM_OCTEON_REFERENCE_BOARD - bool "Support for the Cavium Networks Octeon reference board" + bool "Cavium Networks Octeon reference board" select CEVT_R4K select 64BIT_PHYS_ADDR select DMA_COHERENT From e25bfc9243f2eab12a2ce92b7f4b8a2e3e6949a6 Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Tue, 2 Jun 2009 23:17:07 +0900 Subject: [PATCH 861/900] MIPS: Cobalt: PCI bus is always required to obtain the board ID Signed-off-by: Yoichi Yuasa Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 28119e641f1..25f3b0a11ca 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -72,6 +72,7 @@ config MIPS_COBALT select IRQ_CPU select IRQ_GT641XX select PCI_GT64XXX_PCI0 + select PCI select SYS_HAS_CPU_NEVADA select SYS_HAS_EARLY_PRINTK select SYS_SUPPORTS_32BIT_KERNEL From 3a553147eaad5d4de90ab1f695aa13ddbea684ec Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 4 Jun 2009 18:05:49 +0530 Subject: [PATCH 862/900] MIPS: ioctl.h: Fix headers_check warnings Make ioctl.h compatible with asm-generic/ioctl.h and userspace fix the following 'make headers_check' warning: usr/include/asm-mips/ioctl.h:64: extern's make no sense in userspace Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ralf Baechle --- arch/mips/include/asm/ioctl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/mips/include/asm/ioctl.h b/arch/mips/include/asm/ioctl.h index 85067e248a8..916163401b2 100644 --- a/arch/mips/include/asm/ioctl.h +++ b/arch/mips/include/asm/ioctl.h @@ -60,12 +60,16 @@ ((nr) << _IOC_NRSHIFT) | \ ((size) << _IOC_SIZESHIFT)) +#ifdef __KERNEL__ /* provoke compile error for invalid uses of size argument */ extern unsigned int __invalid_size_argument_for_IOC; #define _IOC_TYPECHECK(t) \ ((sizeof(t) == sizeof(t[1]) && \ sizeof(t) < (1 << _IOC_SIZEBITS)) ? 
\ sizeof(t) : __invalid_size_argument_for_IOC) +#else +#define _IOC_TYPECHECK(t) (sizeof(t)) +#endif /* used to create numbers */ #define _IO(type, nr) _IOC(_IOC_NONE, (type), (nr), 0) From 5636919b5c909fee54a6ef5226475ecae012ad02 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sat, 28 Feb 2009 09:44:28 +0000 Subject: [PATCH 863/900] MIPS: Outline udelay and fix a few issues. Outlining fixes the issue were on certain CPUs such as the R10000 family the delay loop would need an extra cycle if it overlaps a cacheline boundary. The rewrite also fixes build errors with GCC 4.4 which was changed in way incompatible with the kernel's inline assembly. Relying on pure C for computation of the delay value removes the need for explicit. The price we pay is a slight slowdown of the computation - to be fixed on another day. Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cpu-info.h | 4 +- arch/mips/include/asm/delay.h | 92 ++------------------------------ arch/mips/kernel/proc.c | 2 +- arch/mips/lib/Makefile | 4 +- arch/mips/lib/delay.c | 56 +++++++++++++++++++ 5 files changed, 66 insertions(+), 92 deletions(-) create mode 100644 arch/mips/lib/delay.c diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h index 744cd8fb107..126044308de 100644 --- a/arch/mips/include/asm/cpu-info.h +++ b/arch/mips/include/asm/cpu-info.h @@ -39,8 +39,8 @@ struct cache_desc { #define MIPS_CACHE_PINDEX 0x00000020 /* Physically indexed cache */ struct cpuinfo_mips { - unsigned long udelay_val; - unsigned long asid_cache; + unsigned int udelay_val; + unsigned int asid_cache; /* * Capability and feature descriptor structure for MIPS CPU diff --git a/arch/mips/include/asm/delay.h b/arch/mips/include/asm/delay.h index b0bccd2c4ed..a07e51b2be1 100644 --- a/arch/mips/include/asm/delay.h +++ b/arch/mips/include/asm/delay.h @@ -11,94 +11,12 @@ #ifndef _ASM_DELAY_H #define _ASM_DELAY_H -#include -#include +extern void __delay(unsigned int loops); +extern void 
__ndelay(unsigned int ns); +extern void __udelay(unsigned int us); -#include -#include - -static inline void __delay(unsigned long loops) -{ - if (sizeof(long) == 4) - __asm__ __volatile__ ( - " .set noreorder \n" - " .align 3 \n" - "1: bnez %0, 1b \n" - " subu %0, 1 \n" - " .set reorder \n" - : "=r" (loops) - : "0" (loops)); - else if (sizeof(long) == 8 && !DADDI_WAR) - __asm__ __volatile__ ( - " .set noreorder \n" - " .align 3 \n" - "1: bnez %0, 1b \n" - " dsubu %0, 1 \n" - " .set reorder \n" - : "=r" (loops) - : "0" (loops)); - else if (sizeof(long) == 8 && DADDI_WAR) - __asm__ __volatile__ ( - " .set noreorder \n" - " .align 3 \n" - "1: bnez %0, 1b \n" - " dsubu %0, %2 \n" - " .set reorder \n" - : "=r" (loops) - : "0" (loops), "r" (1)); -} - - -/* - * Division by multiplication: you don't have to worry about - * loss of precision. - * - * Use only for very small delays ( < 1 msec). Should probably use a - * lookup table, really, as the multiplications take much too long with - * short delays. This is a "reasonable" implementation, though (and the - * first constant multiplications gets optimized away if the delay is - * a constant) - */ - -static inline void __udelay(unsigned long usecs, unsigned long lpj) -{ - unsigned long hi, lo; - - /* - * The rates of 128 is rounded wrongly by the catchall case - * for 64-bit. Excessive precission? Probably ... 
- */ -#if defined(CONFIG_64BIT) && (HZ == 128) - usecs *= 0x0008637bd05af6c7UL; /* 2**64 / (1000000 / HZ) */ -#elif defined(CONFIG_64BIT) - usecs *= (0x8000000000000000UL / (500000 / HZ)); -#else /* 32-bit junk follows here */ - usecs *= (unsigned long) (((0x8000000000000000ULL / (500000 / HZ)) + - 0x80000000ULL) >> 32); -#endif - - if (sizeof(long) == 4) - __asm__("multu\t%2, %3" - : "=h" (usecs), "=l" (lo) - : "r" (usecs), "r" (lpj) - : GCC_REG_ACCUM); - else if (sizeof(long) == 8 && !R4000_WAR) - __asm__("dmultu\t%2, %3" - : "=h" (usecs), "=l" (lo) - : "r" (usecs), "r" (lpj) - : GCC_REG_ACCUM); - else if (sizeof(long) == 8 && R4000_WAR) - __asm__("dmultu\t%3, %4\n\tmfhi\t%0" - : "=r" (usecs), "=h" (hi), "=l" (lo) - : "r" (usecs), "r" (lpj) - : GCC_REG_ACCUM); - - __delay(usecs); -} - -#define __udelay_val cpu_data[raw_smp_processor_id()].udelay_val - -#define udelay(usecs) __udelay((usecs), __udelay_val) +#define ndelay(ns) __udelay(ns) +#define udelay(us) __udelay(us) /* make sure "usecs *= ..." in udelay do not overflow. */ #if HZ >= 1000 diff --git a/arch/mips/kernel/proc.c b/arch/mips/kernel/proc.c index 26760cad8b6..e0a4ac18fa0 100644 --- a/arch/mips/kernel/proc.c +++ b/arch/mips/kernel/proc.c @@ -42,7 +42,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, fmt, __cpu_name[n], (version >> 4) & 0x0f, version & 0x0f, (fp_vers >> 4) & 0x0f, fp_vers & 0x0f); - seq_printf(m, "BogoMIPS\t\t: %lu.%02lu\n", + seq_printf(m, "BogoMIPS\t\t: %u.%02u\n", cpu_data[n].udelay_val / (500000/HZ), (cpu_data[n].udelay_val / (5000/HZ)) % 100); seq_printf(m, "wait instruction\t: %s\n", cpu_wait ? "yes" : "no"); diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile index c13c7ad2cda..2adead5a8a3 100644 --- a/arch/mips/lib/Makefile +++ b/arch/mips/lib/Makefile @@ -2,8 +2,8 @@ # Makefile for MIPS-specific library files.. 
# -lib-y += csum_partial.o memcpy.o memcpy-inatomic.o memset.o strlen_user.o \ - strncpy_user.o strnlen_user.o uncached.o +lib-y += csum_partial.o delay.o memcpy.o memcpy-inatomic.o memset.o \ + strlen_user.o strncpy_user.o strnlen_user.o uncached.o obj-y += iomap.o obj-$(CONFIG_PCI) += iomap-pci.o diff --git a/arch/mips/lib/delay.c b/arch/mips/lib/delay.c new file mode 100644 index 00000000000..f69c6b569eb --- /dev/null +++ b/arch/mips/lib/delay.c @@ -0,0 +1,56 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1994 by Waldorf Electronics + * Copyright (C) 1995 - 2000, 01, 03 by Ralf Baechle + * Copyright (C) 1999, 2000 Silicon Graphics, Inc. + * Copyright (C) 2007 Maciej W. Rozycki + */ +#include +#include +#include + +#include +#include + +inline void __delay(unsigned int loops) +{ + __asm__ __volatile__ ( + " .set noreorder \n" + " .align 3 \n" + "1: bnez %0, 1b \n" + " subu %0, 1 \n" + " .set reorder \n" + : "=r" (loops) + : "0" (loops)); +} +EXPORT_SYMBOL(__delay); + +/* + * Division by multiplication: you don't have to worry about + * loss of precision. + * + * Use only for very small delays ( < 1 msec). Should probably use a + * lookup table, really, as the multiplications take much too long with + * short delays. 
This is a "reasonable" implementation, though (and the + * first constant multiplications gets optimized away if the delay is + * a constant) + */ + +void __udelay(unsigned long us) +{ + unsigned int lpj = current_cpu_data.udelay_val; + + __delay((us * 0x000010c7 * HZ * lpj) >> 32); +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long ns) +{ + unsigned int lpj = current_cpu_data.udelay_val; + + __delay((us * 0x00000005 * HZ * lpj) >> 32); +} +EXPORT_SYMBOL(__ndelay); From 5284c6b99ea017f73c09b50f34a637ff9d5d26a0 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 8 Jun 2009 12:31:00 +0100 Subject: [PATCH 864/900] pata_netcell: Fix typo The previous patch submission had a I typo I didn't catch but Bartlomiej noted. Guess this proves the point about any patch being risky late in an rc Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/ata/pata_netcell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/pata_netcell.c b/drivers/ata/pata_netcell.c index 9a698097134..f0d52f72f5b 100644 --- a/drivers/ata/pata_netcell.c +++ b/drivers/ata/pata_netcell.c @@ -26,7 +26,7 @@ static unsigned int netcell_read_id(struct ata_device *adev, unsigned int err_mask = ata_do_dev_read_id(adev, tf, id); /* Firmware forgets to mark words 85-87 valid */ if (err_mask == 0) - id[ATA_ID_CSF_DEFAULT] |= 0x0400; + id[ATA_ID_CSF_DEFAULT] |= 0x4000; return err_mask; } From c4ed3f04ba9defe22aa729d1646f970f791c03d7 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 8 Jun 2009 10:44:05 -0500 Subject: [PATCH 865/900] x86, UV: Fix macros for multiple coherency domains Fix bug in the SGI UV macros that support systems with multiple coherency domains. The macros used for referencing global MMR (chipset registers) are failing to correctly "or" the NASID (node identifier) bits that reside above M+N. These high bits are supplied automatically by the chipset for memory accesses coming from the processor socket. 
However, the bits must be present for references to the special global MMR space used to map chipset registers. (See uv_hub.h for more details ...) The bug results in references to invalid/incorrect nodes. Signed-off-by: Jack Steiner Cc: LKML-Reference: <20090608154405.GA16395@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 6 ++++-- arch/x86/kernel/apic/x2apic_uv_x.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d3a98ea1062..341070f7ad5 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -133,6 +133,7 @@ struct uv_scir_s { struct uv_hub_info_s { unsigned long global_mmr_base; unsigned long gpa_mask; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) -#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) +#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) +#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL @@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ - ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) #define UV_APIC_PNODE_SHIFT 6 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda6935297..39f2af4b546 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -562,7 +562,7 @@ void __init uv_system_init(void) union uvh_node_id_u 
node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void) mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -607,11 +614,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -634,6 +636,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; From c9690998ef48ffefeccb91c70a7739eebdea57f9 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 8 Jun 2009 19:09:39 +0200 Subject: [PATCH 866/900] x86: memtest: remove 64-bit division Using gcc 3.3.5 a "make allmodconfig" + "CONFIG_KVM=n" triggers a build error: arch/x86/mm/built-in.o(.init.text+0x43f7): In function `__change_page_attr': arch/x86/mm/pageattr.c:114: undefined reference to 
`__udivdi3' make: *** [.tmp_vmlinux1] Error 1 The culprit turned out to be a division in arch/x86/mm/memtest.c For more info see this thread: http://marc.info/?l=linux-kernel&m=124416232620683 The patch entirely removes the division that caused the build error. [ Impact: build fix with certain GCC versions ] Reported-by: Tetsuo Handa Signed-off-by: Andreas Herrmann Cc: Yinghai Lu Cc: xiyou.wangcong@gmail.com Cc: Andrew Morton Cc: LKML-Reference: <20090608170939.GB12431@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 605c8be0621..c0bedcd10f9 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) static void __init memtest(u64 pattern, u64 start_phys, u64 size) { - u64 i, count; - u64 *start; + u64 *p; + void *start, *end; u64 start_bad, last_bad; u64 start_phys_aligned; size_t incr; incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); - count = (size - (start_phys_aligned - start_phys))/incr; start = __va(start_phys_aligned); + end = start + size - (start_phys_aligned - start_phys); start_bad = 0; last_bad = 0; - for (i = 0; i < count; i++) - start[i] = pattern; - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start == pattern) + for (p = start; p < end; p++) + *p = pattern; + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) continue; if (start_phys_aligned == last_bad + incr) { last_bad += incr; From 3af968e066d593bc4dacc021715f3e95ddf0996f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Jun 2009 12:31:53 -0700 Subject: [PATCH 867/900] async: Fix lack of boot-time console due to insufficient synchronization Our async work synchronization was broken by "async: make sure independent async domains can't accidentally entangle" 
(commit d5a877e8dd409d8c702986d06485c374b705d340), because it would report the wrong lowest active async ID when there was both running and pending async work. This caused things like not being able to read the root filesystem, resulting in missing console devices and inability to run 'init', causing a boot-time panic. This fixes it by properly returning the lowest pending async ID: if there is any running async work, that will have a lower ID than any pending work, and we should _not_ look at the pending work list. There were alternative patches from Jaswinder and James, but this one also cleans up the code by removing the pointless 'ret' variable and the unnecessary testing for an empty list around 'for_each_entry()' (if the list is empty, the for_each_entry() thing just won't execute). Fixes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13474 Reported-and-tested-by: Chris Clayton Cc: Jaswinder Singh Rajput Cc: James Bottomley Cc: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/async.c b/kernel/async.c index 50540301ed0..27235f5de19 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,23 +92,18 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; - async_cookie_t ret = next_cookie; /* begin with "infinity" value */ if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); - ret = entry->cookie; + return entry->cookie; } - if (!list_empty(&async_pending)) { - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) { - ret = entry->cookie; - break; - } - } + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) + return entry->cookie; - return ret; + return next_cookie; /* "infinity" value */ } static async_cookie_t lowest_in_progress(struct list_head *running) From
1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Jun 2009 18:18:39 +0200 Subject: [PATCH 868/900] ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. 
Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 14 +++++++++++++- kernel/trace/ring_buffer.c | 9 +++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f1345828c7c..8670f1575fe 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -105,7 +105,19 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * size is in bytes for each per CPU buffer. */ struct ring_buffer * -ring_buffer_alloc(unsigned long size, unsigned flags); +__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key); + +/* + * Because the ring buffer is generic, if other users of the ring buffer get + * traced by ftrace, it can produce lockdep warnings. We need to keep each + * ring buffer's lock class separate. + */ +#define ring_buffer_alloc(size, flags) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc((size), (flags), &__key); \ +}) + void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7102d7a2fad..22878b0d370 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -426,6 +426,8 @@ struct ring_buffer { atomic_t record_disabled; cpumask_var_t cpumask; + struct lock_class_key *reader_lock_key; + struct mutex mutex; struct ring_buffer_per_cpu **buffers; @@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -635,7 +638,8 @@ static int 
rb_cpu_notify(struct notifier_block *self, * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) { struct ring_buffer *buffer; int bsize; @@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; /* need at least two pages */ if (buffer->pages == 1) @@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) kfree(buffer); return NULL; } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. From f001a70cdc61c01452d42e8b32fd7c7842ef62d5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Jun 2009 14:30:31 +1000 Subject: [PATCH 869/900] md/raid5: use conf->raid_disks in preference to mddev->raid_disk mddev->raid_disks can be changed and any time by a request from user-space. It is a suggestion as to what number of raid_disks is desired. conf->raid_disks can only be changed by the raid5 module with suitable locks in place. It is a statement as to the current number of raid_disks. There are two places where the latter should be used, but the former is used. This can lead to a crash when reshaping an array. 
This patch changes mddev-> to conf-> Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d400aef8d9..75469e63ff1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3288,7 +3288,7 @@ static void unplug_slaves(mddev_t *mddev) int i; rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { + for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -4034,7 +4034,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski * We don't need to check the 'failed' flag as when that gets set, * recovery aborts. */ - for (i=0; i<mddev->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].rdev == NULL) still_degraded = 1; From a8c906ca3f63d05f0d25490cf82276f73c6fe095 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Jun 2009 14:39:59 +1000 Subject: [PATCH 870/900] md/raid5 - avoid deadlocks in get_active_stripe during reshape md has functionality to 'quiesce' an array so that all pending IO completes and no new IO starts. This is used to achieve a stable state before making internal changes. Currently this quiescing applies equally to normal IO, resync IO, and reshape IO. However there is a problem with applying it to reshape IO. Reshape can have multiple 'stripe_heads' that must be active together. If the quiesce comes between allocating the first and the last of such a collection, then we deadlock, as the last will not be allocated until the quiesce is lifted, the quiesce will not be lifted until the first (which has been allocated) gets used, and that first cannot be used until the last is allocated. It is not necessary to inhibit reshape IO when a quiesce is requested.
Those places in the code that require a full quiesce will ensure the reshape thread is not running at all. So allow reshape requests to get access to new stripe_heads without being blocked by a 'quiesce'. This only affects in-place reshapes (i.e. where the array does not grow or shrink) and these are only newly supported. So this patch is not needed in earlier kernels. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 75469e63ff1..59f2ec0759b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -362,7 +362,7 @@ static void raid5_unplug_device(struct request_queue *q); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, - int previous, int noblock) + int previous, int noblock, int noquiesce) { struct stripe_head *sh; @@ -372,7 +372,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, do { wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, + conf->quiesce == 0 || noquiesce, conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { @@ -2671,7 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1); + sh2 = get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. 
When later blocks @@ -2944,7 +2944,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3189,7 +3189,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3675,7 +3675,7 @@ static int make_request(struct request_queue *q, struct bio * bi) (unsigned long long)logical_sector); sh = get_active_stripe(conf, new_sector, previous, - (bi->bi_rw&RWA_MASK)); + (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a @@ -3873,7 +3873,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3922,7 +3922,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0); + sh = get_active_stripe(conf, first_sector, 1, 0, 1); 
set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -4022,9 +4022,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - sh = get_active_stripe(conf, sector_nr, 0, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -4086,7 +4086,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1); + sh = get_active_stripe(conf, sector, 0, 1, 0); if (!sh) { /* failed to get a stripe - must wait */ From 0e6e0271a210817e202c8a4bfffbde3e3c0616d1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Jun 2009 16:32:22 +1000 Subject: [PATCH 871/900] md/raid5: fix bug in reshape code when chunk_size decreases. Now that we support changing the chunksize, we calculate "reshape_sectors" to be the max of number of sectors in old and new chunk size. However there is one place where we still use 'chunksize' rather than 'reshape_sectors'. This causes a reshape that reduces the size of chunks to freeze.
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59f2ec0759b..bb37fb1b2d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3916,7 +3916,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped raid5_compute_sector(conf, stripe_addr*(new_data_disks), 1, &dd_idx, NULL); last_sector = - raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) *(new_data_disks) - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) From 29150078d7a1758df8c7a6cd2ec066ac65e1fab9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 10:54:18 +0200 Subject: [PATCH 872/900] amd-iommu: remove BUS_NOTIFY_BOUND_DRIVER handling Handling this event causes device assignment in KVM to fail because the device gets re-attached as soon as the pci-stub registers as the driver for the device. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8510e90ebfe..81872604eb7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1145,17 +1145,6 @@ static int device_change_notifier(struct notifier_block *nb, "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { - case BUS_NOTIFY_BOUND_DRIVER: - if (domain) - goto out; - dma_domain = find_protection_domain(devid); - if (!dma_domain) - dma_domain = iommu->default_dom; - attach_device(iommu, &dma_domain->domain, devid); - DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " - "%d for device %s\n", - dma_domain->domain.id, dev_name(dev)); - break; case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; From 71ff3bca2f70264effe8cbbdd5bc10cf6be5f2f0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 13:47:33 -0700 Subject: [PATCH 873/900] amd-iommu: detach device explicitly before attaching it to a new domain This fixes a bug with a device that could not be assigned to a KVM guest because it is still assigned to a dma_ops protection domain. 
[chrisw: simply remove WARN_ON(), will always fire since dev->driver will be pci-stub] Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 81872604eb7..772e91088e4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2073,7 +2073,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, old_domain = domain_for_device(devid); if (old_domain) - return -EBUSY; + detach_device(old_domain, devid); attach_device(iommu, domain, devid); From e9a22a13c71986851e931bdfa054f68839ff8576 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 12:00:37 +0200 Subject: [PATCH 874/900] amd-iommu: remove unnecessary "AMD IOMMU: " prefix That prefix is already included in the DUMP_printk macro. So there is no need to repeat it in the format string. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 772e91088e4..1c60554537c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1266,9 +1266,8 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " - "%d for device %s\n", - (*domain)->id, dev_name(dev)); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) From fdd7b4c3302c93f6833e338903ea77245eb510b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Jun 2009 04:01:02 -0700 Subject: [PATCH 875/900] r8169: fix crash when large packets are received Michael Tokarev reported receiving a large packet could crash a machine with RTL8169 NIC.
( original thread at http://lkml.org/lkml/2009/6/8/192 ) The problem is that this driver tells the NIC that frames up to 16383 bytes can be received, but provides skbs to the rx ring allocated with smaller sizes (1536 bytes when the standard 1500-byte MTU is used). When a frame larger than what was allocated by the driver is received, the DMA transfer can occur past the end of the buffer and corrupt kernel memory. The fix is to tell the NIC the maximum size a frame can be. This bug is very old (before git introduction, linux-2.6.10), and should be backported to stable versions. Reported-by: Michael Tokarev Signed-off-by: Eric Dumazet Tested-by: Michael Tokarev Signed-off-by: David S. Miller --- drivers/net/r8169.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 8247a945a1d..3b19e0ce290 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -66,7 +66,6 @@ static const int multicast_filter_limit = 32; #define RX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ #define TX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ #define EarlyTxThld 0x3F /* 0x3F means NO early transmit */ -#define RxPacketMaxSize 0x3FE8 /* 16K - 1 - ETH_HLEN - VLAN - CRC... */ #define SafeMtu 0x1c20 /* ... actually life sucks beyond ~7k */ #define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */ @@ -2357,10 +2356,10 @@ static u16 rtl_rw_cpluscmd(void __iomem *ioaddr) return cmd; } -static void rtl_set_rx_max_size(void __iomem *ioaddr) +static void rtl_set_rx_max_size(void __iomem *ioaddr, unsigned int rx_buf_sz) { /* Low hurts. Let's disable the filtering.
*/ - RTL_W16(RxMaxSize, 16383); + RTL_W16(RxMaxSize, rx_buf_sz); } static void rtl8169_set_magic_reg(void __iomem *ioaddr, unsigned mac_version) @@ -2407,7 +2406,7 @@ static void rtl_hw_start_8169(struct net_device *dev) RTL_W8(EarlyTxThres, EarlyTxThld); - rtl_set_rx_max_size(ioaddr); + rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz); if ((tp->mac_version == RTL_GIGA_MAC_VER_01) || (tp->mac_version == RTL_GIGA_MAC_VER_02) || @@ -2668,7 +2667,7 @@ static void rtl_hw_start_8168(struct net_device *dev) RTL_W8(EarlyTxThres, EarlyTxThld); - rtl_set_rx_max_size(ioaddr); + rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz); tp->cp_cmd |= RTL_R16(CPlusCmd) | PktCntrDisable | INTT_1; @@ -2846,7 +2845,7 @@ static void rtl_hw_start_8101(struct net_device *dev) RTL_W8(EarlyTxThres, EarlyTxThld); - rtl_set_rx_max_size(ioaddr); + rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz); tp->cp_cmd |= rtl_rw_cpluscmd(ioaddr) | PCIMulRW; From 52ea3a56a3268bc2a5a7c75e98c81463004e38ef Mon Sep 17 00:00:00 2001 From: Minoru Usui Date: Tue, 9 Jun 2009 04:03:09 -0700 Subject: [PATCH 876/900] cls_cgroup: Fix oops when user send improperly 'tc filter add' request I found a bug in cls_cgroup_change() in cls_cgroup.c. cls_cgroup_change() expected tca[TCA_OPTIONS] was set from user space properly, but tc in iproute2-2.6.29-1 (which I used) didn't set it. In the current source code of tc in git, it sets tca[TCA_OPTIONS]. git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git If we always use the newest iproute2 from git when we use cls_cgroup, we probably won't hit this oops. But I think the kernel shouldn't panic regardless of the user program's behaviour. Signed-off-by: Minoru Usui Signed-off-by: David S.
Miller --- net/sched/cls_cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index cc29b44b150..e5becb92b3e 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -167,6 +167,9 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, struct tcf_exts e; int err; + if (!tca[TCA_OPTIONS]) + return -EINVAL; + if (head == NULL) { if (!handle) return -EINVAL; From 0281b5dc0350cbf6dd21ed558a33cccce77abc02 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Jun 2009 14:50:36 -0700 Subject: [PATCH 877/900] cpumask: introduce zalloc_cpumask_var So can get cpumask_var with cpumask_clear Signed-off-by: Yinghai Lu Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 15 +++++++++++++++ lib/cpumask.c | 12 ++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9f315382610..c5ac87ca7bc 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1022,6 +1022,8 @@ typedef struct cpumask *cpumask_var_t; bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); +bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); +bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); @@ -1040,6 +1042,19 @@ static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, return true; } +static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + cpumask_clear(*mask); + return true; +} + +static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) +{ + cpumask_clear(*mask); + return true; +} + static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } diff --git a/lib/cpumask.c b/lib/cpumask.c index 1f71b97de0f..eb23aaa0c7b 
100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -119,6 +119,12 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) } EXPORT_SYMBOL(alloc_cpumask_var_node); +bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) +{ + return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); +} +EXPORT_SYMBOL(zalloc_cpumask_var_node); + /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned @@ -135,6 +141,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) } EXPORT_SYMBOL(alloc_cpumask_var); +bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return alloc_cpumask_var(mask, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(zalloc_cpumask_var); + /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned From eaa958402ea40851097d051f52ba1bb7a885efe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Jun 2009 14:51:36 -0700 Subject: [PATCH 878/900] cpumask: alloc zeroed cpumask for static cpumask_var_ts These are defined as static cpumask_var_t so if MAXSMP is not used, they are cleared already. Avoid surprises when MAXSMP is enabled. 
Signed-off-by: Yinghai Lu Signed-off-by: Rusty Russell --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- arch/x86/kernel/tlb_uv.c | 2 +- drivers/acpi/processor_core.c | 2 +- drivers/cpufreq/cpufreq.c | 2 +- kernel/sched_cpupri.c | 2 +- kernel/sched_rt.c | 2 +- kernel/smp.c | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 54b6de2cd94..752e8c6b2c7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -550,7 +550,7 @@ static int __init acpi_cpufreq_early_init(void) return -ENOMEM; } for_each_possible_cpu(i) { - if (!alloc_cpumask_var_node( + if (!zalloc_cpumask_var_node( &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, GFP_KERNEL, cpu_to_node(i))) { diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index a8363e5be4e..d47c775eb0a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -322,7 +322,7 @@ static int powernow_acpi_init(void) goto err0; } - if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, + if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, GFP_KERNEL)) { retval = -ENOMEM; goto err05; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 35dc8fbe92b..cf52215d9eb 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -887,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); - if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { + if 
(!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { printk(KERN_ERR PFX "unable to alloc powernow_k8_data cpumask\n"); ret_val = -ENOMEM; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index c9f1fdc0283..55c831ed71c 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy, if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) return -ENOMEM; - if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { free_cpumask_var(saved_mask); return -ENOMEM; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 6fb0b359d2a..09dd1d414fc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1163,7 +1163,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c33761e6..8c7b03b0cfc 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -832,7 +832,7 @@ static int __init uv_bau_init(void) return 0; for_each_possible_cpu(cur_cpu) - alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), + zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 45ad3288c5f..23f0fb84f1c 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -844,7 +844,7 @@ static int acpi_processor_add(struct acpi_device *device) if (!pr) return -ENOMEM; - if 
(!alloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { kfree(pr); return -ENOMEM; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 47d2ad0ae07..6e2ec0b1894 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -808,7 +808,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = -ENOMEM; goto nomem_out; } - if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { free_cpumask_var(policy->cpus); kfree(policy); ret = -ENOMEM; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574c..344712a5e3e 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) vec->count = 0; if (bootmem) alloc_bootmem_cpumask_var(&vec->mask); - else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f971..9bf0d2a7304 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568e..ad63d850120 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return NOTIFY_BAD; break; From 8437a617708d014d6f220df201a24960e00d57b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity 
Date: Sat, 6 Jun 2009 14:52:35 -0700 Subject: [PATCH 879/900] kvm: fix kvm reboot crash when MAXSMP is used one system was found there is crash during reboot then kvm/MAXSMP Sending all processes the KILL signal... done Please stand by while rebooting the system... [ 1721.856538] md: stopping all md devices. [ 1722.852139] kvm: exiting hardware virtualization [ 1722.854601] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1722.872219] IP: [] hardware_disable+0x4c/0xb4 [ 1722.877955] PGD 0 [ 1722.880042] Oops: 0000 [#1] SMP [ 1722.892548] last sysfs file: /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/vendor [ 1722.900977] CPU 9 [ 1722.912606] Modules linked in: [ 1722.914226] Pid: 0, comm: swapper Not tainted 2.6.30-rc7-tip-01843-g2305324-dirty #299 ... [ 1722.932589] RIP: 0010:[] [] hardware_disable+0x4c/0xb4 [ 1722.942709] RSP: 0018:ffffc900010b6ed8 EFLAGS: 00010046 [ 1722.956121] RAX: 0000000000000000 RBX: ffffc9000e253140 RCX: 0000000000000009 [ 1722.972202] RDX: 000000000000b020 RSI: ffffc900010c3220 RDI: ffffffffffffd790 [ 1722.977399] RBP: ffffc900010b6f08 R08: 0000000000000000 R09: 0000000000000000 [ 1722.995149] R10: 00000000000004b8 R11: 966912b6c78fddbd R12: 0000000000000009 [ 1723.011551] R13: 000000000000b020 R14: 0000000000000009 R15: 0000000000000000 [ 1723.019898] FS: 0000000000000000(0000) GS:ffffc900010b3000(0000) knlGS:0000000000000000 [ 1723.034389] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b [ 1723.041164] CR2: 0000000000000000 CR3: 0000000001001000 CR4: 00000000000006e0 [ 1723.056192] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1723.072546] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1723.080562] Process swapper (pid: 0, threadinfo ffff88107e464000, task ffff88047e5a2550) [ 1723.096144] Stack: [ 1723.099071] 0000000000000046 ffffc9000e253168 966912b6c78fddbd ffffc9000e253140 [ 1723.115471] ffff880c7d4304d0 ffffc9000e253168 ffffc900010b6f28 
ffffffff81011022 [ 1723.132428] ffffc900010b6f48 966912b6c78fddbd ffffc900010b6f48 ffffffff8100b83b [ 1723.141973] Call Trace: [ 1723.142981] <0> [] kvm_arch_hardware_disable+0x26/0x3c [ 1723.158153] [] hardware_disable+0x3f/0x55 [ 1723.172168] [] generic_smp_call_function_interrupt+0x76/0x13c [ 1723.178836] [] smp_call_function_interrupt+0x3a/0x5e [ 1723.194689] [] call_function_interrupt+0x13/0x20 [ 1723.199750] <0> [] ? acpi_idle_enter_c1+0xd3/0xf4 [ 1723.217508] [] ? acpi_idle_enter_c1+0xcd/0xf4 [ 1723.232172] [] ? acpi_idle_enter_bm+0xe7/0x2ce [ 1723.235141] [] ? __atomic_notifier_call_chain+0x0/0xac [ 1723.253381] [] ? menu_select+0x58/0xd2 [ 1723.258179] [] ? cpuidle_idle_call+0xa4/0xf3 [ 1723.272828] [] ? cpu_idle+0xb8/0x101 [ 1723.277085] [] ? start_secondary+0x1bc/0x1d7 [ 1723.293708] Code: b0 00 00 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 48 8b 04 cd 30 ee 27 82 49 89 cc 49 89 d5 48 8b 04 10 48 8d b8 90 d7 ff ff <48> 8b 87 70 28 00 00 48 8d 98 90 d7 ff ff eb 16 e8 e9 fe ff ff [ 1723.335524] RIP [] hardware_disable+0x4c/0xb4 [ 1723.342076] RSP [ 1723.352021] CR2: 0000000000000000 [ 1723.354348] ---[ end trace e2aec53dae150aa1 ]--- it turns out that we need clear cpus_hardware_enabled in that case. 
Reported-and-tested-by: Yinghai Lu Signed-off-by: Yinghai Lu Signed-off-by: Rusty Russell --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4293528200b..4d0dd390aa5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2301,7 +2301,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, bad_pfn = page_to_pfn(bad_page); - if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; } From c1d4c41f2fdfe66dea957b76d005affba3e56b26 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 9 Jun 2009 15:17:37 +0200 Subject: [PATCH 880/900] bsg: setting rq->bio to NULL Due to commit 1cd96c242a829d52f7a5ae98f554ca9775429685 ("block: WARN in __blk_put_request() for potential bio leak"), BSG SMP requests get the false warnings: WARNING: at block/blk-core.c:1068 __blk_put_request+0x52/0xc0() This sets rq->bio to NULL to avoid that false warnings. 
Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/bsg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/bsg.c b/block/bsg.c index 206060e795d..dd81be455e0 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -315,6 +315,7 @@ out: blk_put_request(rq); if (next_rq) { blk_rq_unmap_user(next_rq->bio); + next_rq->bio = NULL; blk_put_request(next_rq); } return ERR_PTR(ret); @@ -448,6 +449,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, hdr->dout_resid = rq->data_len; hdr->din_resid = rq->next_rq->data_len; blk_rq_unmap_user(bidi_bio); + rq->next_rq->bio = NULL; blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) hdr->din_resid = rq->data_len; @@ -466,6 +468,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, blk_rq_unmap_user(bio); if (rq->cmd != rq->__cmd) kfree(rq->cmd); + rq->bio = NULL; blk_put_request(rq); return ret; From 42937e81a82b6bbc51a309c83da140b3a7ca5945 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 8 Jun 2009 15:55:09 +0200 Subject: [PATCH 881/900] x86: Detect use of extended APIC ID for AMD CPUs Booting a 32-bit kernel on Magny-Cours results in the following panic: ... Using APIC driver default ... Overriding APIC driver with bigsmp ... Getting VERSION: 80050010 Getting VERSION: 80050010 Getting ID: 10000000 Getting ID: ef000000 Getting LVT0: 700 Getting LVT1: 10000 Kernel panic - not syncing: Boot APIC ID in local APIC unexpected (16 vs 0) Pid: 1, comm: swapper Not tainted 2.6.30-rcX #2 Call Trace: [] ? panic+0x38/0xd3 [] ? native_smp_prepare_cpus+0x259/0x31f [] ? kernel_init+0x3e/0x141 [] ? kernel_init+0x0/0x141 [] ? kernel_thread_helper+0x7/0x10 The reason is that default_get_apic_id handled extension of local APIC ID field just in case of XAPIC. Thus for this AMD CPU, default_get_apic_id() returns 0 and bigsmp_get_apic_id() returns 16 which leads to the respective kernel panic. 
This patch introduces a Linux specific feature flag to indicate support for extended APIC id (8 bits instead of 4 bits width) and sets the flag on AMD CPUs if applicable. Signed-off-by: Andreas Herrmann Cc: LKML-Reference: <20090608135509.GA12431@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f837742..9b2c04910e0 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -410,7 +410,7 @@ static inline unsigned default_get_apic_id(unsigned long x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_XAPIC(ver)) + if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) return (x >> 24) & 0xFF; else return (x >> 24) & 0x0F; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bb83b1c397a..78dee4f0f7a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -94,6 +94,7 @@ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..0802e151c2c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif 
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) From 0b8c3d5ab000c22889af7f9409799a6cdc31a2b2 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 9 Jun 2009 10:40:50 -0400 Subject: [PATCH 882/900] x86: Clear TS in irq_ts_save() when in an atomic section The dynamic FPU context allocation changes caused the padlock driver to generate the below warning. Fix it by masking TS when doing padlock encryption operations in an atomic section. This solves: BUG: sleeping function called from invalid context at mm/slub.c:1602 in_atomic(): 1, irqs_disabled(): 0, pid: 82, name: cryptomgr_test Pid: 82, comm: cryptomgr_test Not tainted 2.6.29.4-168.test7.fc11.x86_64 #1 Call Trace: [] __might_sleep+0x10b/0x110 [] kmem_cache_alloc+0x37/0xf1 [] init_fpu+0x49/0x8a [] math_state_restore+0x3e/0xbc [] do_device_not_available+0x9/0xb [] device_not_available+0x1b/0x20 [] ? aes_crypt+0x66/0x74 [padlock_aes] [] ? blkcipher_walk_next+0x257/0x2e0 [] ? blkcipher_walk_first+0x18e/0x19d [] aes_encrypt+0x9d/0xe5 [padlock_aes] [] crypt+0x6b/0x114 [xts] [] ? aes_encrypt+0x0/0xe5 [padlock_aes] [] ? aes_encrypt+0x0/0xe5 [padlock_aes] [] encrypt+0x49/0x4b [xts] [] async_encrypt+0x3c/0x3e [] test_skcipher+0x1da/0x658 [] ? crypto_spawn_tfm+0x8e/0xb1 [] ? __crypto_alloc_tfm+0x11b/0x15f [] ? crypto_spawn_tfm+0x8e/0xb1 [] ? skcipher_geniv_init+0x2b/0x47 [] ? async_chainiv_init+0x5c/0x61 [] alg_test_skcipher+0x63/0x9b [] alg_test+0x12d/0x175 [] cryptomgr_test+0x38/0x54 [] ? cryptomgr_test+0x0/0x54 [] kthread+0x4d/0x78 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x78 [] ? 
child_rip+0x0/0x20 Signed-off-by: Chuck Ebbert Cc: Suresh Siddha LKML-Reference: <20090609104050.50158cfe@dhcp-100-2-144.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 71c9e518398..4aab52f8e41 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -305,18 +305,18 @@ static inline void kernel_fpu_end(void) /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context aswell. To prevent these kernel instructions - * in interrupt context interact wrongly with other user/kernel fpu usage, we + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we * should use them only in the context of irq_ts_save/restore() */ static inline int irq_ts_save(void) { /* - * If we are in process context, we are ok to take a spurious DNA fault. - * Otherwise, doing clts() in process context require pre-emption to - * be disabled or some heavy lifting like kernel_fpu_begin() + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() */ - if (!in_interrupt()) + if (!in_atomic()) return 0; if (read_cr0() & X86_CR0_TS) { From f57a8a1911342265e7acdc190333c4e9235a6632 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Jun 2009 14:11:30 -0400 Subject: [PATCH 883/900] ring-buffer: fix ret in rb_add_time_stamp The update of ret got mistakenly added to the if statement of rb_try_to_discard. The variable ret should be 1 on commit and zero otherwise. 
[ Impact: fix compiler warning and real bug ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 22878b0d370..2e642b2b725 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1433,8 +1433,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* Darn, this is just wasted space */ event->time_delta = 0; event->array[0] = 0; - ret = 0; } + ret = 0; } *delta = 0; From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: [PATCH 884/900] tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the device from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace does the conversion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array().
I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 
[kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- block/blk-core.c | 16 +- block/elevator.c | 8 +- drivers/md/dm.c | 5 +- fs/bio.c | 3 +- include/linux/blktrace_api.h | 13 + include/trace/block.h | 76 ------ include/trace/events/block.h | 483 +++++++++++++++++++++++++++++++++++ kernel/trace/Makefile | 5 +- kernel/trace/blktrace.c | 78 +++++- mm/bounce.c | 5 +- 10 files changed, 588 insertions(+), 104 deletions(-) delete mode 100644 include/trace/block.h create mode 100644 include/trace/events/block.h diff --git a/block/blk-core.c b/block/blk-core.c index 1306de9cce0..9475bf99b89 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,22 +28,14 @@ #include #include #include -#include + +#define CREATE_TRACE_POINTS +#include #include "blk.h" -DEFINE_TRACE(block_plug); -DEFINE_TRACE(block_unplug_io); -DEFINE_TRACE(block_unplug_timer); -DEFINE_TRACE(block_getrq); -DEFINE_TRACE(block_sleeprq); -DEFINE_TRACE(block_rq_requeue); -DEFINE_TRACE(block_bio_backmerge); -DEFINE_TRACE(block_bio_frontmerge); -DEFINE_TRACE(block_bio_queue); -DEFINE_TRACE(block_rq_complete); -DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); static int __make_request(struct request_queue *q, struct bio *bio); diff --git a/block/elevator.c b/block/elevator.c index 7073a907257..e220f0c543e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,17 +33,16 @@ #include #include #include -#include #include #include +#include + #include "blk.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); 
-DEFINE_TRACE(block_rq_abort); - /* * Merge hash stuff. */ @@ -55,9 +54,6 @@ static const int elv_hash_shift = 6; #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -DEFINE_TRACE(block_rq_insert); -DEFINE_TRACE(block_rq_issue); - /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e2ee4a79ea2..3fd8b1e6548 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,7 +20,8 @@ #include #include #include -#include + +#include #define DM_MSG_PREFIX "core" @@ -53,8 +54,6 @@ struct dm_target_io { union map_info info; }; -DEFINE_TRACE(block_bio_complete); - /* * For request-based dm. * One of these is allocated per request. diff --git a/fs/bio.c b/fs/bio.c index 98711647ece..740699c4f90 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -26,10 +26,9 @@ #include #include #include -#include #include /* for struct sg_iovec */ -DEFINE_TRACE(block_split); +#include /* * Test patch to inline a certain number of bi_io_vec's inside the bio diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 82b4636030e..c7ec31dd04c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev) #endif /* CONFIG_BLK_DEV_IO_TRACE */ +#ifdef CONFIG_EVENT_TRACING + +static inline int blk_cmd_buf_len(struct request *rq) +{ + return blk_pc_request(rq) ? 
rq->cmd_len * 3 : 1; +} + +extern void blk_dump_cmd(char *buf, struct request *rq); +extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); +extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq); + +#endif /* CONFIG_EVENT_TRACING */ + #endif /* __KERNEL__ */ #endif diff --git a/include/trace/block.h b/include/trace/block.h deleted file mode 100644 index 5b12efa096b..00000000000 --- a/include/trace/block.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef _TRACE_BLOCK_H -#define _TRACE_BLOCK_H - -#include -#include - -DECLARE_TRACE(block_rq_abort, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_complete, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_bio_bounce, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_complete, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_backmerge, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_frontmerge, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_queue, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_getrq, - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - TP_ARGS(q, bio, rw)); - -DECLARE_TRACE(block_sleeprq, - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - TP_ARGS(q, bio, rw)); - -DECLARE_TRACE(block_plug, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_unplug_timer, - TP_PROTO(struct 
request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_unplug_io, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_split, - TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TP_ARGS(q, bio, pdu)); - -DECLARE_TRACE(block_remap, - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from), - TP_ARGS(q, bio, dev, from)); - -#endif diff --git a/include/trace/events/block.h b/include/trace/events/block.h new file mode 100644 index 00000000000..a99d1e565bb --- /dev/null +++ b/include/trace/events/block.h @@ -0,0 +1,483 @@ +#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BLOCK_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM block + +TRACE_EVENT(block_rq_abort, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? 
+ 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); + +TRACE_EVENT(block_rq_insert, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( unsigned int, bytes ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __entry->bytes, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_rq_issue, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( unsigned int, bytes ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->bytes = blk_pc_request(rq) ? 
rq->data_len : 0; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __entry->bytes, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_rq_requeue, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); + +TRACE_EVENT(block_rq_complete, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? 
+ 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); +TRACE_EVENT(block_bio_bounce, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_complete, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned, nr_sector ) + __field( int, error ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + ), + + TP_printk("%d,%d %s %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->error) +); + +TRACE_EVENT(block_bio_backmerge, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = 
bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_frontmerge, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_queue, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_getrq, + + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + + TP_ARGS(q, bio, rw), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( 
unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->sector = bio ? bio->bi_sector : 0; + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + blk_fill_rwbs(__entry->rwbs, + bio ? bio->bi_rw : 0, __entry->nr_sector); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_sleeprq, + + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + + TP_ARGS(q, bio, rw), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->sector = bio ? bio->bi_sector : 0; + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + blk_fill_rwbs(__entry->rwbs, + bio ? 
bio->bi_rw : 0, __entry->nr_sector); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_plug, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s]", __entry->comm) +); + +TRACE_EVENT(block_unplug_timer, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __field( int, nr_rq ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) +); + +TRACE_EVENT(block_unplug_io, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __field( int, nr_rq ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) +); + +TRACE_EVENT(block_split, + + TP_PROTO(struct request_queue *q, struct bio *bio, + unsigned int new_sector), + + TP_ARGS(q, bio, new_sector), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( sector_t, new_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->new_sector = new_sector; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu / %llu [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->new_sector, 
__entry->comm) +); + +TRACE_EVENT(block_remap, + + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from), + + TP_ARGS(q, bio, dev, from), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( dev_t, old_dev ) + __field( sector_t, old_sector ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + __entry->old_dev = dev; + __entry->old_sector = from; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + ), + + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, + MAJOR(__entry->old_dev), MINOR(__entry->old_dev), + __entry->old_sector) +); + +#endif /* _TRACE_BLOCK_H */ + +/* This part must be outside protection */ +#include + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 06b85850fab..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e3abf55bc8e..7bd6a9893c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,10 +23,14 @@ #include #include #include -#include #include + +#include + #include "trace_output.h" +#ifdef CONFIG_BLK_DEV_IO_TRACE + static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array 
*blk_tr; @@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (!blk_pc_request(rq)) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & 1 << BIO_RW_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & 1 << BIO_RW_AHEAD) + rwbs[i++] = 'A'; + if (rw & 1 << BIO_RW_BARRIER) + rwbs[i++] = 'B'; + if (rw & 1 << BIO_RW_SYNCIO) + rwbs[i++] = 'S'; + if (rw & 1 << BIO_RW_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ + int rw = rq->cmd_flags & 0x03; + int bytes; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) + bytes = rq->data_len; + else + bytes = rq->hard_nr_sectors << 9; + + blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a..65f5e17e411 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,16 +14,15 @@ #include #include #include -#include #include +#include + #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static mempool_t *page_pool, *isa_page_pool; -DEFINE_TRACE(block_bio_bounce); - #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { From 6556d1df88fe68f9836beeb43342a336691cb67c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Jun 2009 14:04:26 -0400 Subject: [PATCH 885/900] tracing: fix the block trace 
points print size The sector field is either u64 or unsigned long depending on the arch. This patch casts the sector to unsigned long long to prevent the printf warnings. [ Impact: remove compile warnings ] Signed-off-by: Steven Rostedt --- include/trace/events/block.h | 45 ++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index a99d1e565bb..53effd496a5 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -37,7 +37,8 @@ TRACE_EVENT(block_rq_abort, TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->errors) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); TRACE_EVENT(block_rq_insert, @@ -71,7 +72,8 @@ TRACE_EVENT(block_rq_insert, TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_rq_issue, @@ -105,7 +107,8 @@ TRACE_EVENT(block_rq_issue, TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_rq_requeue, @@ -137,7 +140,8 @@ TRACE_EVENT(block_rq_requeue, TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->errors) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); TRACE_EVENT(block_rq_complete, @@ -169,7 +173,8 @@ TRACE_EVENT(block_rq_complete, TP_printk("%d,%d %s (%s) %llu + %u [%d]", 
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->errors) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); TRACE_EVENT(block_bio_bounce, @@ -195,7 +200,8 @@ TRACE_EVENT(block_bio_bounce, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_bio_complete, @@ -221,7 +227,8 @@ TRACE_EVENT(block_bio_complete, TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->error) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->error) ); TRACE_EVENT(block_bio_backmerge, @@ -248,7 +255,8 @@ TRACE_EVENT(block_bio_backmerge, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_bio_frontmerge, @@ -275,7 +283,8 @@ TRACE_EVENT(block_bio_frontmerge, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_bio_queue, @@ -302,7 +311,8 @@ TRACE_EVENT(block_bio_queue, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_getrq, @@ -330,7 +340,8 @@ TRACE_EVENT(block_getrq, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long 
long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_sleeprq, @@ -358,7 +369,8 @@ TRACE_EVENT(block_sleeprq, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_plug, @@ -441,7 +453,9 @@ TRACE_EVENT(block_split, TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->new_sector, __entry->comm) + (unsigned long long)__entry->sector, + (unsigned long long)__entry->new_sector, + __entry->comm) ); TRACE_EVENT(block_remap, @@ -471,9 +485,10 @@ TRACE_EVENT(block_remap, TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, + (unsigned long long)__entry->sector, + __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), - __entry->old_sector) + (unsigned long long)__entry->old_sector) ); #endif /* _TRACE_BLOCK_H */ From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Jun 2009 19:09:45 -0400 Subject: [PATCH 886/900] tracing: add trace_seq_vprint interface The code to update the print formats for events requires a vprintf format in the trace_seq. This patch adds that interface. Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 ++ kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index ba9627f00d3..c68bccba207 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -27,6 +27,8 @@ trace_seq_init(struct trace_seq *s) #ifdef CONFIG_TRACING extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); +extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); extern int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 425725c1622..c05aff465dc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; From 110bf2b764eb6026b868d84499263cb24b1bcc8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Jun 2009 17:29:07 -0400 Subject: [PATCH 887/900] tracing: add protection around module events unload When reading the trace buffer, there is a race: when a module is unloaded, it removes events that are still referenced in the buffers.
This patch adds the protection around the unloading of the events from modules and the reading of the trace buffers. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- kernel/trace/trace_output.c | 15 ++++++++++++--- kernel/trace/trace_output.h | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6c81f9c2142..aa08be69a1b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod) struct ftrace_event_call *call, *p; bool found = false; + down_write(&trace_event_mutex); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; ftrace_event_enable_disable(call, 0); if (call->event) - unregister_ftrace_event(call->event); + __unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); trace_destroy_fields(call); @@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod) */ if (found) tracing_reset_current_online_cpus(); + up_write(&trace_event_mutex); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c05aff465dc..7938f3ae93e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_mutex); DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); @@ -702,6 +702,16 @@ int register_ftrace_event(struct trace_event *event) } EXPORT_SYMBOL_GPL(register_ftrace_event); +/* + * Used by module code with the trace_event_mutex held for write. 
+ */ +int __unregister_ftrace_event(struct trace_event *event) +{ + hlist_del(&event->node); + list_del(&event->list); + return 0; +} + /** * unregister_ftrace_event - remove a no longer used event * @event: the event to remove @@ -709,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); int unregister_ftrace_event(struct trace_event *event) { down_write(&trace_event_mutex); - hlist_del(&event->node); - list_del(&event->list); + __unregister_ftrace_event(event); up_write(&trace_event_mutex); return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ac240e76eb0..d38bec4a9c3 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_mutex; + #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) From 586c7e6a280580fd94b662bf486f9bb31098d14b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 9 Jun 2009 16:26:23 -0700 Subject: [PATCH 888/900] shm: fix unused warnings on nommu The massive nommu update (8feae131) resulted in these warnings: ipc/shm.c: In function `sys_shmdt': ipc/shm.c:974: warning: unused variable `size' ipc/shm.c:972: warning: unused variable `next' Signed-off-by: Mike Frysinger Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index faa46da99eb..42597160048 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -969,10 +969,13 @@ SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *next; + struct vm_area_struct *vma; unsigned long addr = 
(unsigned long)shmaddr; - loff_t size = 0; int retval = -EINVAL; +#ifdef CONFIG_MMU + loff_t size = 0; + struct vm_area_struct *next; +#endif if (addr & ~PAGE_MASK) return retval; From 463aea1a1c49f1a7d4b50656dfd6c8bb33358b1b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 9 Jun 2009 16:26:24 -0700 Subject: [PATCH 889/900] autofs4: remove hashed check in validate_wait() The recent ->lookup() deadlock correction required the directory inode mutex to be dropped while waiting for expire completion. We were concerned about side effects from this change and one has been identified. I saw several error messages. They cause autofs to become quite confused and don't really point to the actual problem. Things like: handle_packet_missing_direct:1376: can't find map entry for (43,1827932) which is usually totally fatal (although in this case it wouldn't be except that I treat it as such because it normally is). do_mount_direct: direct trigger not valid or already mounted /test/nested/g3c/s1/ss1 which is recoverable, however if this problem is at play it can cause autofs to become quite confused as to the dependencies in the mount tree because mount triggers end up mounted multiple times. It's hard to accurately check for this over mounting case and automount shouldn't need to if the kernel module is doing its job. There was one other message, similar in consequence of this last one but I can't locate a log example just now. When checking if a mount has already completed prior to adding a new mount request to the wait queue we check if the dentry is hashed and, if so, if it is a mount point. But, if a mount successfully completed while we slept on the wait queue mutex the dentry must exist for the mount to have completed so the test is not really needed.
Mounts can also be done on top of a global root dentry, so for the above case, where a mount request completes and the wait queue entry has already been removed, the hashed test returning false can cause an incorrect callback to the daemon. Also, d_mountpoint() is not sufficient to check if a mount has completed for the multi-mount case when we don't have a real mount at the base of the tree. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index eeb24684590..2341375386f 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { /* - * If the dentry isn't hashed just go ahead and try the - * mount again with a new wait (not much else we can do). - */ - if (!d_unhashed(dentry)) { - /* - * But if the dentry is hashed, that means that we - * got here through the revalidate path. Thus, we - * need to check if the dentry has been mounted - * while we waited on the wq_mutex. If it has, - * simply return success. - */ - if (d_mountpoint(dentry)) - return 0; - } + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at its base) we can + * continue on and create a new request. + */ + if (have_submounts(dentry)) + return 0; } return 1; From a61d90d75d0f9e86432c45b496b4b0fbf0fd03dc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Jun 2009 16:26:26 -0700 Subject: [PATCH 890/900] jbd: fix race in buffer processing in commit code In commit code, we scan buffers attached to a transaction.
During this scan, we sometimes have to drop j_list_lock and then we recheck whether the journal buffer head didn't get freed by journal_try_to_free_buffers(). But checking for buffer_jbd(bh) isn't enough because a new journal head could get attached to our buffer head. So add a check whether the journal head remained the same and whether it's still at the same transaction and list. This is a nasty bug and can cause problems like memory corruption (use after free) or trigger various assertions in JBD code (observed). Signed-off-by: Jan Kara Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 06560c520f4..618e21c0b7a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -241,7 +241,7 @@ write_out_data: spin_lock(&journal->j_list_lock); } /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) + if (!buffer_jbd(bh) || bh2jh(bh) != jh || jh->b_transaction != commit_transaction || jh->b_jlist != BJ_SyncData) { jbd_unlock_bh_state(bh); @@ -478,7 +478,9 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_list_lock); continue; } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) { __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); From 96050dfb25966612008dcea7d342e91fa01e993c Mon Sep 17 00:00:00 2001 From: Peter Botha Date: Tue, 9 Jun 2009 17:16:32 -0700 Subject: [PATCH 891/900] char: mxser, fix ISA board lookup There's a bug in the mxser kernel module that still appears in the 2.6.29.4 kernel. mxser_get_ISA_conf takes a ioaddress as its first argument, by passing the not of the ioaddr, you're effectively passing 0 which means it won't be able to talk to an ISA card. I have tested this, and removing the ! fixes the problem. 
Cc: "Peter Botha" Signed-off-by: Jiri Slaby Acked-by: Alan Cox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mxser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index a420e8d437d..13f8871e5b2 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -2711,7 +2711,7 @@ static int __init mxser_module_init(void) continue; brd = &mxser_boards[m]; - retval = mxser_get_ISA_conf(!ioaddr[b], brd); + retval = mxser_get_ISA_conf(ioaddr[b], brd); if (retval <= 0) { brd->info = NULL; continue; From 07a2039b8eb0af4ff464efd3dfd95de5c02648c6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Jun 2009 20:05:27 -0700 Subject: [PATCH 892/900] Linux 2.6.30 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 10651549208..03373bb703c 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 30 -EXTRAVERSION = -rc8 +EXTRAVERSION = NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* From 2b83868723d090078ac0e2120e06a1cc94dbaef0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Jun 2009 20:40:25 -0700 Subject: [PATCH 893/900] Make /dev/zero reads interruptible by signals This helps with bad latencies for large reads from /dev/zero, but might conceivably break some application that "knows" that a read of /dev/zero cannot return early. So do this early in the merge window to give us maximal test coverage, even if the patch is totally trivial. Obviously, no well-behaved application should ever depend on the read being uninterruptible, but hey, bugs happen. 
Signed-off-by: Linus Torvalds --- drivers/char/mem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 65e12bca657..f96d0bef855 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf, written += chunk - unwritten; if (unwritten) break; - /* Consider changing this to just 'signal_pending()' with lots of testing */ - if (fatal_signal_pending(current)) - return written ? written : -EINTR; + if (signal_pending(current)) + return written ? written : -ERESTARTSYS; buf += chunk; count -= chunk; cond_resched(); From 04dce7d9d429ea5ea04e9432d1726c930f4d67da Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 10 Jun 2009 16:59:46 +1000 Subject: [PATCH 894/900] spinlock: Add missing __raw_spin_lock_flags() stub for UP This was only defined with CONFIG_DEBUG_SPINLOCK set, but some obscure arch/powerpc code wants it always. Signed-off-by: Benjamin Herrenschmidt Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/spinlock_up.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 938234c4a99..d4841ed8215 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) #define __raw_spin_is_locked(lock) ((void)(lock), 0) /* for sched.c and kernel_lock.c: */ # define __raw_spin_lock(lock) do { (void)(lock); } while (0) +# define __raw_spin_lock_flags(lock, flags) do { (void)(lock); } while (0) # define __raw_spin_unlock(lock) do { (void)(lock); } while (0) # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ From f1db457ce6e2f63cb01022f58c0c023838958bd1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 10 Jun 2009 10:06:24 +0800 Subject: [PATCH 895/900] tracing/events: convert block trace points to 
TRACE_EVENT(), fix !CONFIG_BLOCK Fix building failures when CONFIG_BLOCK == n. Signed-off-by: Li Zefan LKML-Reference: <4A2F1520.8020003@cn.fujitsu.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/blktrace_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index c7ec31dd04c..7e4350ece0f 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -218,7 +218,7 @@ static inline int blk_trace_init_sysfs(struct device *dev) #endif /* CONFIG_BLK_DEV_IO_TRACE */ -#ifdef CONFIG_EVENT_TRACING +#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK) static inline int blk_cmd_buf_len(struct request *rq) { @@ -229,7 +229,7 @@ extern void blk_dump_cmd(char *buf, struct request *rq); extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq); -#endif /* CONFIG_EVENT_TRACING */ +#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ #endif /* __KERNEL__ */ #endif From bc5c6c043d8381676339fb3da59cc4cc5921d368 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 10 Jun 2009 04:48:41 -0400 Subject: [PATCH 896/900] ftrace/documentation: fix typo in function grapher name The function graph tracer is called just "function_graph" (no trailing "_tracer" needed). Signed-off-by: Mike Frysinger LKML-Reference: <1244623722-6325-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e69381..5ad2ded8aa6 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured. Function call tracer to trace all kernel functions. 
- "function_graph_tracer" + "function_graph" Similar to the function tracer except that the function tracer probes the functions on their entry From 6ff9a64d2aaa6eae396adc95e9c91c0cbfa6dbe4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Jun 2009 14:28:34 -0400 Subject: [PATCH 897/900] tracing: do not translate event helper macros in print format By moving the macro that creates the print format code above the defining of the event macro helpers (__get_str, __print_symbolic, and __get_dynamic_array), we get a little cleaner print format. Instead of: (char *)((void *)REC + REC->__data_loc_name) we get: __get_str(name) Instead of: ({ static const struct trace_print_flags symbols[] = { { HI_SOFTIRQ, "HI" }, { we get: __print_symbolic(REC->vec, { HI_SOFTIRQ, "HI" }, { Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 158 +++++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 77 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 40ede4db4d8..1867553c61e 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -80,6 +80,87 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) 
args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), \ + __data_loc_##item), \ + (unsigned int)sizeof(field.__data_loc_##item)); \ + if (!ret) \ + return 0; + +#undef __string +#define __string(item, src) __dynamic_array(char, item, -1) + +#undef __entry +#define __entry REC + +#undef __print_symbolic +#undef __get_dynamic_array +#undef __get_str + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field __attribute__((unused)); \ + int ret = 0; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 3 of the trace events. * @@ -179,83 +260,6 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -/* - * Setup the showing format of trace point. 
- * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), \ - __data_loc_##item), \ - (unsigned int)sizeof(field.__data_loc_##item)); \ - if (!ret) \ - return 0; - -#undef __string -#define __string(item, src) __dynamic_array(char, item, -1) - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) 
args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ From 0de51088e6a82bc8413d3ca9e28bbca2788b5b53 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Jun 2009 18:27:54 +0800 Subject: [PATCH 898/900] CPUFREQ: Enable acpi-cpufreq driver for VIA/Centaur CPUs The VIA/Centaur C7, C7-M and Nano CPU's all support ACPI based cpu p-states using a MSR interface. The Linux driver just never made use of it, since in addition to the check for the EST flag it also checked if the vendor is Intel. Signed-off-by: Harald Welte [ Removed the vendor checks entirely - Linus ] Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 752e8c6b2c7..ae9b503220c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return 0; - - return 1; + return cpu_has(cpu, X86_FEATURE_EST); } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) From 0fea615e526b4b7eff0363ee02d5753e5f924089 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Jun 2009 18:29:36 +0800 Subject: [PATCH 899/900] CPUFREQ: Mark e_powersaver driver as EXPERIMENTAL and DANGEROUS The e_powersaver driver for VIA's C7 CPU's needs to be marked as DANGEROUS as it configures the CPU 
to power states that are out of specification. According to Centaur, all systems with C7 and Nano CPU's support the ACPI p-state method. Thus, the acpi-cpufreq driver should be used instead. Signed-off-by: Harald Welte Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/Kconfig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c83987547..f138c6c389b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver" + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" select CPU_FREQ_TABLE - depends on X86_32 + depends on X86_32 && EXPERIMENTAL help - This adds the CPUFreq driver for VIA C7 processors. + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. If in doubt, say N. From 511b01bdf64ad8a38414096eab283c7784aebfc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Jun 2009 00:32:00 +0200 Subject: [PATCH 900/900] Revert "x86, bts: reenable ptrace branch trace support" This reverts commit 7e0bfad24d85de7cf2202a7b0ce51de11a077b21. A late objection to the ABI has arrived: http://lkml.org/lkml/2009/6/10/253 Keep the ABI disabled out of caution, to not create premature user-space expectations. While the hw-branch-tracing variant uses and tests the BTS code. 
Cc: Peter Zijlstra Cc: Markus Metzger Cc: Oleg Nesterov Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 924e156a85a..8130334329c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -506,6 +506,7 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR + depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store.