From 6ce51c431019310ca03371355a4366c4649fa349 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 1 Apr 2009 18:06:35 +0100 Subject: [PATCH 0001/2255] genirq: do not execute DEBUG_SHIRQ when irq setup failed When requesting an IRQ, the DEBUG_SHIRQ code executes a fake IRQ just to make sure the driver is ready to receive an IRQ immediately. The problem was that this fake IRQ was being executed even if interrupt line failed to be allocated by __setup_irq. Signed-off-by: Luis Henriques LKML-Reference: <20090401170635.GA4392@hades.domain.com> Signed-off-by: Thomas Gleixner [ fixed bug pointed out by a warning reported by Stephen Rothwell ] Cc: Stephen Rothwell Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1516ab77355..8c68d5b95d4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -768,7 +768,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, kfree(action); #ifdef CONFIG_DEBUG_SHIRQ - if (irqflags & IRQF_SHARED) { + if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... From 39fe05e58c5e448601ce46e6b03900d5bf31c4b0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Aug 2009 11:16:12 +0800 Subject: [PATCH 0002/2255] x86, hpet: Disable per-cpu hpet timer if ARAT is supported If CPU support always running local APIC timer, per-cpu hpet timer could be disabled, which is useless and wasteful in such case. Let's leave the timers to others. The effect is that we reserve less timers. Signed-off-by: Shaohua Li Cc: venkatesh.pallipadi@intel.com LKML-Reference: <20090812031612.GA10062@sli10-desk.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dedc2bddf7a..5969e1078fc 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -584,6 +584,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (boot_cpu_has(X86_FEATURE_ARAT)) + return; id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -872,10 +874,8 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); - hpet_msi_capability_lookup(2); return 1; } - hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -908,9 +908,17 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (boot_cpu_has(X86_FEATURE_ARAT)) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } From 5946fa3d5cdeb846a647a1900026af9f8b08c8b5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 19 Aug 2009 08:44:24 +0100 Subject: [PATCH 0003/2255] x86, hpet: Simplify the HPET code On 64-bits, using unsigned long when unsigned int suffices needlessly creates larger code (due to the need for REX prefixes), and most of the logic in hpet.c really doesn't need 64-bit operations. At once this avoids the need for a couple of type casts. Signed-off-by: Jan Beulich Cc: Shaohua Li Cc: Venkatesh Pallipadi LKML-Reference: <4A8BC9780200007800010832@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hpet.h | 2 +- arch/x86/kernel/hpet.c | 45 +++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 1c22cb05ad6..65847c578b7 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -69,7 +69,7 @@ extern int hpet_force_user; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); -extern unsigned long hpet_readl(unsigned long a); +extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); extern void hpet_msi_unmask(unsigned int irq); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 5969e1078fc..ba575f0f2e3 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -47,12 +47,12 @@ struct hpet_dev { char name[10]; }; -unsigned long hpet_readl(unsigned long a) +inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); } -static inline void hpet_writel(unsigned long d, unsigned long a) +static inline void hpet_writel(unsigned int d, unsigned int a) { writel(d, hpet_virt_address + a); } @@ -167,7 +167,7 @@ do { \ static void hpet_reserve_msi_timers(struct hpet_data *hd); -static void hpet_reserve_platform_timers(unsigned long id) +static void hpet_reserve_platform_timers(unsigned int id) { struct hpet __iomem *hpet = hpet_virt_address; struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; @@ -205,7 +205,7 @@ static void hpet_reserve_platform_timers(unsigned long id) } #else -static void hpet_reserve_platform_timers(unsigned long id) { } +static void hpet_reserve_platform_timers(unsigned int id) { } #endif /* @@ -246,7 +246,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -271,7 +271,7 @@ static void hpet_resume_counter(void) static void hpet_enable_legacy_int(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); @@ -314,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg, cmp, now; + unsigned int cfg, cmp, now; uint64_t delta; switch (mode) { @@ -323,7 +323,7 @@ static void hpet_set_mode(enum clock_event_mode mode, delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned long) delta; + cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; @@ -339,7 +339,7 @@ static void hpet_set_mode(enum clock_event_mode mode, * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); hpet_start_counter(); hpet_print_config(); break; @@ -387,9 +387,9 @@ static int hpet_next_event(unsigned long delta, * what we wrote hit the chip before we compare it to the * counter. */ - WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt); - return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -415,7 +415,7 @@ static struct hpet_dev *hpet_devs; void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); - unsigned long cfg; + unsigned int cfg; /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -425,7 +425,7 @@ void hpet_msi_unmask(unsigned int irq) void hpet_msi_mask(unsigned int irq) { - unsigned long cfg; + unsigned int cfg; struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ @@ -600,7 +600,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); /* Only consider HPET timer with MSI support */ if (!(cfg & HPET_TN_FSB_CAP)) @@ -815,7 +815,7 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { - unsigned long id; + unsigned int id; int i; if (!is_hpet_capable()) @@ -933,7 +933,7 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable()) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; @@ -973,8 +973,8 @@ static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static u32 hpet_t1_cmp; -static unsigned long hpet_default_delta; -static unsigned long hpet_pie_delta; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; @@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); */ int hpet_rtc_timer_init(void) { - unsigned long cfg, cnt, delta, flags; + unsigned int cfg, cnt, delta; + unsigned long flags; if (!is_hpet_enabled()) return 0; @@ -1035,7 +1036,7 @@ int hpet_rtc_timer_init(void) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; - hpet_default_delta = (unsigned long) clc; + hpet_default_delta = clc; } if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) @@ -1121,7 +1122,7 @@ int hpet_set_periodic_freq(unsigned long freq) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; - hpet_pie_delta = (unsigned long) clc; + hpet_pie_delta = clc; } return 1; } @@ -1135,7 +1136,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned long cfg, delta; + unsigned int cfg, delta; int lost_ints = -1; if (unlikely(!hpet_rtc_flags)) { From 20f3097bfe5fb5ced0b14f9ea2620c4039bf1dde Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 4 Aug 2009 12:07:08 -0700 Subject: [PATCH 0004/2255] intr-remap: generic support for remapping HPET MSIs Generic support for remapping HPET MSI's by parsing the HPET timer block device scope in the ACPI DRHD tables. This is needed for platforms supporting interrupt-remapping and MSI capable HPET timer block. Signed-off-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes Cc: Venkatesh Pallipadi Cc: Jay Fenlason LKML-Reference: <20090804190729.477649000@intel.com> Signed-off-by: Thomas Gleixner --- drivers/pci/intr_remapping.c | 89 ++++++++++++++++++++++++++++++++++-- drivers/pci/intr_remapping.h | 7 +++ include/linux/dmar.h | 10 ++++ include/linux/hpet.h | 2 + 4 files changed, 104 insertions(+), 4 deletions(-) diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 4f5b8712931..2cc3f70ad42 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -14,7 +15,8 @@ #include "pci.h" static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; -static int ir_ioapic_num; +static struct hpet_scope ir_hpet[MAX_HPET_TBS]; +static int ir_ioapic_num, ir_hpet_num; int intr_remapping_enabled; static int disable_intremap; @@ -351,6 +353,16 @@ int flush_irte(int irq) return rc; } +struct intel_iommu *map_hpet_to_ir(u8 hpet_id) +{ + int i; + + for (i = 0; i < MAX_HPET_TBS; i++) + if (ir_hpet[i].id == hpet_id) + return ir_hpet[i].iommu; + return NULL; +} + struct intel_iommu *map_ioapic_to_ir(int apic) { int i; @@ -478,6 +490,36 @@ int set_ioapic_sid(struct irte *irte, int apic) return 0; } +int set_hpet_sid(struct irte *irte, u8 id) +{ + int i; + u16 sid = 0; + + if (!irte) + return -1; + + for (i = 0; i < MAX_HPET_TBS; i++) { + if (ir_hpet[i].id == id) { + sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + break; + } + } + + if (sid == 0) { + pr_warning("Failed to set source-id of HPET block (%d)\n", id); + return -1; + } + + /* + * Should really use SQ_ALL_16. Some platforms are broken. + * While we figure out the right quirks for these broken platforms, use + * SQ_13_IGNORE_3 for now. + */ + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid); + + return 0; +} + int set_msi_sid(struct irte *irte, struct pci_dev *dev) { struct pci_dev *bridge; @@ -711,6 +753,34 @@ error: return -1; } +static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope, + struct intel_iommu *iommu) +{ + struct acpi_dmar_pci_path *path; + u8 bus; + int count; + + bus = scope->bus; + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (--count > 0) { + /* + * Access PCI directly due to the PCI + * subsystem isn't initialized yet. + */ + bus = read_pci_config_byte(bus, path->dev, path->fn, + PCI_SECONDARY_BUS); + path++; + } + ir_hpet[ir_hpet_num].bus = bus; + ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn); + ir_hpet[ir_hpet_num].iommu = iommu; + ir_hpet[ir_hpet_num].id = scope->enumeration_id; + ir_hpet_num++; +} + static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope, struct intel_iommu *iommu) { @@ -740,8 +810,8 @@ static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope, ir_ioapic_num++; } -static int ir_parse_ioapic_scope(struct acpi_dmar_header *header, - struct intel_iommu *iommu) +static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header, + struct intel_iommu *iommu) { struct acpi_dmar_hardware_unit *drhd; struct acpi_dmar_device_scope *scope; @@ -765,6 +835,17 @@ static int ir_parse_ioapic_scope(struct acpi_dmar_header *header, drhd->address); ir_parse_one_ioapic_scope(scope, iommu); + } else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) { + if (ir_hpet_num == MAX_HPET_TBS) { + printk(KERN_WARNING "Exceeded Max HPET blocks\n"); + return -1; + } + + printk(KERN_INFO "HPET id %d under DRHD base" + " 0x%Lx\n", scope->enumeration_id, + drhd->address); + + ir_parse_one_hpet_scope(scope, iommu); } start += scope->length; } @@ -785,7 +866,7 @@ int __init parse_ioapics_under_ir(void) struct intel_iommu *iommu = drhd->iommu; if (ecap_ir_support(iommu->ecap)) { - if (ir_parse_ioapic_scope(drhd->hdr, iommu)) + if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu)) return -1; ir_supported = 1; diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h index 63a263c1841..5662fecfee6 100644 --- a/drivers/pci/intr_remapping.h +++ b/drivers/pci/intr_remapping.h @@ -7,4 +7,11 @@ struct ioapic_scope { unsigned int devfn; /* PCI devfn number */ }; +struct hpet_scope { + struct intel_iommu *iommu; + u8 id; + unsigned int bus; + unsigned int devfn; +}; + #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 4a2b162c256..69a6fbac092 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -126,7 +126,9 @@ extern int free_irte(int irq); extern int irq_remapped(int irq); extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); extern struct intel_iommu *map_ioapic_to_ir(int apic); +extern struct intel_iommu *map_hpet_to_ir(u8 id); extern int set_ioapic_sid(struct irte *irte, int apic); +extern int set_hpet_sid(struct irte *irte, u8 id); extern int set_msi_sid(struct irte *irte, struct pci_dev *dev); #else static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) @@ -158,10 +160,18 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic) { return NULL; } +static inline struct intel_iommu *map_hpet_to_ir(unsigned int hpet_id) +{ + return NULL; +} static inline int set_ioapic_sid(struct irte *irte, int apic) { return 0; } +static inline int set_hpet_sid(struct irte *irte, u8 id) +{ + return -1; +} static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev) { return 0; diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 79f63a27bce..219ca4f6bea 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -126,4 +126,6 @@ struct hpet_info { #define HPET_DPI _IO('h', 0x05) /* disable periodic */ #define HPET_IRQFREQ _IOW('h', 0x6, unsigned long) /* IRQFREQ usec */ +#define MAX_HPET_TBS 8 /* maximum hpet timer blocks */ + #endif /* !__HPET__ */ From c8bc6f3c806f1fcbfdbf0b1ff6c52dba59192d3b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 4 Aug 2009 12:07:09 -0700 Subject: [PATCH 0005/2255] x86: arch specific support for remapping HPET MSIs x86 arch support for remapping HPET MSI's by associating the HPET timer block with the interrupt-remapping HW unit and setting up appropriate irq_chip Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: David Woodhouse Cc: Jesse Barnes Cc: Jay Fenlason LKML-Reference: <20090804190729.630510000@intel.com> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hpet.h | 5 ++-- arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/apic/io_apic.c | 49 ++++++++++++++++++++++++++++------ arch/x86/kernel/hpet.c | 3 ++- 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 65847c578b7..5d89fd2a369 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -65,6 +65,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern u8 hpet_blockid; extern int hpet_force_user; extern int is_hpet_enabled(void); extern int hpet_enable(void); @@ -78,9 +79,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI -extern int arch_setup_hpet_msi(unsigned int irq); +extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); #else -static inline int arch_setup_hpet_msi(unsigned int irq) +static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { return -EINVAL; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285..eae642b0f34 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) } hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; /* * Some broken BIOSes advertise HPET at 0x0. We really do not diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc8..d9c6f14d3b3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3254,7 +3254,8 @@ void destroy_irq(unsigned int irq) * MSI message composition */ #ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3288,7 +3289,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3453,7 +3457,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3586,7 +3590,7 @@ int arch_setup_dmar_msi(unsigned int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3626,6 +3630,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif +#endif + .retrigger = ioapic_retrigger_irq, +}; + static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, @@ -3637,20 +3654,36 @@ static struct irq_chip hpet_msi_type = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq) +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { int ret; struct msi_msg msg; struct irq_desc *desc = irq_to_desc(irq); - ret = msi_compose_msg(NULL, irq, &msg); + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq, &msg); desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, - "edge"); + if (irq_remapped(irq)) + set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + handle_edge_irq, "edge"); + else + set_irq_chip_and_handler_name(irq, &hpet_msi_type, + handle_edge_irq, "edge"); return 0; } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba575f0f2e3..7f024ff47d1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,6 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -467,7 +468,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq)) { + if (arch_setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } From c95b4502ad7fe8f3b9954aec794b00ac0046ab3a Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 27 Aug 2009 17:04:42 -0700 Subject: [PATCH 0006/2255] ntp: Provide compability defines (You say MOD_NANO, I say ADJ_NANO) MOD_NANO, ADJ_NANO, MOD_NANO, ADJ_NANO! Lets call the whole thing off! But oh! If we call the whole thing off, Then we must part. And oh! If we ever part, Then that might break my heart^H^H^H^Hclock! So, if you like MOD_NANO and I like ADJ_NANO, I'll include MOD_NANO and give up ADJ_NANO (not really!). For we know we need each other, So we better call the calling off off. Let's call the whole thing off! The tumultuous NTP and Linux relationship has hit another snag: Ends up NTPd still uses the "xntp 3.4 compatability names" and when the STA_NANO value was added (along with ADJ_NANO), NTPd expected MOD_NANO to be added and has apparently hit some build errors. Report to ntp hackers: https://lists.ntp.org/pipermail/hackers/2009-August/004455.html Related Bugs: https://support.ntp.org/bugs/show_bug.cgi?id=1219 https://bugzilla.redhat.com/show_bug.cgi?id=505566 So in an effort to make peace, here's a patch to help get things building again. I also have updated the comment to make sure folks don't think the MOD_* values are just legacy constants. Of course, NTPd really uses the glibc-headers, so those will need to be similarly updated before things are working again (the RH bug above should probably cover that). Thanks to Michael Tatarinov and Hal Murray for finding and reporting the issue! Signed-off-by: John Stultz Cc: Miroslav Lichvar Cc: hmurray@megapathdsl.net Cc: Ulrich Drepper Cc: Michael Tatarinov LKML-Reference: <1251417882.7905.42.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index e6967d10d9e..782ccd45c0d 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -115,13 +115,16 @@ struct timex { #define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ #endif -/* xntp 3.4 compatibility names */ +/* NTP userland likes the MOD_ prefix better */ #define MOD_OFFSET ADJ_OFFSET #define MOD_FREQUENCY ADJ_FREQUENCY #define MOD_MAXERROR ADJ_MAXERROR #define MOD_ESTERROR ADJ_ESTERROR #define MOD_STATUS ADJ_STATUS #define MOD_TIMECONST ADJ_TIMECONST +#define MOD_TAI ADJ_TAI +#define MOD_MICRO ADJ_MICRO +#define MOD_NANO ADJ_NANO /* From 654d364e26c797e8a5f9e2a1393607e6ca0106eb Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 9 Sep 2009 14:04:06 +0900 Subject: [PATCH 0007/2255] sh: sh4_flush_cache_mm() optimizations. The i-cache flush in the case of VM_EXEC was added way back when as a sanity measure, and in practice we only care about evicting aliases from the d-cache. As a result, it's possible to drop the i-cache flush completely here. After careful profiling it's also come up that all of the work associated with hunting down aliases and doing ranged flushing ends up generating more overhead than simply blasting away the entire dcache, particularly if there are many mm's that need to be iterated over. As a result of that, just move back to flush_dcache_all() in these cases, which restores the old behaviour, and vastly simplifies the path. Additionally, on platforms without aliases at all, this can simply be nopped out. Presently we have the alias check in the SH-4 specific version, but this is true for all of the platforms, so move the check up to a generic location. This cuts down quite a bit on superfluous cacheop IPIs. Signed-off-by: Paul Mundt --- arch/sh/mm/cache-sh4.c | 124 ++--------------------------------------- arch/sh/mm/cache.c | 6 ++ 2 files changed, 10 insertions(+), 120 deletions(-) diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index b2453bbef4c..a5c339bca8a 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -170,89 +170,13 @@ static void sh4_flush_cache_all(void *unused) flush_icache_all(); } -static void __flush_cache_mm(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - unsigned long d = 0, p = start & PAGE_MASK; - unsigned long alias_mask = boot_cpu_data.dcache.alias_mask; - unsigned long n_aliases = boot_cpu_data.dcache.n_aliases; - unsigned long select_bit; - unsigned long all_aliases_mask; - unsigned long addr_offset; - pgd_t *dir; - pmd_t *pmd; - pud_t *pud; - pte_t *pte; - int i; - - dir = pgd_offset(mm, p); - pud = pud_offset(dir, p); - pmd = pmd_offset(pud, p); - end = PAGE_ALIGN(end); - - all_aliases_mask = (1 << n_aliases) - 1; - - do { - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) { - p &= PMD_MASK; - p += PMD_SIZE; - pmd++; - - continue; - } - - pte = pte_offset_kernel(pmd, p); - - do { - unsigned long phys; - pte_t entry = *pte; - - if (!(pte_val(entry) & _PAGE_PRESENT)) { - pte++; - p += PAGE_SIZE; - continue; - } - - phys = pte_val(entry) & PTE_PHYS_MASK; - - if ((p ^ phys) & alias_mask) { - d |= 1 << ((p & alias_mask) >> PAGE_SHIFT); - d |= 1 << ((phys & alias_mask) >> PAGE_SHIFT); - - if (d == all_aliases_mask) - goto loop_exit; - } - - pte++; - p += PAGE_SIZE; - } while (p < end && ((unsigned long)pte & ~PAGE_MASK)); - pmd++; - } while (p < end); - -loop_exit: - addr_offset = 0; - select_bit = 1; - - for (i = 0; i < n_aliases; i++) { - if (d & select_bit) { - (*__flush_dcache_segment_fn)(addr_offset, PAGE_SIZE); - wmb(); - } - - select_bit <<= 1; - addr_offset += PAGE_SIZE; - } -} - /* * Note : (RPC) since the caches are physically tagged, the only point * of flush_cache_mm for SH-4 is to get rid of aliases from the * D-cache. The assumption elsewhere, e.g. flush_cache_range, is that * lines can stay resident so long as the virtual address they were * accessed with (hence cache set) is in accord with the physical - * address (i.e. tag). It's no different here. So I reckon we don't - * need to flush the I-cache, since aliases don't matter for that. We - * should try that. + * address (i.e. tag). It's no different here. * * Caller takes mm->mmap_sem. */ @@ -263,33 +187,7 @@ static void sh4_flush_cache_mm(void *arg) if (cpu_context(smp_processor_id(), mm) == NO_CONTEXT) return; - /* - * If cache is only 4k-per-way, there are never any 'aliases'. Since - * the cache is physically tagged, the data can just be left in there. - */ - if (boot_cpu_data.dcache.n_aliases == 0) - return; - - /* - * Don't bother groveling around the dcache for the VMA ranges - * if there are too many PTEs to make it worthwhile. - */ - if (mm->nr_ptes >= MAX_DCACHE_PAGES) - flush_dcache_all(); - else { - struct vm_area_struct *vma; - - /* - * In this case there are reasonably sized ranges to flush, - * iterate through the VMA list and take care of any aliases. - */ - for (vma = mm->mmap; vma; vma = vma->vm_next) - __flush_cache_mm(mm, vma->vm_start, vma->vm_end); - } - - /* Only touch the icache if one of the VMAs has VM_EXEC set. */ - if (mm->exec_vm) - flush_icache_all(); + flush_dcache_all(); } /* @@ -372,24 +270,10 @@ static void sh4_flush_cache_range(void *args) if (boot_cpu_data.dcache.n_aliases == 0) return; - /* - * Don't bother with the lookup and alias check if we have a - * wide range to cover, just blow away the dcache in its - * entirety instead. -- PFM. - */ - if (((end - start) >> PAGE_SHIFT) >= MAX_DCACHE_PAGES) - flush_dcache_all(); - else - __flush_cache_mm(vma->vm_mm, start, end); + flush_dcache_all(); - if (vma->vm_flags & VM_EXEC) { - /* - * TODO: Is this required??? Need to look at how I-cache - * coherency is assured when new programs are loaded to see if - * this matters. - */ + if (vma->vm_flags & VM_EXEC) flush_icache_all(); - } } /** diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 35c37b7f717..4aa92605453 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -164,11 +164,17 @@ void flush_cache_all(void) void flush_cache_mm(struct mm_struct *mm) { + if (boot_cpu_data.dcache.n_aliases == 0) + return; + cacheop_on_each_cpu(local_flush_cache_mm, mm, 1); } void flush_cache_dup_mm(struct mm_struct *mm) { + if (boot_cpu_data.dcache.n_aliases == 0) + return; + cacheop_on_each_cpu(local_flush_cache_dup_mm, mm, 1); } From 31c9efde786252112cc3d04a1ed3513b6ec63a7b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 9 Sep 2009 14:10:28 +0900 Subject: [PATCH 0008/2255] sh: Kill off broken PHYSADDR() usage in sh4_flush_dcache_page(). PHYSADDR() runs in to issues in 32-bit mode when we do not have the legacy P1/P2 areas mapped, as such, we need to use page_to_phys() directly, which also happens to do the right thing in legacy 29-bit mode. Signed-off-by: Paul Mundt --- arch/sh/mm/cache-sh4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index a5c339bca8a..f0999606686 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -123,12 +123,12 @@ static void sh4_flush_dcache_page(void *arg) else #endif { - unsigned long phys = PHYSADDR(page_address(page)); + unsigned long phys = page_to_phys(page); unsigned long addr = CACHE_OC_ADDRESS_ARRAY; int i, n; /* Loop all the D-cache */ - n = boot_cpu_data.dcache.n_aliases; + n = boot_cpu_data.dcache.way_incr >> 12; for (i = 0; i < n; i++, addr += 4096) flush_cache_4096(addr, phys); } From bd6df57481b329dfeeb4889068848ee4f4761561 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 9 Sep 2009 14:22:15 +0900 Subject: [PATCH 0009/2255] sh: Kill off segment-based d-cache flushing on SH-4. This kills off the unrolled segment based flushers on SH-4 and switches over to a generic unrolled approach derived from the writethrough segment flusher. Signed-off-by: Paul Mundt --- arch/sh/mm/cache-sh4.c | 291 +++-------------------------------------- 1 file changed, 20 insertions(+), 271 deletions(-) diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index f0999606686..92b7d947db9 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -29,14 +29,6 @@ static void __flush_cache_4096(unsigned long addr, unsigned long phys, unsigned long exec_offset); -/* - * This is initialised here to ensure that it is not placed in the BSS. If - * that were to happen, note that cache_init gets called before the BSS is - * cleared, so this would get nulled out which would be hopeless. - */ -static void (*__flush_dcache_segment_fn)(unsigned long, unsigned long) = - (void (*)(unsigned long, unsigned long))0xdeadbeef; - /* * Write back the range of D-cache, and purge the I-cache. * @@ -158,10 +150,27 @@ static void __uses_jump_to_uncached flush_icache_all(void) local_irq_restore(flags); } -static inline void flush_dcache_all(void) +static void flush_dcache_all(void) { - (*__flush_dcache_segment_fn)(0UL, boot_cpu_data.dcache.way_size); - wmb(); + unsigned long addr, end_addr, entry_offset; + + end_addr = CACHE_OC_ADDRESS_ARRAY + + (current_cpu_data.dcache.sets << + current_cpu_data.dcache.entry_shift) * + current_cpu_data.dcache.ways; + + entry_offset = 1 << current_cpu_data.dcache.entry_shift; + + for (addr = CACHE_OC_ADDRESS_ARRAY; addr < end_addr; ) { + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + __raw_writel(0, addr); addr += entry_offset; + } } static void sh4_flush_cache_all(void *unused) @@ -347,245 +356,6 @@ static void __flush_cache_4096(unsigned long addr, unsigned long phys, } while (--way_count != 0); } -/* - * Break the 1, 2 and 4 way variants of this out into separate functions to - * avoid nearly all the overhead of having the conditional stuff in the function - * bodies (+ the 1 and 2 way cases avoid saving any registers too). - * - * We want to eliminate unnecessary bus transactions, so this code uses - * a non-obvious technique. - * - * Loop over a cache way sized block of, one cache line at a time. For each - * line, use movca.a to cause the current cache line contents to be written - * back, but without reading anything from main memory. However this has the - * side effect that the cache is now caching that memory location. So follow - * this with a cache invalidate to mark the cache line invalid. And do all - * this with interrupts disabled, to avoid the cache line being accidently - * evicted while it is holding garbage. - * - * This also breaks in a number of circumstances: - * - if there are modifications to the region of memory just above - * empty_zero_page (for example because a breakpoint has been placed - * there), then these can be lost. - * - * This is because the the memory address which the cache temporarily - * caches in the above description is empty_zero_page. So the - * movca.l hits the cache (it is assumed that it misses, or at least - * isn't dirty), modifies the line and then invalidates it, losing the - * required change. - * - * - If caches are disabled or configured in write-through mode, then - * the movca.l writes garbage directly into memory. - */ -static void __flush_dcache_segment_writethrough(unsigned long start, - unsigned long extent_per_way) -{ - unsigned long addr; - int i; - - addr = CACHE_OC_ADDRESS_ARRAY | (start & cpu_data->dcache.entry_mask); - - while (extent_per_way) { - for (i = 0; i < cpu_data->dcache.ways; i++) - __raw_writel(0, addr + cpu_data->dcache.way_incr * i); - - addr += cpu_data->dcache.linesz; - extent_per_way -= cpu_data->dcache.linesz; - } -} - -static void __flush_dcache_segment_1way(unsigned long start, - unsigned long extent_per_way) -{ - unsigned long orig_sr, sr_with_bl; - unsigned long base_addr; - unsigned long way_incr, linesz, way_size; - struct cache_info *dcache; - register unsigned long a0, a0e; - - asm volatile("stc sr, %0" : "=r" (orig_sr)); - sr_with_bl = orig_sr | (1<<28); - base_addr = ((unsigned long)&empty_zero_page[0]); - - /* - * The previous code aligned base_addr to 16k, i.e. the way_size of all - * existing SH-4 D-caches. Whilst I don't see a need to have this - * aligned to any better than the cache line size (which it will be - * anyway by construction), let's align it to at least the way_size of - * any existing or conceivable SH-4 D-cache. -- RPC - */ - base_addr = ((base_addr >> 16) << 16); - base_addr |= start; - - dcache = &boot_cpu_data.dcache; - linesz = dcache->linesz; - way_incr = dcache->way_incr; - way_size = dcache->way_size; - - a0 = base_addr; - a0e = base_addr + extent_per_way; - do { - asm volatile("ldc %0, sr" : : "r" (sr_with_bl)); - asm volatile("movca.l r0, @%0\n\t" - "ocbi @%0" : : "r" (a0)); - a0 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "ocbi @%0" : : "r" (a0)); - a0 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "ocbi @%0" : : "r" (a0)); - a0 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "ocbi @%0" : : "r" (a0)); - asm volatile("ldc %0, sr" : : "r" (orig_sr)); - a0 += linesz; - } while (a0 < a0e); -} - -static void __flush_dcache_segment_2way(unsigned long start, - unsigned long extent_per_way) -{ - unsigned long orig_sr, sr_with_bl; - unsigned long base_addr; - unsigned long way_incr, linesz, way_size; - struct cache_info *dcache; - register unsigned long a0, a1, a0e; - - asm volatile("stc sr, %0" : "=r" (orig_sr)); - sr_with_bl = orig_sr | (1<<28); - base_addr = ((unsigned long)&empty_zero_page[0]); - - /* See comment under 1-way above */ - base_addr = ((base_addr >> 16) << 16); - base_addr |= start; - - dcache = &boot_cpu_data.dcache; - linesz = dcache->linesz; - way_incr = dcache->way_incr; - way_size = dcache->way_size; - - a0 = base_addr; - a1 = a0 + way_incr; - a0e = base_addr + extent_per_way; - do { - asm volatile("ldc %0, sr" : : "r" (sr_with_bl)); - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "ocbi @%0\n\t" - "ocbi @%1" : : - "r" (a0), "r" (a1)); - a0 += linesz; - a1 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "ocbi @%0\n\t" - "ocbi @%1" : : - "r" (a0), "r" (a1)); - a0 += linesz; - a1 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "ocbi @%0\n\t" - "ocbi @%1" : : - "r" (a0), "r" (a1)); - a0 += linesz; - a1 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "ocbi @%0\n\t" - "ocbi @%1" : : - "r" (a0), "r" (a1)); - asm volatile("ldc %0, sr" : : "r" (orig_sr)); - a0 += linesz; - a1 += linesz; - } while (a0 < a0e); -} - -static void __flush_dcache_segment_4way(unsigned long start, - unsigned long extent_per_way) -{ - unsigned long orig_sr, sr_with_bl; - unsigned long base_addr; - unsigned long way_incr, linesz, way_size; - struct cache_info *dcache; - register unsigned long a0, a1, a2, a3, a0e; - - asm volatile("stc sr, %0" : "=r" (orig_sr)); - sr_with_bl = orig_sr | (1<<28); - base_addr = ((unsigned long)&empty_zero_page[0]); - - /* See comment under 1-way above */ - base_addr = ((base_addr >> 16) << 16); - base_addr |= start; - - dcache = &boot_cpu_data.dcache; - linesz = dcache->linesz; - way_incr = dcache->way_incr; - way_size = dcache->way_size; - - a0 = base_addr; - a1 = a0 + way_incr; - a2 = a1 + way_incr; - a3 = a2 + way_incr; - a0e = base_addr + extent_per_way; - do { - asm volatile("ldc %0, sr" : : "r" (sr_with_bl)); - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "movca.l r0, @%2\n\t" - "movca.l r0, @%3\n\t" - "ocbi @%0\n\t" - "ocbi @%1\n\t" - "ocbi @%2\n\t" - "ocbi @%3\n\t" : : - "r" (a0), "r" (a1), "r" (a2), "r" (a3)); - a0 += linesz; - a1 += linesz; - a2 += linesz; - a3 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "movca.l r0, @%2\n\t" - "movca.l r0, @%3\n\t" - "ocbi @%0\n\t" - "ocbi @%1\n\t" - "ocbi @%2\n\t" - "ocbi @%3\n\t" : : - "r" (a0), "r" (a1), "r" (a2), "r" (a3)); - a0 += linesz; - a1 += linesz; - a2 += linesz; - a3 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "movca.l r0, @%2\n\t" - "movca.l r0, @%3\n\t" - "ocbi @%0\n\t" - "ocbi @%1\n\t" - "ocbi @%2\n\t" - "ocbi @%3\n\t" : : - "r" (a0), "r" (a1), "r" (a2), "r" (a3)); - a0 += linesz; - a1 += linesz; - a2 += linesz; - a3 += linesz; - asm volatile("movca.l r0, @%0\n\t" - "movca.l r0, @%1\n\t" - "movca.l r0, @%2\n\t" - "movca.l r0, @%3\n\t" - "ocbi @%0\n\t" - "ocbi @%1\n\t" - "ocbi @%2\n\t" - "ocbi @%3\n\t" : : - "r" (a0), "r" (a1), "r" (a2), "r" (a3)); - asm volatile("ldc %0, sr" : : "r" (orig_sr)); - a0 += linesz; - a1 += linesz; - a2 += linesz; - a3 += linesz; - } while (a0 < a0e); -} - extern void __weak sh4__flush_region_init(void); /* @@ -593,32 +363,11 @@ extern void __weak sh4__flush_region_init(void); */ void __init sh4_cache_init(void) { - unsigned int wt_enabled = !!(__raw_readl(CCR) & CCR_CACHE_WT); - printk("PVR=%08x CVR=%08x PRR=%08x\n", ctrl_inl(CCN_PVR), ctrl_inl(CCN_CVR), ctrl_inl(CCN_PRR)); - if (wt_enabled) - __flush_dcache_segment_fn = __flush_dcache_segment_writethrough; - else { - switch (boot_cpu_data.dcache.ways) { - case 1: - __flush_dcache_segment_fn = __flush_dcache_segment_1way; - break; - case 2: - __flush_dcache_segment_fn = __flush_dcache_segment_2way; - break; - case 4: - __flush_dcache_segment_fn = __flush_dcache_segment_4way; - break; - default: - panic("unknown number of cache ways\n"); - break; - } - } - local_flush_icache_range = sh4_flush_icache_range; local_flush_dcache_page = sh4_flush_dcache_page; local_flush_cache_all = sh4_flush_cache_all; From deaef20e9789d93c06d2d3b5ffc99939814802ca Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 9 Sep 2009 16:06:39 +0900 Subject: [PATCH 0010/2255] sh: Rework sh4_flush_cache_page() for coherent kmap mapping. This builds on top of the MIPS r4k code that does roughly the same thing. This permits the use of kmap_coherent() for mapped pages with dirty dcache lines and falls back on kmap_atomic() otherwise. This also fixes up a problem with the alias check and defers to shm_align_mask directly. Signed-off-by: Paul Mundt --- arch/sh/mm/cache-sh4.c | 75 +++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 92b7d947db9..e3fbd99b323 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -2,7 +2,7 @@ * arch/sh/mm/cache-sh4.c * * Copyright (C) 1999, 2000, 2002 Niibe Yutaka - * Copyright (C) 2001 - 2007 Paul Mundt + * Copyright (C) 2001 - 2009 Paul Mundt * Copyright (C) 2003 Richard Curnow * Copyright (c) 2007 STMicroelectronics (R&D) Ltd. * @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -23,7 +25,6 @@ * flushing. Anything exceeding this will simply flush the dcache in its * entirety. */ -#define MAX_DCACHE_PAGES 64 /* XXX: Tune for ways */ #define MAX_ICACHE_PAGES 32 static void __flush_cache_4096(unsigned long addr, unsigned long phys, @@ -209,44 +210,64 @@ static void sh4_flush_cache_page(void *args) { struct flusher_data *data = args; struct vm_area_struct *vma; + struct page *page; unsigned long address, pfn, phys; - unsigned int alias_mask; + int map_coherent = 0; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + void *vaddr; vma = data->vma; address = data->addr1; pfn = data->addr2; phys = pfn << PAGE_SHIFT; + page = pfn_to_page(pfn); if (cpu_context(smp_processor_id(), vma->vm_mm) == NO_CONTEXT) return; - alias_mask = boot_cpu_data.dcache.alias_mask; + address &= PAGE_MASK; + pgd = pgd_offset(vma->vm_mm, address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + pte = pte_offset_kernel(pmd, address); - /* We only need to flush D-cache when we have alias */ - if ((address^phys) & alias_mask) { - /* Loop 4K of the D-cache */ - flush_cache_4096( - CACHE_OC_ADDRESS_ARRAY | (address & alias_mask), - phys); - /* Loop another 4K of the D-cache */ - flush_cache_4096( - CACHE_OC_ADDRESS_ARRAY | (phys & alias_mask), - phys); + /* If the page isn't present, there is nothing to do here. */ + if (!(pte_val(*pte) & _PAGE_PRESENT)) + return; + + if ((vma->vm_mm == current->active_mm)) + vaddr = NULL; + else { + /* + * Use kmap_coherent or kmap_atomic to do flushes for + * another ASID than the current one. + */ + map_coherent = (current_cpu_data.dcache.n_aliases && + !test_bit(PG_dcache_dirty, &page->flags) && + page_mapped(page)); + if (map_coherent) + vaddr = kmap_coherent(page, address); + else + vaddr = kmap_atomic(page, KM_USER0); + + address = (unsigned long)vaddr; } - alias_mask = boot_cpu_data.icache.alias_mask; - if (vma->vm_flags & VM_EXEC) { - /* - * Evict entries from the portion of the cache from which code - * may have been executed at this address (virtual). There's - * no need to evict from the portion corresponding to the - * physical address as for the D-cache, because we know the - * kernel has never executed the code through its identity - * translation. - */ - flush_cache_4096( - CACHE_IC_ADDRESS_ARRAY | (address & alias_mask), - phys); + if (pages_do_alias(address, phys)) + flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | + (address & shm_align_mask), phys); + + if (vma->vm_flags & VM_EXEC) + flush_icache_all(); + + if (vaddr) { + if (map_coherent) + kunmap_coherent(vaddr); + else + kunmap_atomic(vaddr, KM_USER0); } } From c4845a4b2288a9e5d96a0558e474809028c8aff3 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 9 Sep 2009 17:13:07 +0900 Subject: [PATCH 0011/2255] sh: Fix up redundant cache flushing for PAGE_SIZE > 4k. If PAGE_SIZE is presently over 4k we do a lot of extra flushing given that we purge the cache 4k at a time. Make it explicitly 4k per iteration, rather than iterating for PAGE_SIZE before looping over again. Signed-off-by: Paul Mundt --- arch/sh/mm/cache-sh4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index e3fbd99b323..8362d312ad9 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -357,7 +357,7 @@ static void __flush_cache_4096(unsigned long addr, unsigned long phys, * pointless nead-of-loop check for 0 iterations. */ do { - ea = base_addr + PAGE_SIZE; + ea = base_addr + 4096; a = base_addr; p = phys; From f9e2bdfdbb4c9da13422b349227be8c7b41dbd44 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 9 Sep 2009 17:14:19 +0900 Subject: [PATCH 0012/2255] sh: Factor in cpu id for selection of cache colour fixmap. In the SMP VIPT case the page copy/clear ops still perform colouring, care needs to be taken that CPUs don't end up stepping on each other, so we give them a bit of room to work with. At the same time, we reduce the worst-case colouring given that these pages are always consumed. Signed-off-by: Paul Mundt --- arch/sh/include/asm/fixmap.h | 6 +++--- arch/sh/mm/kmap.c | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h index 721fcc4d5e9..76c5a3099cb 100644 --- a/arch/sh/include/asm/fixmap.h +++ b/arch/sh/include/asm/fixmap.h @@ -14,9 +14,9 @@ #define _ASM_FIXMAP_H #include +#include #include #ifdef CONFIG_HIGHMEM -#include #include #endif @@ -46,9 +46,9 @@ * fix-mapped? */ enum fixed_addresses { -#define FIX_N_COLOURS 16 +#define FIX_N_COLOURS 8 FIX_CMAP_BEGIN, - FIX_CMAP_END = FIX_CMAP_BEGIN + FIX_N_COLOURS, + FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS), FIX_UNCACHED, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index 16e01b5fed0..15d74ea4209 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -39,7 +39,9 @@ void *kmap_coherent(struct page *page, unsigned long addr) pagefault_disable(); idx = FIX_CMAP_END - - ((addr & current_cpu_data.dcache.alias_mask) >> PAGE_SHIFT); + (((addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1)) + + (FIX_N_COLOURS * smp_processor_id())); + vaddr = __fix_to_virt(idx); BUG_ON(!pte_none(*(kmap_coherent_pte - idx))); From 8ebc423238341b52912c7295b045a32477b33f09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Apr 2009 04:19:49 +0200 Subject: [PATCH 0013/2255] reiserfs: kill-the-BKL This patch is an attempt to remove the Bkl based locking scheme from reiserfs and is intended. It is a bit inspired from an old attempt by Peter Zijlstra: http://lkml.indiana.edu/hypermail/linux/kernel/0704.2/2174.html The bkl is heavily used in this filesystem to prevent from concurrent write accesses on the filesystem. Reiserfs makes a deep use of the specific properties of the Bkl: - It can be acqquired recursively by a same task - It is released on the schedule() calls and reacquired when schedule() returns The two properties above are a roadmap for the reiserfs write locking so it's very hard to simply replace it with a common mutex. - We need a recursive-able locking unless we want to restructure several blocks of the code. - We need to identify the sites where the bkl was implictly relaxed (schedule, wait, sync, etc...) so that we can in turn release and reacquire our new lock explicitly. Such implicit releases of the lock are often required to let other resources producer/consumer do their job or we can suffer unexpected starvations or deadlocks. So the new lock that replaces the bkl here is a per superblock mutex with a specific property: it can be acquired recursively by a same task, like the bkl. For such purpose, we integrate a lock owner and a lock depth field on the superblock information structure. The first axis on this patch is to turn reiserfs_write_(un)lock() function into a wrapper to manage this mutex. Also some explicit calls to lock_kernel() have been converted to reiserfs_write_lock() helpers. The second axis is to find the important blocking sites (schedule...(), wait_on_buffer(), sync_dirty_buffer(), etc...) and then apply an explicit release of the write lock on these locations before blocking. Then we can safely wait for those who can give us resources or those who need some. Typically this is a fight between the current writer, the reiserfs workqueue (aka the async commiter) and the pdflush threads. The third axis is a consequence of the second. The write lock is usually on top of a lock dependency chain which can include the journal lock, the flush lock or the commit lock. So it's dangerous to release and trying to reacquire the write lock while we still hold other locks. This is fine with the bkl: T1 T2 lock_kernel() mutex_lock(A) unlock_kernel() // do something lock_kernel() mutex_lock(A) -> already locked by T1 schedule() (and then unlock_kernel()) lock_kernel() mutex_unlock(A) .... This is not fine with a mutex: T1 T2 mutex_lock(write) mutex_lock(A) mutex_unlock(write) // do something mutex_lock(write) mutex_lock(A) -> already locked by T1 schedule() mutex_lock(write) -> already locked by T2 deadlock The solution in this patch is to provide a helper which releases the write lock and sleep a bit if we can't lock a mutex that depend on it. It's another simulation of the bkl behaviour. The last axis is to locate the fs callbacks that are called with the bkl held, according to Documentation/filesystem/Locking. Those are: - reiserfs_remount - reiserfs_fill_super - reiserfs_put_super Reiserfs didn't need to explicitly lock because of the context of these callbacks. But now we must take care of that with the new locking. After this patch, reiserfs suffers from a slight performance regression (for now). On UP, a high volume write with dd reports an average of 27 MB/s instead of 30 MB/s without the patch applied. Signed-off-by: Frederic Weisbecker Reviewed-by: Ingo Molnar Cc: Jeff Mahoney Cc: Peter Zijlstra Cc: Bron Gondwana Cc: Andrew Morton Cc: Linus Torvalds Cc: Alexander Viro LKML-Reference: <1239070789-13354-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/reiserfs/Makefile | 2 +- fs/reiserfs/bitmap.c | 2 + fs/reiserfs/dir.c | 8 ++ fs/reiserfs/fix_node.c | 10 +++ fs/reiserfs/inode.c | 23 ++++-- fs/reiserfs/ioctl.c | 6 +- fs/reiserfs/journal.c | 134 ++++++++++++++++++++++++++------- fs/reiserfs/lock.c | 63 ++++++++++++++++ fs/reiserfs/resize.c | 2 + fs/reiserfs/stree.c | 2 + fs/reiserfs/super.c | 37 +++++++-- include/linux/reiserfs_fs.h | 12 +-- include/linux/reiserfs_fs_sb.h | 9 +++ 13 files changed, 261 insertions(+), 49 deletions(-) create mode 100644 fs/reiserfs/lock.c diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index 7c5ab6330dd..6a9e30c041d 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ hashes.o tail_conversion.o journal.o resize.o \ - item_ops.o ioctl.o procfs.o xattr.o + item_ops.o ioctl.o procfs.o xattr.o lock.o ifeq ($(CONFIG_REISERFS_FS_XATTR),y) reiserfs-objs += xattr_user.o xattr_trusted.o diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index e716161ab32..147033461b8 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1256,7 +1256,9 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, else { if (buffer_locked(bh)) { PROC_INFO_INC(sb, scan_bitmap.wait); + reiserfs_write_unlock(sb); __wait_on_buffer(bh); + reiserfs_write_lock(sb); } BUG_ON(!buffer_uptodate(bh)); BUG_ON(atomic_read(&bh->b_count) == 0); diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 6d2668fdc38..17f31ad379c 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, // user space buffer is swapped out. At that time // entry can move to somewhere else memcpy(local_buf, d_name, d_reclen); + + /* + * Since filldir might sleep, we can release + * the write lock here for other waiters + */ + reiserfs_write_unlock(inode->i_sb); if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { + reiserfs_write_lock(inode->i_sb); if (local_buf != small_buf) { kfree(local_buf); } goto end; } + reiserfs_write_lock(inode->i_sb); if (local_buf != small_buf) { kfree(local_buf); } diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 5e5a4e6fbaf..bf5f2cbdb06 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -1022,7 +1022,11 @@ static int get_far_parent(struct tree_balance *tb, /* Check whether the common parent is locked. */ if (buffer_locked(*pcom_father)) { + + /* Release the write lock while the buffer is busy */ + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(*pcom_father); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) { brelse(*pcom_father); return REPEAT_SEARCH; @@ -1927,7 +1931,9 @@ static int get_direct_parent(struct tree_balance *tb, int h) return REPEAT_SEARCH; if (buffer_locked(bh)) { + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(bh); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -2278,7 +2284,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) REPEAT_SEARCH : CARRY_ON; } #endif + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(locked); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -2349,7 +2357,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb, /* if it possible in indirect_to_direct conversion */ if (buffer_locked(tbS0)) { + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(tbS0); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a14d6cd9eed..1893c819843 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -489,10 +489,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, disappeared */ if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { int err; - lock_kernel(); + + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode); REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - unlock_kernel(); + + reiserfs_write_unlock(inode->i_sb); + if (err < 0) ret = err; } @@ -616,7 +620,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block, loff_t new_offset = (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; - /* bad.... */ reiserfs_write_lock(inode->i_sb); version = get_inode_item_key_version(inode); @@ -997,10 +1000,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (retval) goto failure; } - /* inserting indirect pointers for a hole can take a - ** long time. reschedule if needed + /* + * inserting indirect pointers for a hole can take a + * long time. reschedule if needed and also release the write + * lock for others. */ + reiserfs_write_unlock(inode->i_sb); cond_resched(); + reiserfs_write_lock(inode->i_sb); retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page, int ret; int old_ref = 0; + reiserfs_write_unlock(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + fix_tail_page_for_writing(page); if (reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th; @@ -2758,7 +2768,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, int update_sd = 0; struct reiserfs_transaction_handle *th = NULL; + reiserfs_write_unlock(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) { th = current->journal_info; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 0ccc3fdda7b..5e40b0cd4c3 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -141,9 +141,11 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, default: return -ENOIOCTLCMD; } - lock_kernel(); + + reiserfs_write_lock(inode->i_sb); ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); + reiserfs_write_unlock(inode->i_sb); + return ret; } #endif diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 90622200b39..438c71f0bc9 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh) clear_buffer_journal_restore_dirty(bh); } -/* utility function to force a BUG if it is called without the big -** kernel lock held. caller is the string printed just before calling BUG() -*/ -void reiserfs_check_lock_depth(struct super_block *sb, char *caller) -{ -#ifdef CONFIG_SMP - if (current->lock_depth < 0) { - reiserfs_panic(sb, "journal-1", "%s called without kernel " - "lock held", caller); - } -#else - ; -#endif -} - /* return a cnode with same dev, block number and size in table, or null if not found */ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct super_block @@ -552,11 +537,48 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, journal_hash(table, cn->sb, cn->blocknr) = cn; } +/* + * Several mutexes depend on the write lock. + * However sometimes we want to relax the write lock while we hold + * these mutexes, according to the release/reacquire on schedule() + * properties of the Bkl that were used. + * Reiserfs performances and locking were based on this scheme. + * Now that the write lock is a mutex and not the bkl anymore, doing so + * may result in a deadlock: + * + * A acquire write_lock + * A acquire j_commit_mutex + * A release write_lock and wait for something + * B acquire write_lock + * B can't acquire j_commit_mutex and sleep + * A can't acquire write lock anymore + * deadlock + * + * What we do here is avoiding such deadlock by playing the same game + * than the Bkl: if we can't acquire a mutex that depends on the write lock, + * we release the write lock, wait a bit and then retry. + * + * The mutexes concerned by this hack are: + * - The commit mutex of a journal list + * - The flush mutex + * - The journal lock + */ +static inline void reiserfs_mutex_lock_safe(struct mutex *m, + struct super_block *s) +{ + while (!mutex_trylock(m)) { + reiserfs_write_unlock(s); + schedule(); + reiserfs_write_lock(s); + } +} + /* lock the current transaction */ static inline void lock_journal(struct super_block *sb) { PROC_INFO_INC(sb, journal.lock_journal); - mutex_lock(&SB_JOURNAL(sb)->j_mutex); + + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); } /* unlock the current transaction */ @@ -708,7 +730,9 @@ static void check_barrier_completion(struct super_block *s, disable_barrier(s); set_buffer_uptodate(bh); set_buffer_dirty(bh); + reiserfs_write_unlock(s); sync_dirty_buffer(bh); + reiserfs_write_lock(s); } } @@ -996,8 +1020,13 @@ static int reiserfs_async_progress_wait(struct super_block *s) { DEFINE_WAIT(wait); struct reiserfs_journal *j = SB_JOURNAL(s); - if (atomic_read(&j->j_async_throttle)) + + if (atomic_read(&j->j_async_throttle)) { + reiserfs_write_unlock(s); congestion_wait(BLK_RW_ASYNC, HZ / 10); + reiserfs_write_lock(s); + } + return 0; } @@ -1043,7 +1072,8 @@ static int flush_commit_list(struct super_block *s, } /* make sure nobody is trying to flush this one at the same time */ - mutex_lock(&jl->j_commit_mutex); + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); + if (!journal_list_still_alive(s, trans_id)) { mutex_unlock(&jl->j_commit_mutex); goto put_jl; @@ -1061,12 +1091,17 @@ static int flush_commit_list(struct super_block *s, if (!list_empty(&jl->j_bh_list)) { int ret; - unlock_kernel(); + + /* + * We might sleep in numerous places inside + * write_ordered_buffers. Relax the write lock. + */ + reiserfs_write_unlock(s); ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_bh_list); if (ret < 0 && retval == 0) retval = ret; - lock_kernel(); + reiserfs_write_lock(s); } BUG_ON(!list_empty(&jl->j_bh_list)); /* @@ -1114,12 +1149,19 @@ static int flush_commit_list(struct super_block *s, bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); + + reiserfs_write_unlock(s); wait_on_buffer(tbh); + reiserfs_write_lock(s); // since we're using ll_rw_blk above, it might have skipped over // a locked buffer. Double check here // - if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ + /* redundant, sync_dirty_buffer() checks */ + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); sync_dirty_buffer(tbh); + reiserfs_write_lock(s); + } if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-601", @@ -1143,10 +1185,15 @@ static int flush_commit_list(struct super_block *s, if (buffer_dirty(jl->j_commit_bh)) BUG(); mark_buffer_dirty(jl->j_commit_bh) ; + reiserfs_write_unlock(s); sync_dirty_buffer(jl->j_commit_bh) ; + reiserfs_write_lock(s); } - } else + } else { + reiserfs_write_unlock(s); wait_on_buffer(jl->j_commit_bh); + reiserfs_write_lock(s); + } check_barrier_completion(s, jl->j_commit_bh); @@ -1286,7 +1333,9 @@ static int _update_journal_header_block(struct super_block *sb, if (trans_id >= journal->j_last_flush_trans_id) { if (buffer_locked((journal->j_header_bh))) { + reiserfs_write_unlock(sb); wait_on_buffer((journal->j_header_bh)); + reiserfs_write_lock(sb); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(sb, "journal-699", @@ -1312,12 +1361,16 @@ static int _update_journal_header_block(struct super_block *sb, disable_barrier(sb); goto sync; } + reiserfs_write_unlock(sb); wait_on_buffer(journal->j_header_bh); + reiserfs_write_lock(sb); check_barrier_completion(sb, journal->j_header_bh); } else { sync: set_buffer_dirty(journal->j_header_bh); + reiserfs_write_unlock(sb); sync_dirty_buffer(journal->j_header_bh); + reiserfs_write_lock(sb); } if (!buffer_uptodate(journal->j_header_bh)) { reiserfs_warning(sb, "journal-837", @@ -1409,7 +1462,7 @@ static int flush_journal_list(struct super_block *s, /* if flushall == 0, the lock is already held */ if (flushall) { - mutex_lock(&journal->j_flush_mutex); + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); } else if (mutex_trylock(&journal->j_flush_mutex)) { BUG(); } @@ -1553,7 +1606,11 @@ static int flush_journal_list(struct super_block *s, reiserfs_panic(s, "journal-1011", "cn->bh is NULL"); } + + reiserfs_write_unlock(s); wait_on_buffer(cn->bh); + reiserfs_write_lock(s); + if (!cn->bh) { reiserfs_panic(s, "journal-1012", "cn->bh is NULL"); @@ -1973,11 +2030,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, reiserfs_mounted_fs_count--; /* wait for all commits to finish */ cancel_delayed_work(&SB_JOURNAL(sb)->j_work); + + /* + * We must release the write lock here because + * the workqueue job (flush_async_commit) needs this lock + */ + reiserfs_write_unlock(sb); flush_workqueue(commit_wq); + if (!reiserfs_mounted_fs_count) { destroy_workqueue(commit_wq); commit_wq = NULL; } + reiserfs_write_lock(sb); free_journal_ram(sb); @@ -2243,7 +2308,11 @@ static int journal_read_transaction(struct super_block *sb, /* read in the log blocks, memcpy to the corresponding real block */ ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { + + reiserfs_write_unlock(sb); wait_on_buffer(log_blocks[i]); + reiserfs_write_lock(sb); + if (!buffer_uptodate(log_blocks[i])) { reiserfs_warning(sb, "journal-1212", "REPLAY FAILURE fsck required! " @@ -2964,8 +3033,11 @@ static void queue_log_writer(struct super_block *s) init_waitqueue_entry(&wait, current); add_wait_queue(&journal->j_join_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { + reiserfs_write_unlock(s); schedule(); + reiserfs_write_lock(s); + } __set_current_state(TASK_RUNNING); remove_wait_queue(&journal->j_join_wait, &wait); } @@ -2982,7 +3054,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned long bcount = journal->j_bcount; while (1) { + reiserfs_write_unlock(sb); schedule_timeout_uninterruptible(1); + reiserfs_write_lock(sb); journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; while ((atomic_read(&journal->j_wcount) > 0 || atomic_read(&journal->j_jlock)) && @@ -3033,7 +3107,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(sb); + reiserfs_write_unlock(sb); reiserfs_wait_on_write_block(sb); + reiserfs_write_lock(sb); PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } @@ -3506,14 +3582,14 @@ static void flush_async_commits(struct work_struct *work) struct reiserfs_journal_list *jl; struct list_head *entry; - lock_kernel(); + reiserfs_write_lock(sb); if (!list_empty(&journal->j_journal_list)) { /* last entry is the youngest, commit it and you get everything */ entry = journal->j_journal_list.prev; jl = JOURNAL_LIST_ENTRY(entry); flush_commit_list(sb, jl, 1); } - unlock_kernel(); + reiserfs_write_unlock(sb); } /* @@ -4041,7 +4117,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * the new transaction is fully setup, and we've already flushed the * ordered bh list */ - mutex_lock(&jl->j_commit_mutex); + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); /* save the transaction id in case we need to commit it later */ commit_trans_id = jl->j_trans_id; @@ -4203,10 +4279,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * is lost. */ if (!list_empty(&jl->j_tail_bh_list)) { - unlock_kernel(); + reiserfs_write_unlock(sb); write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_tail_bh_list); - lock_kernel(); + reiserfs_write_lock(sb); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c new file mode 100644 index 00000000000..cdd8d9ef048 --- /dev/null +++ b/fs/reiserfs/lock.c @@ -0,0 +1,63 @@ +#include +#include + +/* + * The previous reiserfs locking scheme was heavily based on + * the tricky properties of the Bkl: + * + * - it was acquired recursively by a same task + * - the performances relied on the release-while-schedule() property + * + * Now that we replace it by a mutex, we still want to keep the same + * recursive property to avoid big changes in the code structure. + * We use our own lock_owner here because the owner field on a mutex + * is only available in SMP or mutex debugging, also we only need this field + * for this mutex, no need for a system wide mutex facility. + * + * Also this lock is often released before a call that could block because + * reiserfs performances were partialy based on the release while schedule() + * property of the Bkl. + */ +void reiserfs_write_lock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + } + + /* No need to protect it, only the current task touches it */ + sb_i->lock_depth++; +} + +void reiserfs_write_unlock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* + * Are we unlocking without even holding the lock? + * Such a situation could even raise a BUG() if we don't + * want the data become corrupted + */ + WARN_ONCE(sb_i->lock_owner != current, + "Superblock write lock imbalance"); + + if (--sb_i->lock_depth == -1) { + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); + } +} + +/* + * Utility function to force a BUG if it is called without the superblock + * write lock held. caller is the string printed just before calling BUG() + */ +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + if (sb_i->lock_depth < 0) + reiserfs_panic(sb, "%s called without kernel lock held %d", + caller); +} diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 18b315d3d10..b3a94d20f0f 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) set_buffer_uptodate(bh); mark_buffer_dirty(bh); + reiserfs_write_unlock(s); sync_dirty_buffer(bh); + reiserfs_write_lock(s); // update bitmap_info stuff bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index d036ee5b1c8..6bd99a99a65 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -629,7 +629,9 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s search_by_key_reada(sb, reada_bh, reada_blocks, reada_count); ll_rw_block(READ, 1, &bh); + reiserfs_write_unlock(sb); wait_on_buffer(bh); + reiserfs_write_lock(sb); if (!buffer_uptodate(bh)) goto io_error; } else { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7adea74d6a8..e1cfb80d0bf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; - lock_kernel(); + reiserfs_write_lock(s); if (s->s_dirt) reiserfs_write_super(s); @@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_proc_info_done(s); + reiserfs_write_unlock(s); + mutex_destroy(&REISERFS_SB(s)->lock); kfree(s->s_fs_info); s->s_fs_info = NULL; - - unlock_kernel(); } static struct kmem_cache *reiserfs_inode_cachep; @@ -1168,11 +1168,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; +#endif + reiserfs_write_lock(s); + +#ifdef CONFIG_QUOTA memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif - lock_kernel(); rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options @@ -1295,12 +1298,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) out_ok: replace_mount_options(s, new_opts); - unlock_kernel(); + reiserfs_write_unlock(s); return 0; out_err: kfree(new_opts); - unlock_kernel(); + reiserfs_write_unlock(s); return err; } @@ -1404,7 +1407,9 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); + reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1613,7 +1618,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); if (!sbi) { errval = -ENOMEM; - goto error; + goto error_alloc; } s->s_fs_info = sbi; /* Set default values for options: non-aggressive tails, RO on errors */ @@ -1627,6 +1632,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) /* setup default block allocator options */ reiserfs_init_alloc_options(s); + mutex_init(&REISERFS_SB(s)->lock); + REISERFS_SB(s)->lock_depth = -1; + + /* + * This function is called with the bkl, which also was the old + * locking used here. + * do_journal_begin() will soon check if we hold the lock (ie: was the + * bkl). This is likely because do_journal_begin() has several another + * callers because at this time, it doesn't seem to be necessary to + * protect against anything. + * Anyway, let's be conservative and lock for now. + */ + reiserfs_write_lock(s); + jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, @@ -1852,9 +1871,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) init_waitqueue_head(&(sbi->s_wait)); spin_lock_init(&sbi->bitmap_lock); + reiserfs_write_unlock(s); + return (0); error: + reiserfs_write_unlock(s); +error_alloc: if (jinit_done) { /* kill the commit thread, free journal ram */ journal_release_error(NULL, s); } diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index dd31e7bae35..e47328f5180 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -52,11 +52,13 @@ #define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION #define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION -/* Locking primitives */ -/* Right now we are still falling back to (un)lock_kernel, but eventually that - would evolve into real per-fs locks */ -#define reiserfs_write_lock( sb ) lock_kernel() -#define reiserfs_write_unlock( sb ) unlock_kernel() +/* + * Locking primitives. The write lock is a per superblock + * special mutex that has properties close to the Big Kernel Lock + * which was used in the previous locking scheme. + */ +void reiserfs_write_lock(struct super_block *s); +void reiserfs_write_unlock(struct super_block *s); struct fid; diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index dab68bbed67..045c3721367 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -7,6 +7,8 @@ #ifdef __KERNEL__ #include #include +#include +#include #endif typedef enum { @@ -355,6 +357,13 @@ struct reiserfs_sb_info { struct reiserfs_journal *s_journal; /* pointer to journal information */ unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ + /* Serialize writers access, replace the old bkl */ + struct mutex lock; + /* Owner of the lock (can be recursive) */ + struct task_struct *lock_owner; + /* Depth of the lock, start from -1 like the bkl */ + int lock_depth; + /* Comment? -Hans */ void (*end_io_handler) (struct buffer_head *, int); hashf_t s_hash_function; /* pointer to function which is used From a412f9efdd6424bf4bf28c8e8c92060b5e975482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Apr 2009 00:10:35 +0200 Subject: [PATCH 0014/2255] reiserfs, kill-the-BKL: fix unsafe j_flush_mutex lock Impact: fix a deadlock The j_flush_mutex is acquired safely in journal.c: if we can't take it, we free the reiserfs per superblock lock and wait a bit. But we have a remaining place in kupdate_transactions() where j_flush_mutex is still acquired traditionnaly. Thus the following scenario (warned by lockdep) can happen: A B mutex_lock(&write_lock) mutex_lock(&write_lock) mutex_lock(&j_flush_mutex) mutex_lock(&j_flush_mutex) //block mutex_unlock(&write_lock) sleep... mutex_lock(&write_lock) //deadlock Fix this by using reiserfs_mutex_lock_safe() in kupdate_transactions(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Jeff Mahoney LKML-Reference: <1239660635-12940-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/reiserfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 438c71f0bc9..0ba98ca367c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1826,7 +1826,7 @@ static int kupdate_transactions(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); chunk.nr = 0; - mutex_lock(&journal->j_flush_mutex); + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); if (!journal_list_still_alive(s, orig_trans_id)) { goto done; } From daf88c898312a22b5385655bc6e0b064eaa2efba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Apr 2009 05:34:23 +0200 Subject: [PATCH 0015/2255] kill-the-BKL/reiserfs: provide a tool to lock only once the write lock Sometimes we don't want to recursively hold the per superblock write lock because we want to be sure it is actually released when we come to sleep. This patch introduces the necessary tools for that. reiserfs_write_lock_once() does the same job than reiserfs_write_lock() except that it won't try to acquire recursively the lock if the current task already owns it. Also the lock_depth before the call of this function is returned. reiserfs_write_unlock_once() unlock only if reiserfs_write_lock_once() returned a depth equal to -1, ie: only if it actually locked. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Jeff Mahoney Cc: Alexander Beregalov Cc: Chris Mason LKML-Reference: <1239680065-25013-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/reiserfs/lock.c | 26 ++++++++++++++++++++++++++ include/linux/reiserfs_fs.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index cdd8d9ef048..cb1bba3802d 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -49,6 +49,32 @@ void reiserfs_write_unlock(struct super_block *s) } } +/* + * If we already own the lock, just exit and don't increase the depth. + * Useful when we don't want to lock more than once. + * + * We always return the lock_depth we had before calling + * this function. + */ +int reiserfs_write_lock_once(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + return sb_i->lock_depth++; + } + + return sb_i->lock_depth; +} + +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) +{ + if (lock_depth == -1) + reiserfs_write_unlock(s); +} + /* * Utility function to force a BUG if it is called without the superblock * write lock held. caller is the string printed just before calling BUG() diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index e47328f5180..4a2df57c8b1 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -59,6 +59,8 @@ */ void reiserfs_write_lock(struct super_block *s); void reiserfs_write_unlock(struct super_block *s); +int reiserfs_write_lock_once(struct super_block *s); +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); struct fid; From 22c963addcf426bef97a43f6e601f985f8082ed5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Apr 2009 05:34:24 +0200 Subject: [PATCH 0016/2255] kill-the-BKL/reiserfs: lock only once in reiserfs_truncate_file Impact: fix a deadlock reiserfs_truncate_file() can be called from multiple context where the write lock can be already hold or not. This function also acquire (possibly recursively) the write lock. Subsequent releases before sleeping will not actually release the lock because we may be in more than one lock depth degree. A typical case is: reiserfs_file_release { acquire_the_lock() reiserfs_truncate_file() reacquire_the_lock() journal_begin() { do_journal_begin_r() { reiserfs_wait_on_write_block() { /* * Not released because still one * depth owned */ release_lock() wait_for_event() At this stage the event never happen because the one which provides it needs the write lock. We use reiserfs_write_lock_once() here to ensure that we don't acquire the write lock recursively. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Jeff Mahoney Cc: Alexander Beregalov Cc: Chris Mason LKML-Reference: <1239680065-25013-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/reiserfs/inode.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 1893c819843..cc70b56bf6f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2079,8 +2079,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) int error; struct buffer_head *bh = NULL; int err2; + int lock_depth; - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); if (inode->i_size > 0) { error = grab_tail_page(inode, &page, &bh); @@ -2149,14 +2150,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return 0; out: if (page) { unlock_page(page); page_cache_release(page); } - reiserfs_write_unlock(inode->i_sb); + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return error; } From dc8f6d8936eb244eea452af689df5ee19e635206 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Apr 2009 05:34:25 +0200 Subject: [PATCH 0017/2255] kill-the-BKL/reiserfs: only acquire the write lock once in reiserfs_dirty_inode Impact: fix a deadlock reiserfs_dirty_inode() is the super_operations::dirty_inode() callback of reiserfs. It can be called from different contexts where the write lock can be already held. But this function also grab the write lock (possibly recursively). Subsequent release of the lock before sleep will actually not release the lock if the caller of mark_inode_dirty() (which in turn calls reiserfs_dirty_inode()) already owns the lock. A typical case: reiserfs_write_end() { acquire_write_lock() mark_inode_dirty() { reiserfs_dirty_inode() { reacquire_write_lock() { journal_begin() { do_journal_begin_r() { /* * fail to release, still * one depth of lock */ release_write_lock() reiserfs_wait_on_write_block() { wait_event() The event is usually provided by something which needs the write lock but it hasn't been released. We use reiserfs_write_lock_once() here to ensure we only grab the write lock in one level. Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Jeff Mahoney Cc: Chris Mason LKML-Reference: <1239680065-25013-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/reiserfs/super.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e1cfb80d0bf..58727b5b435 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode) struct reiserfs_transaction_handle th; int err = 0; + int lock_depth; + if (inode->i_sb->s_flags & MS_RDONLY) { reiserfs_warning(inode->i_sb, "clm-6006", "writing inode %lu on readonly FS", inode->i_ino); return; } - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); /* this is really only used for atime updates, so they don't have ** to be included in O_SYNC or fsync */ err = journal_begin(&th, inode->i_sb, 1); - if (err) { - reiserfs_write_unlock(inode->i_sb); - return; - } + if (err) + goto out; + reiserfs_update_sd(&th, inode); journal_end(&th, inode->i_sb, 1); - reiserfs_write_unlock(inode->i_sb); + +out: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); } #ifdef CONFIG_QUOTA From f32049dc244f4d394c8faa161b4f13cb8c4f5c8c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Apr 2009 22:05:25 +0200 Subject: [PATCH 0018/2255] kill-the-BKL/reiserfs: release write lock on fs_changed() fs_changed() is a macro used by reiserfs to check whether its tree has been rebalanced. It has been designed to check parallel changes on the tree after calling a sleeping function, which released the Bkl. fs_changed() also calls cond_resched(), so that if rescheduling is needed, we are in the best place to do that, since we check if the tree has changed just after (because of the bkl release on schedule()). Even if we are not anymore using the Bkl, we still want to release the lock while we reschedule, so that other waiters for the lock can acquire it safely, because of the following __fs_changed() check. [ Impact: release the reiserfs write lock when it is not needed ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- include/linux/reiserfs_fs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 4a2df57c8b1..fa5dbf307c4 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1333,7 +1333,13 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) #define get_generation(s) atomic_read (&fs_generation(s)) #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) #define __fs_changed(gen,s) (gen != get_generation (s)) -#define fs_changed(gen,s) ({cond_resched(); __fs_changed(gen, s);}) +#define fs_changed(gen,s) \ +({ \ + reiserfs_write_unlock(s); \ + cond_resched(); \ + reiserfs_write_lock(s); \ + __fs_changed(gen, s); \ +}) /***************************************************************************/ /* FIXATE NODES */ From e6950a4da33fa84ef4402d29e2331cd5e5df9cb8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Apr 2009 23:04:32 +0200 Subject: [PATCH 0019/2255] kill-the-BKL/reiserfs: release the write lock before rescheduling on do_journal_end() When do_journal_end() copies data to the journal blocks buffers in memory, it reschedules if needed between each block copied and dirtyfied. We can also release the write lock at this rescheduling stage, like did the bkl implicitly. [ Impact: release the reiserfs write lock when it is not needed ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/journal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 0ba98ca367c..5cd600baf8c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -4232,7 +4232,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, next = cn->next; free_cnode(sb, cn); cn = next; + reiserfs_write_unlock(sb); cond_resched(); + reiserfs_write_lock(sb); } /* we are done with both the c_bh and d_bh, but From 5e69e3a4492ea5abfd2e8ddc575448becf28e4d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Apr 2009 23:36:33 +0200 Subject: [PATCH 0020/2255] kill-the-BKL/reiserfs: release write lock while rescheduling on prepare_for_delete_or_cut() prepare_for_delete_or_cut() can process several types of items, including indirect items, ie: items which contain no file data but pointers to unformatted nodes scattering the datas of a file. In this case it has to zero out these pointers to block numbers of unformatted nodes and release the bitmap from these block numbers. It can take some time, so a rescheduling() is performed between each block processed. We can safely release the write lock while rescheduling(), like the bkl did, because the code checks just after if the item has moved after sleeping. [ Impact: release the reiserfs write lock when it is not needed ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/stree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 6bd99a99a65..6ddcecb4e8a 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1026,7 +1026,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st reiserfs_free_block(th, inode, block, 1); } + reiserfs_write_unlock(sb); cond_resched(); + reiserfs_write_lock(sb); if (item_moved (&s_ih, path)) { need_re_search = 1; From 148d3504c1d9f964cf14fafc46d2b7d1f0bed2b1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 May 2009 01:10:52 +0200 Subject: [PATCH 0021/2255] kill-the-BKL/reiserfs: release the write lock inside get_neighbors() get_neighbors() is used to get the left and/or right blocks against a given one in order to balance a tree. sb_bread() is used to read the buffer of these neighors blocks and while it waits for this operation, it might sleep. The bkl was released at this point, and then we can also release the write lock before calling sb_bread(). This is safe because if the filesystem is changed after this lock release, the function returns REPEAT_SEARCH (aka SCHEDULE_OCCURRED in the function header comments) in order to repeat the neighbhor research. [ Impact: release the reiserfs write lock when it is not needed ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/fix_node.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index bf5f2cbdb06..3a685e3f754 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -1971,7 +1971,9 @@ static int get_neighbors(struct tree_balance *tb, int h) tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> FL[h]); son_number = B_N_CHILD_NUM(tb->FL[h], child_position); + reiserfs_write_unlock(sb); bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2009,7 +2011,9 @@ static int get_neighbors(struct tree_balance *tb, int h) child_position = (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; son_number = B_N_CHILD_NUM(tb->FR[h], child_position); + reiserfs_write_unlock(sb); bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { From 4c5eface5d0e4eb7f77be346193c2850e7e3b983 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 May 2009 01:44:57 +0200 Subject: [PATCH 0022/2255] kill-the-BKL/reiserfs: release the write lock inside reiserfs_read_bitmap_block() reiserfs_read_bitmap_block() uses sb_bread() to read the bitmap block. This helper might sleep. Then, when the bkl was used, it was released at this point. We can then relax the write lock too here. [ Impact: release the reiserfs write lock when it is not needed ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/bitmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 147033461b8..68549570718 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1249,7 +1249,9 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, else if (bitmap == 0) block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; + reiserfs_write_unlock(sb); bh = sb_bread(sb, block); + reiserfs_write_lock(sb); if (bh == NULL) reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " "reading failed", __func__, block); From 6e3647acb4f200add1d8e0203514f7ac925ae463 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 May 2009 02:27:39 +0200 Subject: [PATCH 0023/2255] kill-the-BKL/reiserfs: release the write lock on flush_commit_list() flush_commit_list() uses ll_rw_block() to commit the pending log blocks. ll_rw_block() might sleep, and the bkl was released at this point. Then we can also relax the write lock at this point. [ Impact: release the reiserfs write lock when it is not needed ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/journal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 5cd600baf8c..ffb7f50abc2 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1120,8 +1120,11 @@ static int flush_commit_list(struct super_block *s, SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); if (tbh) { - if (buffer_dirty(tbh)) - ll_rw_block(WRITE, 1, &tbh) ; + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); + ll_rw_block(WRITE, 1, &tbh); + reiserfs_write_lock(s); + } put_bh(tbh) ; } } From e43d3f21c502dec786f2885a75e25859f18d6ffa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 May 2009 22:51:20 +0200 Subject: [PATCH 0024/2255] kill-the-BKL/reiserfs: add reiserfs_cond_resched() Usually, when we call cond_resched(), we want the write lock to be released and then reacquired once we return from scheduling. Not only does it follow the previous bkl based locking scheme, but it also let other waiters to get the lock. But if we aren't going to reschedule(), such as in !TIF_NEED_RESCHED case, it's useless to release the lock. Worse, if we release and reacquire the lock whereas it is not needed, we create useless contentions. Also if someone takes the lock while we are modifying or reading the tree, there are good chances we'll have to retry our operation, eg if the block we were seeeking has moved. So this patch introduces a helper which only unlock the write lock if we are going to schedule. [ Impact: prepare to inject less lock contention and less tree operation attempts ] Reported-by: Andi Kleen Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- include/linux/reiserfs_fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index fa5dbf307c4..27f4ecc2818 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -62,6 +62,19 @@ void reiserfs_write_unlock(struct super_block *s); int reiserfs_write_lock_once(struct super_block *s); void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); +/* + * When we schedule, we usually want to also release the write lock, + * according to the previous bkl based locking scheme of reiserfs. + */ +static inline void reiserfs_cond_resched(struct super_block *s) +{ + if (need_resched()) { + reiserfs_write_unlock(s); + schedule(); + reiserfs_write_lock(s); + } +} + struct fid; /* in reading the #defines, it may help to understand that they employ From d663af807d8bb226394cb7e02f4665f6141a8140 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 May 2009 23:25:29 +0200 Subject: [PATCH 0025/2255] kill-the-bkl/reiserfs: conditionaly release the write lock on fs_changed() The goal of fs_changed() is to check whether the tree changed during a schedule(). This is a BKL legacy. A recent patch added an explicit unconditional release/reacquire of the write lock around the cond_resched() called inside fs_changed. But it's wasteful to unconditionally do that, we are creating superfluous lock contention in !TIF_NEED_RESCHED case. This patch manage that by calling reiserfs_cond_resched() from fs_changed() which only releases the lock if we are going to reschedule. [ Impact: inject less lock contention and tree job retries ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- include/linux/reiserfs_fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 27f4ecc2818..508fb523863 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1348,9 +1348,7 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) #define __fs_changed(gen,s) (gen != get_generation (s)) #define fs_changed(gen,s) \ ({ \ - reiserfs_write_unlock(s); \ - cond_resched(); \ - reiserfs_write_lock(s); \ + reiserfs_cond_resched(s); \ __fs_changed(gen, s); \ }) From 26931309a47747fd31b2ef029c29d47794c2d93d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 May 2009 23:48:44 +0200 Subject: [PATCH 0026/2255] kill-the-bkl/reiserfs: lock only once on reiserfs_get_block() reiserfs_get_block() is one of these sites where the write lock might be acquired recursively. It's a particular problem because this function is called very often. It's a hot spot which needs to reschedule() periodically while converting direct items to indirect ones because it can take some time. Then if we are applying the write lock release/reacquire pattern on schedule() here, it may not produce the desired effect since we may have locked in more than one depth. The solution is to use reiserfs_write_lock_once() which won't try to reacquire the lock recursively. Then the lock will be *really* released before schedule(). Also, we only release the lock if TIF_NEED_RESCHED is set to not create wasteful numerous contentions. [ Impact: fix a too long holded lock case in reiserfs_get_block() ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/inode.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index cc70b56bf6f..6114050f342 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -605,6 +605,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, __le32 *item; int done; int fs_gen; + int lock_depth; struct reiserfs_transaction_handle *th = NULL; /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion @@ -620,11 +621,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, loff_t new_offset = (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); version = get_inode_item_key_version(inode); if (!file_capable(inode, block)) { - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); return -EFBIG; } @@ -636,7 +637,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* find number of block-th logical block of the file */ ret = _get_block_create_0(inode, block, bh_result, create | GET_BLOCK_READ_DIRECT); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); return ret; } /* @@ -754,7 +755,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (!dangle && th) retval = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); /* the item was found, so new blocks were not added to the file ** there is no need to make sure the inode is updated with this @@ -1005,9 +1006,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, * long time. reschedule if needed and also release the write * lock for others. */ - reiserfs_write_unlock(inode->i_sb); - cond_resched(); - reiserfs_write_lock(inode->i_sb); + if (need_resched()) { + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + schedule(); + lock_depth = reiserfs_write_lock_once(inode->i_sb); + } retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -1042,7 +1045,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, retval = err; } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); reiserfs_check_path(&path); return retval; } From b1c839bb2d8d6f1f6bf48f5c657752b4963f88f8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 May 2009 01:05:06 +0200 Subject: [PATCH 0027/2255] kill-the-bkl/reiserfs: don't hold the write recursively in reiserfs_lookup() The write lock can be acquired recursively in reiserfs_lookup(). But we may want to *really* release the lock before possible rescheduling from a reiserfs_lookup() callee. Hence we want to only acquire the lock once (ie: not recursively). [ Impact: prevent from possible false unreleased write lock on sleeping ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/namei.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 27157912863..b3973c9f0bf 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { int retval; + int lock_depth; struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); @@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) return ERR_PTR(-ENAMETOOLONG); - reiserfs_write_lock(dir->i_sb); + /* + * Might be called with or without the write lock, must be careful + * to not recursively hold it in case we want to release the lock + * before rescheduling. + */ + lock_depth = reiserfs_write_lock_once(dir->i_sb); + de.de_gen_number_bit_string = NULL; retval = reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, @@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, lock_depth); return ERR_PTR(-EACCES); } @@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; } - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, lock_depth); if (retval == IO_ERROR) { return ERR_PTR(-EIO); } From 09eb47a7c52ad535aafca889e0b936c445c375ce Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 May 2009 14:21:33 +0200 Subject: [PATCH 0028/2255] kill-the-bkl/reiserfs: reduce number of contentions in search_by_key() search_by_key() is a central function in reiserfs which searches the patch in the fs tree from the root to a node given its key. It is the function that is most requesting the write lock because it's a path very often used. Also we forget to release the lock while reading the next tree node, making us holding the lock in a wasteful way. Then we release the lock while reading the current node and its childs, all-in-one. It should be safe because we have a reference to these blocks and even if we read a block that will be concurrently changed, we have an fs_changed check later that will make us retry the path from the root. [ Impact: release the write lock while unused in a hot path ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/stree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 6ddcecb4e8a..960c9114f6d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -529,6 +529,14 @@ static void search_by_key_reada(struct super_block *s, for (i = 0; i < num; i++) { bh[i] = sb_getblk(s, b[i]); } + /* + * We are going to read some blocks on which we + * have a reference. It's safe, though we might be + * reading blocks concurrently changed if we release + * the lock. But it's still fine because we check later + * if the tree changed + */ + reiserfs_write_unlock(s); for (j = 0; j < i; j++) { /* * note, this needs attention if we are getting rid of the BKL @@ -626,10 +634,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { if (!buffer_uptodate(bh) && reada_count > 1) + /* will unlock the write lock */ search_by_key_reada(sb, reada_bh, reada_blocks, reada_count); + else + reiserfs_write_unlock(sb); ll_rw_block(READ, 1, &bh); - reiserfs_write_unlock(sb); wait_on_buffer(bh); reiserfs_write_lock(sb); if (!buffer_uptodate(bh)) From d6f5b0aa08078c3dabe377d5b1a6077e9c9352d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 May 2009 14:53:52 +0200 Subject: [PATCH 0029/2255] kill-the-bkl/reiserfs: factorize the locking in reiserfs_write_end() reiserfs_write_end() is a hot path in reiserfs. We have two wasteful write lock lock/release inside that can be gathered without changing the code logic. This patch factorizes them out in a single protected section, reducing the number of contentions inside. [ Impact: reduce lock contention in a reiserfs hotpath ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/inode.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6114050f342..853f4f6fe92 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2681,6 +2681,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, int update_sd = 0; struct reiserfs_transaction_handle *th; unsigned start; + int lock_depth = 0; + bool locked = false; if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) pos ++; @@ -2707,9 +2709,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, ** to do the i_size updates here. */ pos += copied; + if (pos > inode->i_size) { struct reiserfs_transaction_handle myth; - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; /* If the file have grown beyond the border where it can have a tail, unmark it as needing a tail packing */ @@ -2720,10 +2724,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; ret = journal_begin(&myth, inode->i_sb, 1); - if (ret) { - reiserfs_write_unlock(inode->i_sb); + if (ret) goto journal_error; - } + reiserfs_update_inode_transaction(inode); inode->i_size = pos; /* @@ -2735,34 +2738,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_update_sd(&myth, inode); update_sd = 1; ret = journal_end(&myth, inode->i_sb, 1); - reiserfs_write_unlock(inode->i_sb); if (ret) goto journal_error; } if (th) { - reiserfs_write_lock(inode->i_sb); + if (!locked) { + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; + } if (!update_sd) mark_inode_dirty(inode); ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); if (ret) goto out; } out: + if (locked) + reiserfs_write_unlock_once(inode->i_sb, lock_depth); unlock_page(page); page_cache_release(page); return ret == 0 ? copied : ret; journal_error: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + locked = false; if (th) { - reiserfs_write_lock(inode->i_sb); if (!update_sd) reiserfs_update_sd(th, inode); ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); } - goto out; } From c63e3c0b2498adec921b06c670d12c8c74b85538 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 May 2009 20:01:09 +0200 Subject: [PATCH 0030/2255] kill-the-bkl/reiserfs: use mutex_lock in reiserfs_mutex_lock_safe reiserfs_mutex_lock_safe() is a hack to avoid any dependency between an internal reiserfs mutex and the write lock, it has been proposed to follow the old bkl logic. The code does the following: while (!mutex_trylock(m)) { reiserfs_write_unlock(s); schedule(); reiserfs_write_lock(s); } It then imitate the implicit behaviour of the lock when it was a Bkl and hadn't such dependency: mutex_lock(m) { if (fastpath) let's go else { wait_for_mutex() { schedule() { unlock_kernel() reacquire_lock_kernel() } } } } The problem is that by using such explicit schedule(), we don't benefit of the adaptive mutex spinning on owner. The logic in use now is: reiserfs_write_unlock(s); mutex_lock(m); // -> possible adaptive spinning reiserfs_write_lock(s); [ Impact: restore the use of adaptive spinning mutexes in reiserfs ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/journal.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index ffb7f50abc2..e9a972bd032 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -566,11 +566,9 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, static inline void reiserfs_mutex_lock_safe(struct mutex *m, struct super_block *s) { - while (!mutex_trylock(m)) { - reiserfs_write_unlock(s); - schedule(); - reiserfs_write_lock(s); - } + reiserfs_write_unlock(s); + mutex_lock(m); + reiserfs_write_lock(s); } /* lock the current transaction */ From 2ac626955ed62ee8596f00581f959cc86e6198d1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 14 May 2009 02:56:39 +0200 Subject: [PATCH 0031/2255] kill-the-bkl/reiserfs: unlock only when needed in search_by_key search_by_key() is the site which most requires the lock. This is mostly because it is a very central function and also because it releases/reaqcuires the write lock at least once each time it is called. Such release/reacquire creates a lot of contention in this place and also opens more the window which let another thread changing the tree. When it happens, the current path searching over the tree must be retried from the beggining (the root) which is a wasteful and time consuming recovery. This patch factorizes two release/reacquire sequences: - reading leaf nodes blocks - reading current block The latter immediately follows the former. The whole sequence is safe as a single unlocked section because we check just after if the tree has changed during these operations. Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/stree.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 960c9114f6d..6b025a42d51 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -519,12 +519,22 @@ static int is_tree_node(struct buffer_head *bh, int level) #define SEARCH_BY_KEY_READA 16 -/* The function is NOT SCHEDULE-SAFE! */ -static void search_by_key_reada(struct super_block *s, +/* + * The function is NOT SCHEDULE-SAFE! + * It might unlock the write lock if we needed to wait for a block + * to be read. Note that in this case it won't recover the lock to avoid + * high contention resulting from too much lock requests, especially + * the caller (search_by_key) will perform other schedule-unsafe + * operations just after calling this function. + * + * @return true if we have unlocked + */ +static bool search_by_key_reada(struct super_block *s, struct buffer_head **bh, b_blocknr_t *b, int num) { int i, j; + bool unlocked = false; for (i = 0; i < num; i++) { bh[i] = sb_getblk(s, b[i]); @@ -536,16 +546,21 @@ static void search_by_key_reada(struct super_block *s, * the lock. But it's still fine because we check later * if the tree changed */ - reiserfs_write_unlock(s); for (j = 0; j < i; j++) { /* * note, this needs attention if we are getting rid of the BKL * you have to make sure the prepared bit isn't set on this buffer */ - if (!buffer_uptodate(bh[j])) + if (!buffer_uptodate(bh[j])) { + if (!unlocked) { + reiserfs_write_unlock(s); + unlocked = true; + } ll_rw_block(READA, 1, bh + j); + } brelse(bh[j]); } + return unlocked; } /************************************************************************** @@ -633,15 +648,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s have a pointer to it. */ if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { + bool unlocked = false; + if (!buffer_uptodate(bh) && reada_count > 1) - /* will unlock the write lock */ - search_by_key_reada(sb, reada_bh, + /* may unlock the write lock */ + unlocked = search_by_key_reada(sb, reada_bh, reada_blocks, reada_count); - else + /* + * If we haven't already unlocked the write lock, + * then we need to do that here before reading + * the current block + */ + if (!buffer_uptodate(bh) && !unlocked) { reiserfs_write_unlock(sb); + unlocked = true; + } ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); - reiserfs_write_lock(sb); + + if (unlocked) + reiserfs_write_lock(sb); if (!buffer_uptodate(bh)) goto io_error; } else { From c72e05756b900b3be24cd73a16de52bab80984c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 18:12:08 +0200 Subject: [PATCH 0032/2255] kill-the-bkl/reiserfs: acquire the inode mutex safely While searching a pathname, an inode mutex can be acquired in do_lookup() which calls reiserfs_lookup() which in turn acquires the write lock. On the other side reiserfs_fill_super() can acquire the write_lock and then call reiserfs_lookup_privroot() which can acquire an inode mutex (the root of the mount point). So we theoretically risk an AB - BA lock inversion that could lead to a deadlock. As for other lock dependencies found since the bkl to mutex conversion, the fix is to use reiserfs_mutex_lock_safe() which drops the lock dependency to the write lock. [ Impact: fix a possible deadlock with reiserfs ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/journal.c | 34 ---------------------------------- fs/reiserfs/xattr.c | 4 ++-- include/linux/reiserfs_fs.h | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e9a972bd032..d23d6d7a45a 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -537,40 +537,6 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, journal_hash(table, cn->sb, cn->blocknr) = cn; } -/* - * Several mutexes depend on the write lock. - * However sometimes we want to relax the write lock while we hold - * these mutexes, according to the release/reacquire on schedule() - * properties of the Bkl that were used. - * Reiserfs performances and locking were based on this scheme. - * Now that the write lock is a mutex and not the bkl anymore, doing so - * may result in a deadlock: - * - * A acquire write_lock - * A acquire j_commit_mutex - * A release write_lock and wait for something - * B acquire write_lock - * B can't acquire j_commit_mutex and sleep - * A can't acquire write lock anymore - * deadlock - * - * What we do here is avoiding such deadlock by playing the same game - * than the Bkl: if we can't acquire a mutex that depends on the write lock, - * we release the write lock, wait a bit and then retry. - * - * The mutexes concerned by this hack are: - * - The commit mutex of a journal list - * - The flush mutex - * - The journal lock - */ -static inline void reiserfs_mutex_lock_safe(struct mutex *m, - struct super_block *s) -{ - reiserfs_write_unlock(s); - mutex_lock(m); - reiserfs_write_lock(s); -} - /* lock the current transaction */ static inline void lock_journal(struct super_block *sb) { diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6925b835a43..59870a4751c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&s->s_root->d_inode->i_mutex); + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1011,7 +1011,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (privroot->d_inode) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&privroot->d_inode->i_mutex); + reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; dentry = lookup_one_len(XAROOT_NAME, privroot, diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 508fb523863..a498d9266d8 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -62,6 +62,41 @@ void reiserfs_write_unlock(struct super_block *s); int reiserfs_write_lock_once(struct super_block *s); void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); +/* + * Several mutexes depend on the write lock. + * However sometimes we want to relax the write lock while we hold + * these mutexes, according to the release/reacquire on schedule() + * properties of the Bkl that were used. + * Reiserfs performances and locking were based on this scheme. + * Now that the write lock is a mutex and not the bkl anymore, doing so + * may result in a deadlock: + * + * A acquire write_lock + * A acquire j_commit_mutex + * A release write_lock and wait for something + * B acquire write_lock + * B can't acquire j_commit_mutex and sleep + * A can't acquire write lock anymore + * deadlock + * + * What we do here is avoiding such deadlock by playing the same game + * than the Bkl: if we can't acquire a mutex that depends on the write lock, + * we release the write lock, wait a bit and then retry. + * + * The mutexes concerned by this hack are: + * - The commit mutex of a journal list + * - The flush mutex + * - The journal lock + * - The inode mutex + */ +static inline void reiserfs_mutex_lock_safe(struct mutex *m, + struct super_block *s) +{ + reiserfs_write_unlock(s); + mutex_lock(m); + reiserfs_write_lock(s); +} + /* * When we schedule, we usually want to also release the write lock, * according to the previous bkl based locking scheme of reiserfs. From 08f14fc8963e585e65b71212ce8050607b9b6c36 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 19:10:38 +0200 Subject: [PATCH 0033/2255] kill-the-bkl/reiserfs: move the concurrent tree accesses checks per superblock When do_balance() balances the tree, a trick is performed to provide the ability for other tree writers/readers to check whether do_balance() is executing concurrently (requires CONFIG_REISERFS_CHECK). This is done to protect concurrent accesses to the tree. The trick is the following: When do_balance is called, a unique global variable called cur_tb takes a pointer to the current tree to be rebalanced. Once do_balance finishes its work, cur_tb takes the NULL value. Then, concurrent tree readers/writers just have to check the value of cur_tb to ensure do_balance isn't executing concurrently. If it is, then it proves that schedule() occured on do_balance(), which then relaxed the bkl that protected the tree. Now that the bkl has be turned into a mutex, this check is still fine even though do_balance() becomes preemptible: the write lock will not be automatically released on schedule(), so the tree is still protected. But this is only fine if we have a single reiserfs mountpoint. Indeed, because the bkl is a global lock, it didn't allowed concurrent executions between a tree reader/writer in a mount point and a do_balance() on another tree from another mountpoint. So assuming all these readers/writers weren't supposed to be reentrant, the current check now sometimes detect false positives with the current per-superblock mutex which allows this reentrancy. This patch keeps the concurrent tree accesses check but moves it per superblock, so that only trees from a same mount point are checked to be not accessed concurrently. [ Impact: fix spurious panic while running several reiserfs mount-points ] Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Signed-off-by: Frederic Weisbecker --- fs/reiserfs/do_balan.c | 17 +++++------------ fs/reiserfs/fix_node.c | 5 +---- fs/reiserfs/prints.c | 4 ---- fs/reiserfs/stree.c | 5 +---- include/linux/reiserfs_fs_sb.h | 11 +++++++++++ 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 128d3f7c8aa..60c08044066 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -21,14 +21,6 @@ #include #include -#ifdef CONFIG_REISERFS_CHECK - -struct tree_balance *cur_tb = NULL; /* detects whether more than one - copy of tb exists as a means - of checking whether schedule - is interrupting do_balance */ -#endif - static inline void buffer_info_init_left(struct tree_balance *tb, struct buffer_info *bi) { @@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb) { int retval = 0; - if (cur_tb) { + if (REISERFS_SB(tb->tb_sb)->cur_tb) { reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " "occurred based on cur_tb not being null at " "this point in code. do_balance cannot properly " - "handle schedule occurring while it runs."); + "handle concurrent tree accesses on a same " + "mount point."); } /* double check that buffers that we will modify are unlocked. (fix_nodes should already have @@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb) "check");*/ RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); #ifdef CONFIG_REISERFS_CHECK - cur_tb = tb; + REISERFS_SB(tb->tb_sb)->cur_tb = tb; #endif } @@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb) #ifdef CONFIG_REISERFS_CHECK check_leaf_level(tb); check_internal_levels(tb); - cur_tb = NULL; + REISERFS_SB(tb->tb_sb)->cur_tb = NULL; #endif /* reiserfs_free_block is no longer schedule safe. So, we need to diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 3a685e3f754..d2f31330dca 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, return needed_nodes; } -#ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance *cur_tb; -#endif /* Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, @@ -2368,7 +2365,7 @@ int fix_nodes(int op_mode, struct tree_balance *tb, return REPEAT_SEARCH; } #ifdef CONFIG_REISERFS_CHECK - if (cur_tb) { + if (REISERFS_SB(tb->tb_sb)->cur_tb) { print_cur_tb("fix_nodes"); reiserfs_panic(tb->tb_sb, "PAP-8305", "there is pending do_balance"); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 536eacaeb71..adbc6f53851 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) . */ -#ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance *cur_tb; -#endif - void __reiserfs_panic(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) { diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 6b025a42d51..5fa7118f04e 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -222,9 +222,6 @@ static inline int bin_search(const void *key, /* Key to search for. */ return ITEM_NOT_FOUND; } -#ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance *cur_tb; -#endif /* Minimal possible key. It is never in the tree. */ const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; @@ -711,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s !key_in_buffer(search_path, key, sb), "PAP-5130: key is not in the buffer"); #ifdef CONFIG_REISERFS_CHECK - if (cur_tb) { + if (REISERFS_SB(sb)->cur_tb) { print_cur_tb("5140"); reiserfs_panic(sb, "PAP-5140", "schedule occurred in do_balance!"); diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 045c3721367..52c83b6a758 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -417,6 +417,17 @@ struct reiserfs_sb_info { char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; #endif +#ifdef CONFIG_REISERFS_CHECK + + struct tree_balance *cur_tb; /* + * Detects whether more than one + * copy of tb exists per superblock + * as a means of checking whether + * do_balance is executing concurrently + * against another tree reader/writer + * on a same mount point. + */ +#endif }; /* Definitions of reiserfs on-disk properties: */ From ae635c0bbd6c10aa62bf5149c6f41add59fbf4d2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Aug 2009 02:24:45 +0200 Subject: [PATCH 0034/2255] kill-the-bkl/reiserfs: fix "reiserfs lock" / "inode mutex" lock inversion dependency reiserfs_xattr_init is called with the reiserfs write lock held, but if the ".reiserfs_priv" entry is not created, we take the superblock root directory inode mutex until .reiserfs_priv is created. This creates a lock dependency inversion against other sites such as reiserfs_file_release() which takes an inode mutex and the reiserfs lock after. Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Cc: Laurent Riffard --- fs/reiserfs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 59870a4751c..58aa8e75f7f 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1004,7 +1004,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { - mutex_lock(&s->s_root->d_inode->i_mutex); + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); err = create_privroot(REISERFS_SB(s)->priv_root); mutex_unlock(&s->s_root->d_inode->i_mutex); } From b10ab4c337a600456ed2d9daea0331016f7cdeeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Aug 2009 02:44:21 +0200 Subject: [PATCH 0035/2255] kill-the-bkl/reiserfs: fix recursive reiserfs lock in reiserfs_mkdir() reiserfs_mkdir() acquires the reiserfs lock, assuming it has been called from the dir inodes callbacks, without the lock held. But it can also be called from other internal sites such as reiserfs_xattr_init() which already holds the lock. This recursive locking leads to further wrong assumptions. For example, later calls to reiserfs_mutex_lock_safe() won't actually unlock the reiserfs lock the time we acquire a given mutex, creating unexpected lock inversions. Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Cc: Laurent Riffard --- fs/reiserfs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index b3973c9f0bf..e296ff72a6c 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -732,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; + int lock_depth; /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + @@ -755,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return retval; } jbegin_count += retval; - reiserfs_write_lock(dir->i_sb); + lock_depth = reiserfs_write_lock_once(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { @@ -805,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); - out_failed: - reiserfs_write_unlock(dir->i_sb); +out_failed: + reiserfs_write_unlock_once(dir->i_sb, lock_depth); return retval; } From 7e94277050e31aa4204060f03953bba72598cf7d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Aug 2009 03:38:12 +0200 Subject: [PATCH 0036/2255] kill-the-bkl/reiserfs: fix recursive reiserfs write lock in reiserfs_commit_write() reiserfs_commit_write() is always called with the write lock held. Thus the current calls to reiserfs_write_lock() in this function are acquiring the lock recursively. We can safely drop them. This also solves further assumptions for this lock to be really released while calling reiserfs_write_unlock(). Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Cc: Laurent Riffard --- fs/reiserfs/inode.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 853f4f6fe92..965c8eaadb1 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2795,7 +2795,6 @@ int reiserfs_commit_write(struct file *f, struct page *page, */ if (pos > inode->i_size) { struct reiserfs_transaction_handle myth; - reiserfs_write_lock(inode->i_sb); /* If the file have grown beyond the border where it can have a tail, unmark it as needing a tail packing */ @@ -2806,10 +2805,9 @@ int reiserfs_commit_write(struct file *f, struct page *page, REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; ret = journal_begin(&myth, inode->i_sb, 1); - if (ret) { - reiserfs_write_unlock(inode->i_sb); + if (ret) goto journal_error; - } + reiserfs_update_inode_transaction(inode); inode->i_size = pos; /* @@ -2821,16 +2819,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, reiserfs_update_sd(&myth, inode); update_sd = 1; ret = journal_end(&myth, inode->i_sb, 1); - reiserfs_write_unlock(inode->i_sb); if (ret) goto journal_error; } if (th) { - reiserfs_write_lock(inode->i_sb); if (!update_sd) mark_inode_dirty(inode); ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); if (ret) goto out; } @@ -2840,11 +2835,9 @@ int reiserfs_commit_write(struct file *f, struct page *page, journal_error: if (th) { - reiserfs_write_lock(inode->i_sb); if (!update_sd) reiserfs_update_sd(th, inode); ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); } return ret; From 80503185989b2dd84170bb842e23d3fd45ebdf40 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Aug 2009 04:18:06 +0200 Subject: [PATCH 0037/2255] kill-the-bkl/reiserfs: panic in case of lock imbalance Until now, trying to unlock the reiserfs write lock whereas the current task doesn't hold it lead to a simple warning. We should actually warn and panic in this case to avoid the user datas to reach an unstable state. Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Cc: Laurent Riffard --- fs/reiserfs/lock.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index cb1bba3802d..ee2cfc0fd8a 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -37,11 +37,10 @@ void reiserfs_write_unlock(struct super_block *s) /* * Are we unlocking without even holding the lock? - * Such a situation could even raise a BUG() if we don't - * want the data become corrupted + * Such a situation must raise a BUG() if we don't want + * to corrupt the data. */ - WARN_ONCE(sb_i->lock_owner != current, - "Superblock write lock imbalance"); + BUG_ON(sb_i->lock_owner != current); if (--sb_i->lock_depth == -1) { sb_i->lock_owner = NULL; From 193be0ee17dd7ea309ddab1093da17e5924d7f36 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Sep 2009 05:31:37 +0200 Subject: [PATCH 0038/2255] kill-the-bkl/reiserfs: Fix induced mm->mmap_sem to sysfs_mutex dependency Alexander Beregalov reported the following warning: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.31-03149-gdcc030a #1 ------------------------------------------------------- udevadm/716 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x4a/0xa0 but task is already holding lock: (sysfs_mutex){+.+.+.}, at: [] sysfs_readdir+0x5a/0x200 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (sysfs_mutex){+.+.+.}: [...] -> #2 (&bdev->bd_mutex){+.+.+.}: [...] -> #1 (&REISERFS_SB(s)->lock){+.+.+.}: [...] -> #0 (&mm->mmap_sem){++++++}: [...] On reiserfs mount path, we take the reiserfs lock and while initializing the journal, we open the device, taking the bdev->bd_mutex. Then rescan_partition() may signal the change to sysfs. We have then the following dependency: reiserfs_lock -> bd_mutex -> sysfs_mutex Later, while entering reiserfs_readpage() after a pagefault in an mmaped reiserfs file, we are holding the mm->mmap_sem, and we are going to take the reiserfs lock too. We have then the following dependency: mm->mmap_sem -> reiserfs_lock which, expanded with the previous dependency gives us: mm->mmap_sem -> reiserfs_lock -> bd_mutex -> sysfs_mutex Now while entering the sysfs readdir path, we are holding the sysfs_mutex. And when we copy a directory entry to the user buffer, we might fault and then take the mm->mmap_sem lock. Which leads to the circular locking dependency reported. We can fix that by relaxing the reiserfs lock during the call to journal_init_dev(), which is the place where we open the mounted device. This is fine to relax the lock here because we are in the begining of the reiserfs mount path and there is nothing to protect at this time, the journal is not intialized. We just keep this lock around for paranoid reasons. Reported-by: Alexander Beregalov Tested-by: Alexander Beregalov Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Cc: Laurent Riffard --- fs/reiserfs/journal.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d23d6d7a45a..04e3c42a085 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2801,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } + /* + * We need to unlock here to avoid creating the following + * dependency: + * reiserfs_lock -> sysfs_mutex + * Because the reiserfs mmap path creates the following dependency: + * mm->mmap -> reiserfs_lock, hence we have + * mm->mmap -> reiserfs_lock ->sysfs_mutex + * This would ends up in a circular dependency with sysfs readdir path + * which does sysfs_mutex -> mm->mmap_sem + * This is fine because the reiserfs lock is useless in mount path, + * at least until we call journal_begin. We keep it for paranoid + * reasons. + */ + reiserfs_write_unlock(sb); if (journal_init_dev(sb, journal, j_dev_name) != 0) { + reiserfs_write_lock(sb); reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } + reiserfs_write_lock(sb); rs = SB_DISK_SUPER_BLOCK(sb); From a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601 Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 2 Oct 2009 16:17:53 -0700 Subject: [PATCH 0039/2255] time: Implement logarithmic time accumulation Accumulating one tick at a time works well unless we're using NOHZ. Then it can be an issue, since we may have to run through the loop a few thousand times, which can increase timer interrupt caused latency. The current solution was to accumulate in half-second intervals with NOHZ. This kept the number of loops down, however it did slightly change how we make NTP adjustments. While not an issue with NTPd users, as NTPd makes adjustments over a longer period of time, other adjtimex() users have noticed the half-second granularity with which we can apply frequency changes to the clock. For instance, if a application tries to apply a 100ppm frequency correction for 20ms to correct a 2us offset, with NOHZ they either get no correction, or a 50us correction. Now, there will always be some granularity error for applying frequency corrections. However with users sensitive to this error have seen a 50-500x increase with NOHZ compared to running without NOHZ. So I figured I'd try another approach then just simply increasing the interval. My approach is to consume the time interval logarithmically. This reduces the number of times through the loop needed keeping latency down, while still preserving the original granularity error for adjtimex() changes. Further, this change allows us to remove the xtime_cache code (patch to follow), as xtime is always within one tick of the current time, instead of the half-second updates it saw before. An earlier version of this patch has been shipping to x86 users in the RedHat MRG releases for awhile without issue, but I've reworked this version to be even more careful about avoiding possible overflows if the shift value gets too large. Signed-off-by: John Stultz Acked-by: Thomas Gleixner Reviewed-by: John Kacur Cc: Clark Williams Cc: Martin Schwidefsky Cc: Andrew Morton LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/timex.h | 4 -- kernel/time/timekeeping.c | 85 +++++++++++++++++++++++++++------------ 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index e6967d10d9e..0c0ef7d4db7 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -261,11 +261,7 @@ static inline int ntp_synced(void) #define NTP_SCALE_SHIFT 32 -#ifdef CONFIG_NO_HZ -#define NTP_INTERVAL_FREQ (2) -#else #define NTP_INTERVAL_FREQ (HZ) -#endif #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ec..5fdd78e0858 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset) timekeeper.ntp_error_shift; } + +/** + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * into a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +{ + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < timekeeper.cycle_interval<cycle_last += timekeeper.cycle_interval << shift; + + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; + while (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; + xtime.tv_sec++; + second_overflow(); + } + + /* Accumulate into raw time */ + raw_time.tv_nsec += timekeeper.raw_interval << shift;; + while (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; + } + + /* Accumulate error between NTP and clock interval */ + timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error -= timekeeper.xtime_interval << + (timekeeper.ntp_error_shift + shift); + + return offset; +} + + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -731,6 +776,7 @@ void update_wall_time(void) struct clocksource *clock; cycle_t offset; u64 nsecs; + int shift = 0, maxshift; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -744,33 +790,22 @@ void update_wall_time(void) #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller then the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. */ + shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = max(0, shift); + /* Bound shift to one less then what overflows tick_length */ + maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; - - /* accumulate one interval */ - offset -= timekeeper.cycle_interval; - clock->cycle_last += timekeeper.cycle_interval; - - timekeeper.xtime_nsec += timekeeper.xtime_interval; - if (timekeeper.xtime_nsec >= nsecps) { - timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; - second_overflow(); - } - - raw_time.tv_nsec += timekeeper.raw_interval; - if (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; - } - - /* accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length; - timekeeper.ntp_error -= timekeeper.xtime_interval << - timekeeper.ntp_error_shift; + offset = logarithmic_accumulation(offset, shift); + shift--; } /* correct the clock when NTP error is too big */ From 7bc7d637452383d56ba4368d4336b0dde1bb476d Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 2 Oct 2009 16:24:15 -0700 Subject: [PATCH 0040/2255] time: Remove xtime_cache With the prior logarithmic time accumulation patch, xtime will now always be within one "tick" of the current time, instead of possibly half a second off. This removes the need for the xtime_cache value, which always stored the time at the last interrupt, so this patch cleans that up removing the xtime_cache related code. This is a bit simpler, but still could use some wider testing. Signed-off-by: John Stultz Acked-by: Thomas Gleixner Reviewed-by: John Kacur Cc: Clark Williams Cc: Martin Schwidefsky Cc: Andrew Morton LKML-Reference: <1254525855.7741.95.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/time.c | 1 - kernel/time/timekeeping.c | 27 ++++----------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/kernel/time.c b/kernel/time.c index 2e2e469a7fe..2ef4fe2079b 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,7 +136,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fdd78e0858..96b3f0dfa5d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -164,13 +164,6 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -331,8 +324,6 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); @@ -547,7 +538,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -581,7 +571,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -721,7 +710,6 @@ static void timekeeping_adjust(s64 offset) timekeeper.ntp_error_shift; } - /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -765,7 +753,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) return offset; } - /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -775,7 +762,6 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -841,9 +827,6 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); - /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock); } @@ -880,13 +863,13 @@ void monotonic_to_bootbased(struct timespec *ts) unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -896,8 +879,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -911,8 +893,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); From 48f6ba5e691948caba2e7bc362153fb28e4f1e09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 5 Oct 2009 16:31:37 +0200 Subject: [PATCH 0041/2255] kill-the-bkl/reiserfs: fix reiserfs lock to cpu_add_remove_lock dependency While creating the reiserfs workqueue during the journal initialization, we are holding the reiserfs lock, but create_workqueue() also holds the cpu_add_remove_lock, creating then the following dependency: - reiserfs lock -> cpu_add_remove_lock But we also have the following existing dependencies: - mm->mmap_sem -> reiserfs lock - cpu_add_remove_lock -> cpu_hotplug.lock -> slub_lock -> sysfs_mutex The merged dependency chain then becomes: - mm->mmap_sem -> reiserfs lock -> cpu_add_remove_lock -> cpu_hotplug.lock -> slub_lock -> sysfs_mutex But when we fill a dir entry in sysfs_readir(), we are holding the sysfs_mutex and we also might fault while copying the directory entry to the user, leading to the following dependency: - sysfs_mutex -> mm->mmap_sem The end result is then a lock inversion between sysfs_mutex and mm->mmap_sem, as reported in the following lockdep warning: [ INFO: possible circular locking dependency detected ] 2.6.31-07095-g25a3912 #4 ------------------------------------------------------- udevadm/790 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x72/0xc0 but task is already holding lock: (sysfs_mutex){+.+.+.}, at: [] sysfs_readdir+0x7c/0x260 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #5 (sysfs_mutex){+.+.+.}: [...] -> #4 (slub_lock){+++++.}: [...] -> #3 (cpu_hotplug.lock){+.+.+.}: [...] -> #2 (cpu_add_remove_lock){+.+.+.}: [...] -> #1 (&REISERFS_SB(s)->lock){+.+.+.}: [...] -> #0 (&mm->mmap_sem){++++++}: [...] This can be fixed by relaxing the reiserfs lock while creating the workqueue. This is fine to relax the lock here, we just keep it around to pass through reiserfs lock checks and for paranoid reasons. Reported-by: Alexander Beregalov Tested-by: Alexander Beregalov Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: Chris Mason Cc: Ingo Molnar Cc: Alexander Beregalov Cc: Laurent Riffard --- fs/reiserfs/journal.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 04e3c42a085..2f8a7e7b8da 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2933,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name, } reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) + if (reiserfs_mounted_fs_count <= 1) { + reiserfs_write_unlock(sb); commit_wq = create_workqueue("reiserfs"); + reiserfs_write_lock(sb); + } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; From ba69ea42f8ba8286cbe0e939bd1ce781b7905b84 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 8 Oct 2009 13:23:08 -0400 Subject: [PATCH 0042/2255] Fix toogle whether xenbus driver should be built as module or part of kernel. The "select" statement in the Kconfig ensures that all other users of the xenbus will have the same dependency. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jeremy Fitzhardinge --- drivers/input/Kconfig | 1 + drivers/video/Kconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig index cd50c00ab20..b8e0424d0f8 100644 --- a/drivers/input/Kconfig +++ b/drivers/input/Kconfig @@ -153,6 +153,7 @@ config XEN_KBDDEV_FRONTEND tristate "Xen virtual keyboard and mouse support" depends on XEN_FBDEV_FRONTEND default y + select XEN_XENBUS_FRONTEND help This driver implements the front-end of the Xen virtual keyboard and mouse device driver. It communicates with a back-end diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 3b54b394017..1b332d182a6 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -2070,6 +2070,7 @@ config XEN_FBDEV_FRONTEND select FB_SYS_IMAGEBLIT select FB_SYS_FOPS select FB_DEFERRED_IO + select XEN_XENBUS_FRONTEND default y help This driver implements the front-end of the Xen virtual From a6f05a6a0a1713d5b019f096799d49226807d3df Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 8 Oct 2009 18:02:54 -0700 Subject: [PATCH 0043/2255] x86-64: make compat_start_thread() match start_thread() For no real good reason, compat_start_thread() was embedded inline in whereas the native start_thread() lives in process_*.c. Move compat_start_thread() to process_64.c, remove gratuitious differences, and fix a few items which mostly look like bit rot. In particular, compat_start_thread() didn't do free_thread_xstate(), which means it was hanging on to the xstate store area even when it was not needed. It was also not setting old_rsp, but it looks like that generally shouldn't matter for a 32-bit process. Note: compat_start_thread *has* to be a macro, since it is tested with start_thread_ia32() as the out of line function name. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha --- arch/x86/include/asm/elf.h | 20 ++------------------ arch/x86/kernel/process_64.c | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 456a304b817..8a024babe5e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -157,19 +157,6 @@ do { \ #define compat_elf_check_arch(x) elf_check_arch_ia32(x) -static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp) -{ - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - regs->ip = ip; - regs->sp = sp; - regs->flags = X86_EFLAGS_IF; - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; -} - static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { @@ -191,11 +178,8 @@ do { \ #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -#define compat_start_thread(regs, ip, sp) \ -do { \ - start_ia32_thread(regs, ip, sp); \ - set_fs(USER_DS); \ -} while (0) +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread start_thread_ia32 #define COMPAT_SET_PERSONALITY(ex) \ do { \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b68317..7cf0a6b6d4b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -356,7 +356,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) percpu_write(old_rsp, new_sp); regs->cs = __USER_CS; regs->ss = __USER_DS; - regs->flags = 0x200; + regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* * Free the old FP and other extended state @@ -365,6 +365,27 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + percpu_write(old_rsp, new_sp); + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + regs->flags = X86_EFLAGS_IF; + set_fs(USER_DS); + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +#endif + /* * switch_to(x,y) should switch tasks from x to y. * From e634d8fc792c66c3d4ff45518c04848c1e28f221 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 9 Oct 2009 15:56:53 -0700 Subject: [PATCH 0044/2255] x86-64: merge the standard and compat start_thread() functions The only thing left that differs between the standard and compat start_thread functions is the actual segment numbers and the prototype, so have a single common function which contains the guts and two very small wrappers. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha --- arch/x86/kernel/process_64.c | 39 ++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7cf0a6b6d4b..eb261c582a4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -344,18 +344,20 @@ out: return err; } -void -start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) { loadsegment(fs, 0); - loadsegment(es, 0); - loadsegment(ds, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); - regs->cs = __USER_CS; - regs->ss = __USER_DS; + regs->cs = _cs; + regs->ss = _ss; regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* @@ -363,26 +365,19 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) */ free_thread_xstate(current); } -EXPORT_SYMBOL_GPL(start_thread); + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} #ifdef CONFIG_IA32_EMULATION void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - percpu_write(old_rsp, new_sp); - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; - regs->flags = X86_EFLAGS_IF; - set_fs(USER_DS); - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); } #endif From a469f627c15de2af392be23508e6094d7268e2b7 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Sat, 3 Oct 2009 11:21:30 +0000 Subject: [PATCH 0045/2255] SH: add support for the RJ54N1CB0C camera for the kfr2r09 platform Signed-off-by: Guennadi Liakhovetski Signed-off-by: Paul Mundt --- arch/sh/boards/mach-kfr2r09/setup.c | 139 ++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index c08d33fe210..ce01d6a953b 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include