From 9f67675a249f6f993edd41bc084827411366a652 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 18 Dec 2012 14:21:18 -0800 Subject: [PATCH 01/48] backlight: locomolcd: fix checkpatch error and warning This patch fixes the checkpatch error and warning as below: WARNING: space prohibited between function name and open parenthesis '(' ERROR: trailing statements should be on next line Also, long comments are fixed for the preferred style and unnecessary lines are removed. Signed-off-by: Jingoo Han Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/locomolcd.c | 38 +++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/drivers/video/backlight/locomolcd.c b/drivers/video/backlight/locomolcd.c index 3a6d5419e3e..146fea8aa43 100644 --- a/drivers/video/backlight/locomolcd.c +++ b/drivers/video/backlight/locomolcd.c @@ -107,7 +107,6 @@ void locomolcd_power(int on) } EXPORT_SYMBOL(locomolcd_power); - static int current_intensity; static int locomolcd_set_intensity(struct backlight_device *bd) @@ -122,13 +121,25 @@ static int locomolcd_set_intensity(struct backlight_device *bd) intensity = 0; switch (intensity) { - /* AC and non-AC are handled differently, but produce same results in sharp code? */ - case 0: locomo_frontlight_set(locomolcd_dev, 0, 0, 161); break; - case 1: locomo_frontlight_set(locomolcd_dev, 117, 0, 161); break; - case 2: locomo_frontlight_set(locomolcd_dev, 163, 0, 148); break; - case 3: locomo_frontlight_set(locomolcd_dev, 194, 0, 161); break; - case 4: locomo_frontlight_set(locomolcd_dev, 194, 1, 161); break; - + /* + * AC and non-AC are handled differently, + * but produce same results in sharp code? + */ + case 0: + locomo_frontlight_set(locomolcd_dev, 0, 0, 161); + break; + case 1: + locomo_frontlight_set(locomolcd_dev, 117, 0, 161); + break; + case 2: + locomo_frontlight_set(locomolcd_dev, 163, 0, 148); + break; + case 3: + locomo_frontlight_set(locomolcd_dev, 194, 0, 161); + break; + case 4: + locomo_frontlight_set(locomolcd_dev, 194, 1, 161); + break; default: return -ENODEV; } @@ -175,9 +186,11 @@ static int locomolcd_probe(struct locomo_dev *ldev) locomo_gpio_set_dir(ldev->dev.parent, LOCOMO_GPIO_FL_VR, 0); - /* the poodle_lcd_power function is called for the first time + /* + * the poodle_lcd_power function is called for the first time * from fs_initcall, which is before locomo is activated. - * We need to recall poodle_lcd_power here*/ + * We need to recall poodle_lcd_power here + */ if (machine_is_poodle()) locomolcd_power(1); @@ -190,8 +203,8 @@ static int locomolcd_probe(struct locomo_dev *ldev) &ldev->dev, NULL, &locomobl_data, &props); - if (IS_ERR (locomolcd_bl_device)) - return PTR_ERR (locomolcd_bl_device); + if (IS_ERR(locomolcd_bl_device)) + return PTR_ERR(locomolcd_bl_device); /* Set up frontlight so that screen is readable */ locomolcd_bl_device->props.brightness = 2; @@ -226,7 +239,6 @@ static struct locomo_driver poodle_lcd_driver = { .resume = locomolcd_resume, }; - static int __init locomolcd_init(void) { return locomo_driver_register(&poodle_lcd_driver); From c24bf9b4cc6a0f330ea355d73bfdf1dae7e63a05 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 18 Dec 2012 14:21:19 -0800 Subject: [PATCH 02/48] CRIS: fix I/O macros The inb/outb macros for CRIS are broken from a number of points of view, missing () around parameters and they have an unprotected if statement in them. This was breaking the compile of IPMI on CRIS and thus I was being annoyed by build regressions, so I fixed them. Plus I don't think they would have worked at all, since the data values were missing "&" and the outsl had a "3" instead of a "4" for the size. From what I can tell, this stuff is not used at all, so this can't be any more broken than it was before, anyway. Signed-off-by: Corey Minyard Cc: Jesper Nilsson Cc: Mikael Starvik Acked-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/include/asm/io.h | 39 ++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/arch/cris/include/asm/io.h b/arch/cris/include/asm/io.h index 32567bc2a42..ac12ae2b928 100644 --- a/arch/cris/include/asm/io.h +++ b/arch/cris/include/asm/io.h @@ -133,12 +133,39 @@ static inline void writel(unsigned int b, volatile void __iomem *addr) #define insb(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,1,count) : 0) #define insw(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,2,count) : 0) #define insl(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,4,count) : 0) -#define outb(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,1,1) -#define outw(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,2,1) -#define outl(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,4,1) -#define outsb(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,1,count) -#define outsw(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,2,count) -#define outsl(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,3,count) +static inline void outb(unsigned char data, unsigned int port) +{ + if (cris_iops) + cris_iops->write_io(port, (void *) &data, 1, 1); +} +static inline void outw(unsigned short data, unsigned int port) +{ + if (cris_iops) + cris_iops->write_io(port, (void *) &data, 2, 1); +} +static inline void outl(unsigned int data, unsigned int port) +{ + if (cris_iops) + cris_iops->write_io(port, (void *) &data, 4, 1); +} +static inline void outsb(unsigned int port, const void *addr, + unsigned long count) +{ + if (cris_iops) + cris_iops->write_io(port, (void *)addr, 1, count); +} +static inline void outsw(unsigned int port, const void *addr, + unsigned long count) +{ + if (cris_iops) + cris_iops->write_io(port, (void *)addr, 2, count); +} +static inline void outsl(unsigned int port, const void *addr, + unsigned long count) +{ + if (cris_iops) + cris_iops->write_io(port, (void *)addr, 4, count); +} /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem From 88d67ee3ec52d19e52ae58ef4303464720215d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sh=C3=A9rab?= Date: Tue, 18 Dec 2012 14:21:21 -0800 Subject: [PATCH 03/48] arch/x86/platform/iris/iris.c: register a platform device and a platform driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the iris driver use the platform API, so it is properly exposed in /sys. [akpm@linux-foundation.org: remove commented-out code, add missing space to printk, clean up code layout] Signed-off-by: Shérab Cc: Len Brown Cc: Matthew Garrett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/platform/iris/iris.c | 67 +++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 5917eb56b31..e6cb80f620a 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -62,29 +63,75 @@ static void iris_power_off(void) * by reading its input port and seeing whether the read value is * meaningful. */ -static int iris_init(void) +static int iris_probe(struct platform_device *pdev) { - unsigned char status; - if (force != 1) { - printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); - return -ENODEV; - } - status = inb(IRIS_GIO_INPUT); + unsigned char status = inb(IRIS_GIO_INPUT); if (status == IRIS_GIO_NODEV) { - printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); + printk(KERN_ERR "This machine does not seem to be an Iris. " + "Power off handler not installed.\n"); return -ENODEV; } old_pm_power_off = pm_power_off; pm_power_off = &iris_power_off; printk(KERN_INFO "Iris power_off handler installed.\n"); + return 0; +} +static int iris_remove(struct platform_device *pdev) +{ + pm_power_off = old_pm_power_off; + printk(KERN_INFO "Iris power_off handler uninstalled.\n"); + return 0; +} + +static struct platform_driver iris_driver = { + .driver = { + .name = "iris", + .owner = THIS_MODULE, + }, + .probe = iris_probe, + .remove = iris_remove, +}; + +static struct resource iris_resources[] = { + { + .start = IRIS_GIO_BASE, + .end = IRIS_GIO_OUTPUT, + .flags = IORESOURCE_IO, + .name = "address" + } +}; + +static struct platform_device *iris_device; + +static int iris_init(void) +{ + int ret; + if (force != 1) { + printk(KERN_ERR "The force parameter has not been set to 1." + " The Iris poweroff handler will not be installed.\n"); + return -ENODEV; + } + ret = platform_driver_register(&iris_driver); + if (ret < 0) { + printk(KERN_ERR "Failed to register iris platform driver: %d\n", + ret); + return ret; + } + iris_device = platform_device_register_simple("iris", (-1), + iris_resources, ARRAY_SIZE(iris_resources)); + if (IS_ERR(iris_device)) { + printk(KERN_ERR "Failed to register iris platform device\n"); + platform_driver_unregister(&iris_driver); + return PTR_ERR(iris_device); + } return 0; } static void iris_exit(void) { - pm_power_off = old_pm_power_off; - printk(KERN_INFO "Iris power_off handler uninstalled.\n"); + platform_device_unregister(iris_device); + platform_driver_unregister(&iris_driver); } module_init(iris_init); From f6af75dac3978d0b4d83939cb5d244b2a844820e Mon Sep 17 00:00:00 2001 From: Cyril Roelandt Date: Tue, 18 Dec 2012 14:21:23 -0800 Subject: [PATCH 04/48] ceph: fix dentry reference leak in ceph_encode_fh() dput() was not called in the error path. Signed-off-by: Cyril Roelandt Cc: Sage Weil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/export.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9349bb37a2f..ca3ab3f9ca7 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry = d_find_alias(inode); + struct dentry *dentry; struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + dentry = d_find_alias(inode); + /* if we found an alias, generate a connectable fh */ if (*max_len >= connected_handle_length && dentry) { dout("encode_fh %p connectable\n", dentry); From 19af395d7c0daaafdebd441a162128aaac575912 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 18 Dec 2012 14:21:25 -0800 Subject: [PATCH 05/48] irq: tsk->comm is an array The array check is useless so remove it. [akpm@linux-foundation.org: remove comment, per David] Signed-off-by: Alan Cox Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 35c70c9e24d..e49a288fa47 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); From af56e3f017bae54b9c3b5f7877d5eff990a2eed9 Mon Sep 17 00:00:00 2001 From: Cyril Roelandt Date: Tue, 18 Dec 2012 14:21:28 -0800 Subject: [PATCH 06/48] Coccinelle: add api/d_find_alias.cocci Ensure that calls to d_find_alias() have a corresponding dput(). Signed-off-by: Cyril Roelandt Cc: Julia Lawall Cc: Gilles Muller Cc: Nicolas Palix Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/coccinelle/api/d_find_alias.cocci | 80 +++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 scripts/coccinelle/api/d_find_alias.cocci diff --git a/scripts/coccinelle/api/d_find_alias.cocci b/scripts/coccinelle/api/d_find_alias.cocci new file mode 100644 index 00000000000..a9694a8d3e5 --- /dev/null +++ b/scripts/coccinelle/api/d_find_alias.cocci @@ -0,0 +1,80 @@ +/// Make sure calls to d_find_alias() have a corresponding call to dput(). +// +// Keywords: d_find_alias, dput +// +// Confidence: Moderate +// URL: http://coccinelle.lip6.fr/ +// Options: -include_headers + +virtual context +virtual org +virtual patch +virtual report + +@r exists@ +local idexpression struct dentry *dent; +expression E, E1; +statement S1, S2; +position p1, p2; +@@ +( + if (!(dent@p1 = d_find_alias(...))) S1 +| + dent@p1 = d_find_alias(...) +) + +<...when != dput(dent) + when != if (...) { <+... dput(dent) ...+> } + when != true !dent || ... + when != dent = E + when != E = dent +if (!dent || ...) S2 +...> +( + return <+...dent...+>; +| + return @p2 ...; +| + dent@p2 = E1; +| + E1 = dent; +) + +@depends on context@ +local idexpression struct dentry *r.dent; +position r.p1,r.p2; +@@ +* dent@p1 = ... + ... +( +* return@p2 ...; +| +* dent@p2 +) + + +@script:python depends on org@ +p1 << r.p1; +p2 << r.p2; +@@ +cocci.print_main("Missing call to dput()",p1) +cocci.print_secs("",p2) + +@depends on patch@ +local idexpression struct dentry *r.dent; +position r.p2; +@@ +( ++ dput(dent); + return @p2 ...; +| ++ dput(dent); + dent@p2 = ...; +) + +@script:python depends on report@ +p1 << r.p1; +p2 << r.p2; +@@ +msg = "Missing call to dput() at line %s." +coccilib.report.print_report(p1[0], msg % (p2[0].line)) From d95bfe464bae35daadaa46477e3ef984747abfff Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 18 Dec 2012 14:21:29 -0800 Subject: [PATCH 07/48] h8300: select generic atomic64_t support Rationales from Eric: So I just looked a little deeper and it appears architectures that do not support atomic64_t are broken. The generic atomic64 support came in 2009 to support the perf subsystem with the expectation that all architectures would implement atomic64 support. Furthermore upon inspection of the kernel atomic64_t is used in a fair number of places beyond the performance counters: block/blk-cgroup.c drivers/acpi/apei/ drivers/block/rbd.c drivers/crypto/nx/nx.h drivers/gpu/drm/radeon/radeon.h drivers/infiniband/hw/ipath/ drivers/infiniband/hw/qib/ drivers/staging/octeon/ fs/xfs/ include/linux/perf_event.h include/net/netfilter/nf_conntrack_acct.h kernel/events/ kernel/trace/ net/mac80211/key.h net/rds/ The block control group, infiniband, xfs, crypto, 802.11, netfilter. Nothing quite so fundamental as fs/namespace.c but definitely in multiplatform-code that should work, and is already broken on those architecutres. Looking at the implementation of atomic64_add_return in lib/atomic64.c the code looks as efficient as these kinds of things get. Which leads me to the conclusion that we need atomic64 support on all architectures. Signed-off-by: Fengguang Wu Cc: "Eric W. Biederman" Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/h8300/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 04bef4d25b4..0ae44508760 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -3,6 +3,7 @@ config H8300 default y select HAVE_IDE select HAVE_GENERIC_HARDIRQS + select GENERIC_ATOMIC64 select HAVE_UID16 select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW From 3012d60b39761b0bf73e3be677d2f8e424f0b294 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 18 Dec 2012 14:21:30 -0800 Subject: [PATCH 08/48] drivers/message/fusion/mptscsih.c: missing break This happens to do the right thing in all cases on fibre channel but not on other media types Signed-off-by: Alan Cox Cc: James Bottomley Cc: Nagalakshmi Nandigama Cc: Kashyap Desai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/message/fusion/mptscsih.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c index 0c3ced70707..164afa71bba 100644 --- a/drivers/message/fusion/mptscsih.c +++ b/drivers/message/fusion/mptscsih.c @@ -792,6 +792,7 @@ mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *mr) * than an unsolicited DID_ABORT. */ sc->result = DID_RESET << 16; + break; case MPI_IOCSTATUS_SCSI_EXT_TERMINATED: /* 0x004C */ if (ioc->bus_type == FC) From 0bb2c7637ef0db4f44528698fa725179fb4917ad Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 18 Dec 2012 14:21:32 -0800 Subject: [PATCH 09/48] mm/page_alloc.c: remove duplicate check While allocating pages using buddy allocator, the compound page is probably split up to free pages. Under these circumstances, the compound page should be destroyed by destroy_compound_page(). However, there is a duplicate check to judge if the page is compound. Remove the duplicate check since the compound_order() returns 0 when the page doesn't have PG_head set in destroy_compound_page(). That is to say, destroy_compound_page() needn't check PageHead(). Signed-off-by: Gavin Shan Acked-by: Johannes Weiner Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d037c8bc151..62496edbd8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { + if (unlikely(compound_order(page) != order)) { bad_page(page); bad++; } From c2974058a9caa82c9dd9e1e11e4f2c6ce42ff17e Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 18 Dec 2012 14:21:33 -0800 Subject: [PATCH 10/48] memory-hotplug: document and enable CONFIG_MOVABLE_NODE Add help info for CONFIG_MOVABLE_NODE and permit its selection. This option allows the user to online all memory of a node as movable memory. So that the whole node can be hotplugged. Users who don't use the hotplug feature are also fine with this option on since they won't online memory as movable. Signed-off-by: Tang Chen Reviewed-by: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Wen Congyang Cc: Ingo Molnar [akpm@linux-foundation.org: tweak help text] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 71259e052ce..278e3ab1f16 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,7 +149,18 @@ config MOVABLE_NODE depends on NO_BOOTMEM depends on X86_64 depends on NUMA - depends on BROKEN + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG From a0956d54492eb7257b09230680a8812b42cdee92 Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Tue, 18 Dec 2012 14:21:36 -0800 Subject: [PATCH 11/48] memcg: make it possible to use the stock for more than one page We currently have a percpu stock cache scheme that charges one page at a time from memcg->res, the user counter. When the kernel memory controller comes into play, we'll need to charge more than that. This is because kernel memory allocations will also draw from the user counter, and can be bigger than a single page, as it is the case with the stack (usually 2 pages) or some higher order slabs. [glommer@parallels.com: added a changelog ] Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Acked-by: David Rientjes Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bbfac5063ca..92887b28624 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2060,20 +2060,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2371,7 +2379,7 @@ again: memcg = *ptr; if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2396,7 +2404,7 @@ again: rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not From 4c9c535968cacf507c5febe49e6ec5123eadf207 Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Tue, 18 Dec 2012 14:21:41 -0800 Subject: [PATCH 12/48] memcg: reclaim when more than one page needed mem_cgroup_do_charge() was written before kmem accounting, and expects three cases: being called for 1 page, being called for a stock of 32 pages, or being called for a hugepage. If we call for 2 or 3 pages (and both the stack and several slabs used in process creation are such, at least with the debug options I had), it assumed it's being called for stock and just retried without reclaiming. Fix that by passing down a minsize argument in addition to the csize. And what to do about that (csize == PAGE_SIZE && ret) retry? If it's needed at all (and presumably is since it's there, perhaps to handle races), then it should be extended to more than PAGE_SIZE, yet how far? And should there be a retry count limit, of what? For now retry up to COSTLY_ORDER (as page_alloc.c does) and make sure not to do it if __GFP_NORETRY. v4: fixed nr pages calculation pointed out by Christoph Lameter. Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92887b28624..20e3619870a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2258,7 +2258,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2281,18 +2282,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2305,7 +2306,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2439,7 +2440,8 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; From 86ae53e1a138a3295c04ae69bf404be00244a381 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:45 -0800 Subject: [PATCH 13/48] memcg: change defines to an enum This is just a cleanup patch for clarity of expression. In earlier submissions, people asked it to be in a separate patch, so here it is. Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20e3619870a..c7b0b1b803a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -388,9 +388,12 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -3952,7 +3955,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3988,7 +3992,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -4064,7 +4069,8 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4400,7 +4406,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4483,7 +4489,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4561,7 +4567,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4586,7 +4592,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); From 510fc4e11b772fd60f2c545c64d4c55abd07ce36 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:47 -0800 Subject: [PATCH 14/48] memcg: kmem accounting basic infrastructure Add the basic infrastructure for the accounting of kernel memory. To control that, the following files are created: * memory.kmem.usage_in_bytes * memory.kmem.limit_in_bytes * memory.kmem.failcnt * memory.kmem.max_usage_in_bytes They have the same meaning of their user memory counterparts. They reflect the state of the "kmem" res_counter. Per cgroup kmem memory accounting is not enabled until a limit is set for the group. Once the limit is set the accounting cannot be disabled for that group. This means that after the patch is applied, no behavioral changes exists for whoever is still using memcg to control their memory usage, until memory.kmem.limit_in_bytes is set for the first time. We always account to both user and kernel resource_counters. This effectively means that an independent kernel limit is in place when the limit is set to a lower value than the user memory. A equal or higher value means that the user limit will always hit first, meaning that kmem is effectively unlimited. People who want to track kernel memory but not limit it, can set this limit to a very high number (like RESOURCE_MAX - 1page - that no one will ever hit, or equal to the user memory) [akpm@linux-foundation.org: MEMCG_MMEM only works with slab and slub] Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Tejun Heo Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 1 + mm/memcontrol.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 675d8a2326c..19ccb33c99d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -882,6 +882,7 @@ config MEMCG_SWAP_ENABLED config MEMCG_KMEM bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" depends on MEMCG && EXPERIMENTAL + depends on SLUB || SLAB default n help The Kernel Memory extension for Memory Resource Controller can limit diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c7b0b1b803a..bba1cb4bbb8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -267,6 +267,10 @@ struct mem_cgroup { struct work_struct work_freeing; }; + /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -282,6 +286,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -334,6 +339,20 @@ struct mem_cgroup { #endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ +}; + +#define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a @@ -392,6 +411,7 @@ enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, + _KMEM, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -1456,6 +1476,10 @@ done: res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -3977,6 +4001,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3984,6 +4011,59 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. + */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + memcg_kmem_set_active(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); +#endif + return ret; +} + +static void memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + return; + memcg->kmem_account_flags = parent->kmem_account_flags; +} + /* * The user of this function is... * RES_LIMIT. @@ -4015,8 +4095,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4082,14 +4166,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4651,6 +4743,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + memcg_propagate_kmem(memcg); return mem_cgroup_sockets_init(memcg, ss); }; @@ -4764,6 +4857,31 @@ static struct cftype mem_cgroup_files[] = { .trigger = mem_cgroup_reset, .read = mem_cgroup_read, }, +#endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, #endif { }, /* terminate */ }; @@ -5010,6 +5128,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -5020,6 +5139,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this From 7a64bf05b2a6fe3703062d13d389e3eb904741c6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:51 -0800 Subject: [PATCH 15/48] mm: add a __GFP_KMEMCG flag This flag is used to indicate to the callees that this allocation is a kernel allocation in process context, and should be accounted to current's memcg. Signed-off-by: Glauber Costa Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ include/trace/events/gfpflags.h | 1 + 2 files changed, 3 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f74856e17e4..643c9a6f7f3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -89,6 +90,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b..1eddbf1557f 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,6 +34,7 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ From 7ae1e1d0f8ac2927ed7e3ca6d15e42d485903459 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:56 -0800 Subject: [PATCH 16/48] memcg: kmem controller infrastructure Introduce infrastructure for tracking kernel memory pages to a given memcg. This will happen whenever the caller includes the flag __GFP_KMEMCG flag, and the task belong to a memcg other than the root. In memcontrol.h those functions are wrapped in inline acessors. The idea is to later on, patch those with static branches, so we don't incur any overhead when no mem cgroups with limited kmem are being used. Users of this functionality shall interact with the memcg core code through the following functions: memcg_kmem_newpage_charge: will return true if the group can handle the allocation. At this point, struct page is not yet allocated. memcg_kmem_commit_charge: will either revert the charge, if struct page allocation failed, or embed memcg information into page_cgroup. memcg_kmem_uncharge_page: called at free time, will revert the charge. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 110 ++++++++++++++++++++++++ mm/memcontrol.c | 170 +++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e98a74c0c9c..afa2ad40457 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #define _LINUX_MEMCONTROL_H #include #include +#include struct mem_cgroup; struct page_cgroup; @@ -414,5 +415,114 @@ static inline void sock_release_memcg(struct sock *sk) { } #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_kmem_enabled(void) +{ + return true; +} + +/* + * In general, we'll do everything in our power to not incur in any overhead + * for non-memcg users for the kmem functions. Not even a function call, if we + * can avoid it. + * + * Therefore, we'll inline all those functions so that in the best case, we'll + * see that kmemcg is off for everybody and proceed quickly. If it is on, + * we'll still do most of the flag checking inline. We check a lot of + * conditions, but because they are pretty simple, they are expected to be + * fast. + */ +bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, + int order); +void __memcg_kmem_commit_charge(struct page *page, + struct mem_cgroup *memcg, int order); +void __memcg_kmem_uncharge_pages(struct page *page, int order); + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (!memcg_kmem_enabled()) + return true; + + /* + * __GFP_NOFAIL allocations will move on even if charging is not + * possible. Therefore we don't even try, and have this allocation + * unaccounted. We could in theory charge it with + * res_counter_charge_nofail, but we hope those allocations are rare, + * and won't be worth the trouble. + */ + if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + return true; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return true; + + /* If the test is dying, just let it go. */ + if (unlikely(fatal_signal_pending(current))) + return true; + + return __memcg_kmem_newpage_charge(gfp, memcg, order); +} + +/** + * memcg_kmem_uncharge_pages: uncharge pages from memcg + * @page: pointer to struct page being freed + * @order: allocation order. + * + * there is no need to specify memcg here, since it is embedded in page_cgroup + */ +static inline void +memcg_kmem_uncharge_pages(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge_pages(page, order); +} + +/** + * memcg_kmem_commit_charge: embeds correct memcg in a page + * @page: pointer to struct page recently allocated + * @memcg: the memcg structure we charged against + * @order: allocation order. + * + * Needs to be called after memcg_kmem_newpage_charge, regardless of success or + * failure of the allocation. if @page is NULL, this function will revert the + * charges. Otherwise, it will commit the memcg given by @memcg to the + * corresponding page_cgroup. + */ +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ + if (memcg_kmem_enabled() && memcg) + __memcg_kmem_commit_charge(page, memcg, order); +} + +#else +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + return true; +} + +static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +{ +} + +static inline void +memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bba1cb4bbb8..b9afa060b8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -2661,6 +2665,172 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; + + /* + * Conditions under which we can wait for the oom_killer. Those are + * the same conditions tested by the core page allocator + */ + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chosed to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + + return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ + res_counter_uncharge(&memcg->kmem, size); + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); +} + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + mem_cgroup_get(memcg); + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + else + mem_cgroup_put(memcg); + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + mem_cgroup_put(memcg); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. + */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + mem_cgroup_put(memcg); +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) From 6a1a0d3b625a4091e7a0eb249aefc6a644385149 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:00 -0800 Subject: [PATCH 17/48] mm: allocate kernel pages to the right memcg When a process tries to allocate a page with the __GFP_KMEMCG flag, the page allocator will call the corresponding memcg functions to validate the allocation. Tasks in the root memcg can always proceed. To avoid adding markers to the page - and a kmem flag that would necessarily follow, as much as doing page_cgroup lookups for no reason, whoever is marking its allocations with __GFP_KMEMCG flag is responsible for telling the page allocator that this is such an allocation at free_pages() time. This is done by the invocation of __free_accounted_pages() and free_accounted_pages(). Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 +++ mm/page_alloc.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 643c9a6f7f3..0f615eb23d0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -367,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); +extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); + #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62496edbd8d..2ad2ad168ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2612,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2630,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2665,6 +2673,8 @@ out: if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2717,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. + */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { From 50bdd430c20566b13d8bc59946184b08f5875de6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:04 -0800 Subject: [PATCH 18/48] res_counter: return amount of charges after res_counter_uncharge() It is useful to know how many charges are still left after a call to res_counter_uncharge. While it is possible to issue a res_counter_read after uncharge, this can be racy. If we need, for instance, to take some action when the counters drop down to 0, only one of the callers should see it. This is the same semantics as the atomic variables in the kernel. Since the current return value is void, we don't need to worry about anything breaking due to this change: nobody relied on that, and only users appearing from now on will be checking this value. Signed-off-by: Glauber Costa Reviewed-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/resource_counter.txt | 7 ++++--- include/linux/res_counter.h | 12 +++++++----- kernel/res_counter.c | 20 +++++++++++++------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index 0c4a344e78f..c4d99ed0b41 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -83,16 +83,17 @@ to work with it. res_counter->lock internally (it must be called with res_counter->lock held). The force parameter indicates whether we can bypass the limit. - e. void res_counter_uncharge[_locked] + e. u64 res_counter_uncharge[_locked] (struct res_counter *rc, unsigned long val) When a resource is released (freed) it should be de-accounted from the resource counter it was accounted to. This is called - "uncharging". + "uncharging". The return value of this function indicate the amount + of charges still present in the counter. The _locked routines imply that the res_counter->lock is taken. - f. void res_counter_uncharge_until + f. u64 res_counter_uncharge_until (struct res_counter *rc, struct res_counter *top, unsinged long val) diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 6f54e40fa21..5ae8456d967 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -125,14 +125,16 @@ int res_counter_charge_nofail(struct res_counter *counter, * * these calls check for usage underflow and show a warning on the console * _locked call expects the counter->lock to be taken + * + * returns the total charges still present in @counter. */ -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); -void res_counter_uncharge(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val); +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val); /** * res_counter_margin - calculate chargeable space of a counter * @cnt: the counter diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 3920d593e63..ff55247e704 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) val = counter->usage; counter->usage -= val; + return counter->usage; } -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; + u64 ret = 0; local_irq_save(flags); for (c = counter; c != top; c = c->parent) { + u64 r; spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); + r = res_counter_uncharge_locked(c, val); + if (c == counter) + ret = r; spin_unlock(&c->lock); } local_irq_restore(flags); + return ret; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) { - res_counter_uncharge_until(counter, NULL, val); + return res_counter_uncharge_until(counter, NULL, val); } static inline unsigned long long * From 7de37682bec35bbe0cd69b8112ef257bc5fb1c3e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:07 -0800 Subject: [PATCH 19/48] memcg: kmem accounting lifecycle management Because kmem charges can outlive the cgroup, we need to make sure that we won't free the memcg structure while charges are still in flight. For reviewing simplicity, the charge functions will issue mem_cgroup_get() at every charge, and mem_cgroup_put() at every uncharge. This can get expensive, however, and we can do better. mem_cgroup_get() only really needs to be issued once: when the first limit is set. In the same spirit, we only need to issue mem_cgroup_put() when the last charge is gone. We'll need an extra bit in kmem_account_flags for that: KMEM_ACCOUNTED_DEAD. it will be set when the cgroup dies, if there are charges in the group. If there aren't, we can proceed right away. Our uncharge function will have to test that bit every time the charges drop to 0. Because that is not the likely output of res_counter_uncharge, this should not impose a big hit on us: it is certainly much better than a reference count decrease at every operation. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 57 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b9afa060b8d..9a62ac3ea88 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -346,6 +346,7 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; #define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE) @@ -355,6 +356,23 @@ static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) { set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} #endif /* Stuffs for move charges at task migration. */ @@ -2722,10 +2740,16 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { - res_counter_uncharge(&memcg->kmem, size); res_counter_uncharge(&memcg->res, size); if (do_swap_account) res_counter_uncharge(&memcg->memsw, size); + + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; + + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } /* @@ -2764,13 +2788,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) return true; } - mem_cgroup_get(memcg); - ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); if (!ret) *_memcg = memcg; - else - mem_cgroup_put(memcg); css_put(&memcg->css); return (ret == 0); @@ -2786,7 +2806,6 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, /* The page allocation failed. Revert */ if (!page) { memcg_uncharge_kmem(memcg, PAGE_SIZE << order); - mem_cgroup_put(memcg); return; } @@ -2827,7 +2846,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) VM_BUG_ON(mem_cgroup_is_root(memcg)); memcg_uncharge_kmem(memcg, PAGE_SIZE << order); - mem_cgroup_put(memcg); } #endif /* CONFIG_MEMCG_KMEM */ @@ -4217,6 +4235,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) VM_BUG_ON(ret); memcg_kmem_set_active(memcg); + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. + */ + mem_cgroup_get(memcg); } else ret = res_counter_set_limit(&memcg->kmem, val); out: @@ -4232,6 +4257,10 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg) if (!parent) return; memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + if (memcg_kmem_is_active(memcg)) + mem_cgroup_get(memcg); +#endif } /* @@ -4920,6 +4949,20 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) From a8964b9b84f99c0b1b5d7c09520f89f0700e742e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:09 -0800 Subject: [PATCH 20/48] memcg: use static branches when code not in use We can use static branches to patch the code in or out when not used. Because the _ACTIVE bit on kmem_accounted is only set after the increment is done, we guarantee that the root memcg will always be selected for kmem charges until all call sites are patched (see memcg_kmem_enabled). This guarantees that no mischarges are applied. Static branch decrement happens when the last reference count from the kmem accounting in memcg dies. This will only happen when the charges drop down to 0. When that happens, we need to disable the static branch only on those memcgs that enabled it. To achieve this, we would be forced to complicate the code by keeping track of which memcgs were the ones that actually enabled limits, and which ones got it from its parents. It is a lot simpler just to do static_key_slow_inc() on every child that is accounted. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 +- mm/memcontrol.c | 79 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index afa2ad40457..87d61e840dd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -22,6 +22,7 @@ #include #include #include +#include struct mem_cgroup; struct page_cgroup; @@ -417,9 +418,10 @@ static inline void sock_release_memcg(struct sock *sk) #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ #ifdef CONFIG_MEMCG_KMEM +extern struct static_key memcg_kmem_enabled_key; static inline bool memcg_kmem_enabled(void) { - return true; + return static_key_false(&memcg_kmem_enabled_key); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a62ac3ea88..bc70254558f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -346,10 +346,13 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; -#define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE) +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) #ifdef CONFIG_MEMCG_KMEM static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) @@ -362,6 +365,11 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) @@ -532,6 +540,26 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +struct static_key memcg_kmem_enabled_key; + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) + static_key_slow_dec(&memcg_kmem_enabled_key); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -4204,6 +4232,8 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -4234,7 +4264,15 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) ret = res_counter_set_limit(&memcg->kmem, val); VM_BUG_ON(ret); - memcg_kmem_set_active(memcg); + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + must_inc_static_branch = true; /* * kmem charges can outlive the cgroup. In the case of slab * pages, for instance, a page contain objects from various @@ -4247,6 +4285,27 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) out: mutex_unlock(&set_limit_mutex); cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. + * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + #endif return ret; } @@ -4258,8 +4317,20 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg) return; memcg->kmem_account_flags = parent->kmem_account_flags; #ifdef CONFIG_MEMCG_KMEM - if (memcg_kmem_is_active(memcg)) + /* + * When that happen, we need to disable the static branch only on those + * memcgs that enabled it. To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (memcg_kmem_is_active(memcg)) { mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + } #endif } @@ -5184,7 +5255,7 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. */ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else From bea207c86e4eb158bf20f7b36bee72da9272e8d3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:11 -0800 Subject: [PATCH 21/48] memcg: allow a memcg with kmem charges to be destructed Because the ultimate goal of the kmem tracking in memcg is to track slab pages as well, we can't guarantee that we'll always be able to point a page to a particular process, and migrate the charges along with it - since in the common case, a page will contain data belonging to multiple processes. Because of that, when we destroy a memcg, we only make sure the destruction will succeed by discounting the kmem charges from the user charges when we try to empty the cgroup. Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Reviewed-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc70254558f..f96ccc90fa6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -547,6 +547,11 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) { if (memcg_kmem_is_active(memcg)) static_key_slow_dec(&memcg_kmem_enabled_key); + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); } #else static void disarm_kmem_keys(struct mem_cgroup *memcg) @@ -4025,6 +4030,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; + u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -4045,13 +4051,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) cond_resched(); /* + * Kernel memory may not necessarily be trackable to a specific + * process. So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * * This is a safety check because mem_cgroup_force_empty_list * could have raced with mem_cgroup_replace_page_cache callers * so the lru seemed empty but the page could have been added * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. */ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); } /* From c8b2a36fb1597e9390cf4c1a7f2dd394dc7d7b17 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:13 -0800 Subject: [PATCH 22/48] memcg: execute the whole memcg freeing in free_worker() A lot of the initialization we do in mem_cgroup_create() is done with softirqs enabled. This include grabbing a css id, which holds &ss->id_lock->rlock, and the per-zone trees, which holds rtpz->lock->rlock. All of those signal to the lockdep mechanism that those locks can be used in SOFTIRQ-ON-W context. This means that the freeing of memcg structure must happen in a compatible context, otherwise we'll get a deadlock, like the one below, caught by lockdep: free_accounted_pages+0x47/0x4c free_task+0x31/0x5c __put_task_struct+0xc2/0xdb put_task_struct+0x1e/0x22 delayed_put_task_struct+0x7a/0x98 __rcu_process_callbacks+0x269/0x3df rcu_process_callbacks+0x31/0x5b __do_softirq+0x122/0x277 This usage pattern could not be triggered before kmem came into play. With the introduction of kmem stack handling, it is possible that we call the last mem_cgroup_put() from the task destructor, which is run in an rcu callback. Such callbacks are run with softirqs disabled, leading to the offensive usage pattern. In general, we have little, if any, means to guarantee in which context the last memcg_put will happen. The best we can do is test it and try to make sure no invalid context releases are happening. But as we add more code to memcg, the possible interactions grow in number and expose more ways to get context conflicts. One thing to keep in mind, is that part of the freeing process is already deferred to a worker, such as vfree(), that can only be called from process context. For the moment, the only two functions we really need moved away are: * free_css_id(), and * mem_cgroup_remove_from_trees(). But because the later accesses per-zone info, free_mem_cgroup_per_zone_info() needs to be moved as well. With that, we are left with the per_cpu stats only. Better move it all. Signed-off-by: Glauber Costa Tested-by: Greg Thelen Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 66 +++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f96ccc90fa6..e16694d5e11 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5247,16 +5247,29 @@ out_free: } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -5275,6 +5288,20 @@ static void free_work(struct work_struct *work) vfree(memcg); } + +/* + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. + */ +static void free_work(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} + static void free_rcu(struct rcu_head *rcu_head) { struct mem_cgroup *memcg; @@ -5284,31 +5311,6 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -/* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. - */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) -{ - int node; - - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); - - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); - - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); -} - static void mem_cgroup_get(struct mem_cgroup *memcg) { atomic_inc(&memcg->refcnt); @@ -5318,7 +5320,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } From 2ad306b17c0ac5a1b1f250d5f772aeb87fdf1eba Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:18 -0800 Subject: [PATCH 23/48] fork: protect architectures where THREAD_SIZE >= PAGE_SIZE against fork bombs Because those architectures will draw their stacks directly from the page allocator, rather than the slab cache, we can directly pass __GFP_KMEMCG flag, and issue the corresponding free_pages. This code path is taken when the architecture doesn't define CONFIG_ARCH_THREAD_INFO_ALLOCATOR (only ia64 seems to), and has THREAD_SIZE >= PAGE_SIZE. Luckily, most - if not all - of the remaining architectures fall in this category. This will guarantee that every stack page is accounted to the memcg the process currently lives on, and will have the allocations to fail if they go over limit. For the time being, I am defining a new variant of THREADINFO_GFP, not to mess with the other path. Once the slab is also tracked by memcg, we can get rid of that flag. Tested to successfully protect against :(){ :|:& };: Signed-off-by: Glauber Costa Acked-by: Frederic Weisbecker Acked-by: Kamezawa Hiroyuki Reviewed-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 ++ kernel/fork.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ccc1899bd62..e7e04736802 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif +#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) + /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/kernel/fork.c b/kernel/fork.c index c36c4e301ef..85f6d536608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; From d5bdae7d59451b9d63303f7794ef32bb76ba6330 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:22 -0800 Subject: [PATCH 24/48] memcg: add documentation about the kmem controller Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 59 +++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index a25cb3fafeb..5b5b6314377 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -71,6 +71,11 @@ Brief summary of control files. memory.oom_control # set/show oom controls. memory.numa_stat # show the number of memory usage per numa node + memory.kmem.limit_in_bytes # set/show hard limit for kernel memory + memory.kmem.usage_in_bytes # show current kernel memory allocation + memory.kmem.failcnt # show the number of kernel memory usage hits limits + memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded + memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits @@ -268,20 +273,66 @@ the amount of kernel memory used by the system. Kernel memory is fundamentally different than user memory, since it can't be swapped out, which makes it possible to DoS the system by consuming too much of this precious resource. +Kernel memory won't be accounted at all until limit on a group is set. This +allows for existing setups to continue working without disruption. The limit +cannot be set if the cgroup have children, or if there are already tasks in the +cgroup. Attempting to set the limit under those conditions will return -EBUSY. +When use_hierarchy == 1 and a group is accounted, its children will +automatically be accounted regardless of their limit value. + +After a group is first limited, it will be kept being accounted until it +is removed. The memory limitation itself, can of course be removed by writing +-1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not +limited. + Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. +cgroup may or may not be accounted. The memory used is accumulated into +memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. +(currently only for tcp). +The main "kmem" counter is fed into the main counter, so kmem charges will +also be visible from the user counter. Currently no soft limit is implemented for kernel memory. It is future work to trigger slab reclaim when those limits are reached. 2.7.1 Current Kernel Memory resources accounted +* stack pages: every process consumes some stack pages. By accounting into +kernel memory, we prevent new processes from being created when the kernel +memory usage is too high. + * sockets memory pressure: some sockets protocols have memory pressure thresholds. The Memory Controller allows them to be controlled individually per cgroup, instead of globally. * tcp memory pressure: sockets memory pressure for the tcp protocol. +2.7.3 Common use cases + +Because the "kmem" counter is fed to the main user counter, kernel memory can +never be limited completely independently of user memory. Say "U" is the user +limit, and "K" the kernel limit. There are three possible ways limits can be +set: + + U != 0, K = unlimited: + This is the standard memcg limitation mechanism already present before kmem + accounting. Kernel memory is completely ignored. + + U != 0, K < U: + Kernel memory is a subset of the user memory. This setup is useful in + deployments where the total amount of memory per-cgroup is overcommited. + Overcommiting kernel memory limits is definitely not recommended, since the + box can still run out of non-reclaimable memory. + In this case, the admin could set up K so that the sum of all groups is + never greater than the total memory, and freely set U at the cost of his + QoS. + + U != 0, K >= U: + Since kmem charges will also be fed to the user counter and reclaim will be + triggered for the cgroup for both kinds of memory. This setup gives the + admin a unified view of memory, and it is also useful for people who just + want to track kernel memory usage. + 3. User Interface 0. Configuration @@ -290,6 +341,7 @@ a. Enable CONFIG_CGROUPS b. Enable CONFIG_RESOURCE_COUNTERS c. Enable CONFIG_MEMCG d. Enable CONFIG_MEMCG_SWAP (to use swap extension) +d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) # mount -t tmpfs none /sys/fs/cgroup @@ -406,6 +458,11 @@ About use_hierarchy, see Section 6. Because rmdir() moves all pages to parent, some out-of-use page caches can be moved to the parent. If you want to avoid that, force_empty will be useful. + Also, note that when memory.kmem.limit_in_bytes is set the charges due to + kernel pages will still be seen. This is not considered a failure and the + write will still return success. In this case, it is expected that + memory.kmem.usage_in_bytes == memory.usage_in_bytes. + About use_hierarchy, see Section 6. 5.2 stat file From ba6c496ed834a37a26fc6fc87fc9aecb0fa0014d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:27 -0800 Subject: [PATCH 25/48] slab/slub: struct memcg_params For the kmem slab controller, we need to record some extra information in the kmem_cache structure. Signed-off-by: Glauber Costa Signed-off-by: Suleiman Souhlal Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 24 ++++++++++++++++++++++++ include/linux/slab_def.h | 3 +++ include/linux/slub_def.h | 3 +++ mm/slab.h | 13 +++++++++++++ 4 files changed, 43 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index 743a1041512..00efba14922 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -176,6 +176,30 @@ void kmem_cache_free(struct kmem_cache *, void *); #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * This is the main placeholder for memcg-related information in kmem caches. + * struct kmem_cache will hold a pointer to it, so the memory cost while + * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it + * would otherwise be if that would be bundled in kmem_cache: we'll need an + * extra pointer chase. But the trade off clearly lays in favor of not + * penalizing non-users. + * + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. + * + * Child caches will hold extra metadata needed for its operation. Fields are: + * + * @memcg: pointer to the memcg this cache belongs to + */ +struct memcg_cache_params { + bool is_root_cache; + union { + struct kmem_cache *memcg_caches[0]; + struct mem_cgroup *memcg; + }; +}; + /* * Common kmalloc functions provided by all allocators */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 45c0356fdc8..8bb6e0eaf3c 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -81,6 +81,9 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; +#endif /* 6) per-cpu/per-node data, touched during every alloc/free */ /* diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index df448adb728..961e72eab90 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,6 +101,9 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *memcg_params; +#endif #ifdef CONFIG_NUMA /* diff --git a/mm/slab.h b/mm/slab.h index 1cb9c9ee0e6..49e7a8b1d27 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -100,4 +100,17 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +#endif #endif From 6ccfb5bcf52bcf100fa085946f044fdbba015048 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:31 -0800 Subject: [PATCH 26/48] slab: annotate on-slab caches nodelist locks We currently provide lockdep annotation for kmalloc caches, and also caches that have SLAB_DEBUG_OBJECTS enabled. The reason for this is that we can quite frequently nest in the l3->list_lock lock, which is not something trivial to avoid. My proposal with this patch, is to extend this to caches whose slab management object lives within the slab as well ("on_slab"). The need for this arose in the context of testing kmemcg-slab patches. With such patchset, we can have per-memcg kmalloc caches. So the same path that led to nesting between kmalloc caches will could then lead to in-memcg nesting. Because they are not annotated, lockdep will trigger. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index 2c3a2e0394d..c26ab9fbe1f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -641,6 +641,26 @@ static void init_node_lock_keys(int q) } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + struct kmem_list3 *l3; + l3 = cachep->nodelists[q]; + if (!l3) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -657,6 +677,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -1385,6 +1413,9 @@ static int __cpuinit cpuup_prepare(long cpu) free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -2489,7 +2520,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); return 0; } From 2633d7a028239a738b793be5ca8fa6ac312f5793 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:34 -0800 Subject: [PATCH 27/48] slab/slub: consider a memcg parameter in kmem_create_cache Allow a memcg parameter to be passed during cache creation. When the slub allocator is being used, it will only merge caches that belong to the same memcg. We'll do this by scanning the global list, and then translating the cache to a memcg-specific cache Default function is created as a wrapper, passing NULL to the memcg version. We only merge caches that belong to the same memcg. A helper is provided, memcg_css_id: because slub needs a unique cache name for sysfs. Since this is visible, but not the canonical location for slab data, the cache name is not used, the css_id should suffice. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 +++++++++++++++++++ include/linux/slab.h | 14 ++++++++++- mm/memcontrol.c | 51 ++++++++++++++++++++++++++++++++++++++ mm/slab.h | 23 ++++++++++++++--- mm/slab_common.c | 42 ++++++++++++++++++++++++------- mm/slub.c | 19 +++++++++++--- 6 files changed, 157 insertions(+), 18 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 87d61e840dd..0b69a047000 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -28,6 +28,7 @@ struct mem_cgroup; struct page_cgroup; struct page; struct mm_struct; +struct kmem_cache; /* Stats that can be updated by kernel. */ enum mem_cgroup_page_stat_item { @@ -441,6 +442,11 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order); void __memcg_kmem_uncharge_pages(struct page *page, int order); +int memcg_cache_id(struct mem_cgroup *memcg); +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s); +void memcg_release_cache(struct kmem_cache *cachep); +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); + /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. @@ -525,6 +531,26 @@ static inline void memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) { } + +static inline int memcg_cache_id(struct mem_cgroup *memcg) +{ + return -1; +} + +static inline int memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + return 0; +} + +static inline void memcg_release_cache(struct kmem_cache *cachep) +{ +} + +static inline void memcg_cache_list_add(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 00efba14922..c0fcf28c15b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,7 @@ struct kmem_cache { }; #endif +struct mem_cgroup; /* * struct kmem_cache related prototypes */ @@ -125,6 +126,9 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, + unsigned long, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); @@ -191,15 +195,23 @@ void kmem_cache_free(struct kmem_cache *, void *); * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to + * @list: list_head for the list of all caches in this memcg + * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { bool is_root_cache; union { struct kmem_cache *memcg_caches[0]; - struct mem_cgroup *memcg; + struct { + struct mem_cgroup *memcg; + struct list_head list; + struct kmem_cache *root_cache; + }; }; }; +int memcg_update_all_caches(int num_memcgs); + /* * Common kmalloc functions provided by all allocators */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e16694d5e11..3eafe6cf6ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -341,6 +341,14 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; /* internal only representation about the status of kmem accounting. */ @@ -2785,6 +2793,47 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) mem_cgroup_put(memcg); } +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) + s->memcg_params->memcg = memcg; + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + kfree(s->memcg_params); +} + /* * We need to verify if the allocation against current->mm->owner's memcg is * possible for the given order. But the page is not allocated yet, so we'll @@ -5026,7 +5075,9 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + memcg->kmemcg_id = -1; memcg_propagate_kmem(memcg); + return mem_cgroup_sockets_init(memcg, ss); }; diff --git a/mm/slab.h b/mm/slab.h index 49e7a8b1d27..abe582d20c7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,12 +43,15 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); +struct mem_cgroup; #ifdef CONFIG_SLUB -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); #else -static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif @@ -106,11 +109,23 @@ static inline bool is_root_cache(struct kmem_cache *s) { return !s->memcg_params || s->memcg_params->is_root_cache; } + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} #else static inline bool is_root_cache(struct kmem_cache *s) { return true; } +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} #endif #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index a8e76d79ee6..3031badcc57 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "slab.h" @@ -27,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, + size_t size) { struct kmem_cache *s = NULL; @@ -53,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) continue; } - if (!strcmp(s->name, name)) { + /* + * For simplicity, we won't check this in the list of memcg + * caches. We have control over memcg naming, and if there + * aren't duplicates in the global list, there won't be any + * duplicates in the memcg lists as well. + */ + if (!memcg && !strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -66,7 +74,8 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, + const char *name, size_t size) { return 0; } @@ -125,8 +134,9 @@ unsigned long calculate_alignment(unsigned long flags, * as davem. */ -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; int err = 0; @@ -134,7 +144,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(name, size) == 0) + if (!kmem_cache_sanity_check(memcg, name, size) == 0) goto out_locked; /* @@ -145,7 +155,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(name, size, align, flags, ctor); + s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) goto out_locked; @@ -154,6 +164,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align s->object_size = s->size = size; s->align = calculate_alignment(flags, align, size); s->ctor = ctor; + + if (memcg_register_cache(memcg, s)) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { kmem_cache_free(kmem_cache, s); @@ -163,10 +180,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align err = __kmem_cache_create(s, flags); if (!err) { - s->refcount = 1; list_add(&s->list, &slab_caches); - + memcg_cache_list_add(memcg, s); } else { kfree(s->name); kmem_cache_free(kmem_cache, s); @@ -194,6 +210,13 @@ out_locked: return s; } + +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor); +} EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) @@ -209,6 +232,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + memcg_release_cache(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { diff --git a/mm/slub.c b/mm/slub.c index 87f9f32bf0c..985332b3885 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -3786,7 +3787,7 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, +static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -3822,17 +3823,21 @@ static struct kmem_cache *find_mergeable(size_t size, if (s->size - size >= sizeof(void *)) continue; + if (!cache_match_memcg(s, memcg)) + continue; + return s; } return NULL; } -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(size, align, flags, name, ctor); + s = find_mergeable(memcg, size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -5156,6 +5161,12 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } From 55007d849759252ddd573aeb36143b947202d509 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:38 -0800 Subject: [PATCH 28/48] memcg: allocate memory for memcg caches whenever a new memcg appears Every cache that is considered a root cache (basically the "original" caches, tied to the root memcg/no-memcg) will have an array that should be large enough to store a cache pointer per each memcg in the system. Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently in the 64k pointers range. Most of the time, we won't be using that much. What goes in this patch, is a simple scheme to dynamically allocate such an array, in order to minimize memory usage for memcg caches. Because we would also like to avoid allocations all the time, at least for now, the array will only grow. It will tend to be big enough to hold the maximum number of kmem-limited memcgs ever achieved. We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have more than that, we'll start doubling the size of this array every time the limit is reached. Because we are only considering kmem limited memcgs, a natural point for this to happen is when we write to the limit. At that point, we already have set_limit_mutex held, so that will become our natural synchronization mechanism. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 + mm/memcontrol.c | 207 ++++++++++++++++++++++++++++++++++--- mm/slab_common.c | 28 +++++ 3 files changed, 221 insertions(+), 16 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0b69a047000..45085e14e02 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -447,6 +447,8 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s); void memcg_release_cache(struct kmem_cache *cachep); void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); +int memcg_update_cache_size(struct kmem_cache *s, int num_groups); +void memcg_update_array_size(int num_groups); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eafe6cf6ca..db38b60e5f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg) set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); } +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) @@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #endif #ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +static int memcg_limited_groups_array_size; +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + struct static_key memcg_kmem_enabled_key; static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg_kmem_is_active(memcg)) + if (memcg_kmem_is_active(memcg)) { static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } /* * This check can't live in kmem destruction function, * since the charges will outlive the cgroup @@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; + } + + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} + +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; +} + +/* + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. + */ +void memcg_update_array_size(int num) +{ + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. + * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + kfree(cur_params); + } + return 0; +} + int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { size_t size = sizeof(struct memcg_cache_params); @@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) if (!memcg_kmem_enabled()) return 0; + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) return -ENOMEM; @@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) ret = res_counter_set_limit(&memcg->kmem, val); VM_BUG_ON(ret); - /* - * After this point, kmem_accounted (that we test atomically in - * the beginning of this conditional), is no longer 0. This - * guarantees only one process will set the following boolean - * to true. We don't need test_and_set because we're protected - * by the set_limit_mutex anyway. - */ - memcg_kmem_set_activated(memcg); + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } must_inc_static_branch = true; /* * kmem charges can outlive the cgroup. In the case of slab @@ -4372,11 +4527,13 @@ out: return ret; } -static void memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *memcg) { + int ret = 0; struct mem_cgroup *parent = parent_mem_cgroup(memcg); if (!parent) - return; + goto out; + memcg->kmem_account_flags = parent->kmem_account_flags; #ifdef CONFIG_MEMCG_KMEM /* @@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg) * It is a lot simpler just to do static_key_slow_inc() on every child * that is accounted. */ - if (memcg_kmem_is_active(memcg)) { - mem_cgroup_get(memcg); - static_key_slow_inc(&memcg_kmem_enabled_key); - } + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); #endif +out: + return ret; } /* @@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + int ret; + memcg->kmemcg_id = -1; - memcg_propagate_kmem(memcg); + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; return mem_cgroup_sockets_init(memcg, ss); }; @@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. diff --git a/mm/slab_common.c b/mm/slab_common.c index 3031badcc57..1c424b6511b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -81,6 +81,34 @@ static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, } #endif +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. + */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + /* * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. From d7f25f8a2f81252d1ac134470ba1d0a287cf8fcd Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:40 -0800 Subject: [PATCH 29/48] memcg: infrastructure to match an allocation to the right cache The page allocator is able to bind a page to a memcg when it is allocated. But for the caches, we'd like to have as many objects as possible in a page belonging to the same cache. This is done in this patch by calling memcg_kmem_get_cache in the beginning of every allocation function. This function is patched out by static branches when kernel memory controller is not being used. It assumes that the task allocating, which determines the memcg in the page allocator, belongs to the same cgroup throughout the whole process. Misaccounting can happen if the task calls memcg_kmem_get_cache() while belonging to a cgroup, and later on changes. This is considered acceptable, and should only happen upon task migration. Before the cache is created by the memcg core, there is also a possible imbalance: the task belongs to a memcg, but the cache being allocated from is the global cache, since the child cache is not yet guaranteed to be ready. This case is also fine, since in this case the GFP_KMEMCG will not be passed and the page allocator will not attempt any cgroup accounting. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 41 +++++++ init/Kconfig | 1 - mm/memcontrol.c | 217 +++++++++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 45085e14e02..bd9b5d73bc2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -449,6 +449,10 @@ void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); int memcg_update_cache_size(struct kmem_cache *s, int num_groups); void memcg_update_array_size(int num_groups); + +struct kmem_cache * +__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); + /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. @@ -518,6 +522,37 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) __memcg_kmem_commit_charge(page, memcg, order); } +/** + * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * @gfp: allocation flags. + * + * This function assumes that the task allocating, which determines the memcg + * in the page allocator, belongs to the same cgroup throughout the whole + * process. Misacounting can happen if the task calls memcg_kmem_get_cache() + * while belonging to a cgroup, and later on changes. This is considered + * acceptable, and should only happen upon task migration. + * + * Before the cache is created by the memcg core, there is also a possible + * imbalance: the task belongs to a memcg, but the cache being allocated from + * is the global cache, since the child cache is not yet guaranteed to be + * ready. This case is also fine, since in this case the GFP_KMEMCG will not be + * passed and the page allocator will not attempt any cgroup accounting. + */ +static __always_inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + if (!memcg_kmem_enabled()) + return cachep; + if (gfp & __GFP_NOFAIL) + return cachep; + if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) + return cachep; + if (unlikely(fatal_signal_pending(current))) + return cachep; + + return __memcg_kmem_get_cache(cachep, gfp); +} #else static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) @@ -553,6 +588,12 @@ static inline void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *s) { } + +static inline struct kmem_cache * +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + return cachep; +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/init/Kconfig b/init/Kconfig index 19ccb33c99d..7d30240e5bf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -883,7 +883,6 @@ config MEMCG_KMEM bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" depends on MEMCG && EXPERIMENTAL depends on SLUB || SLAB - default n help The Kernel Memory extension for Memory Resource Controller can limit the amount of memory used by kernel objects in the system. Those are diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db38b60e5f8..efd26620a60 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -588,7 +588,14 @@ static int memcg_limited_groups_array_size; #define MEMCG_CACHES_MIN_SIZE 4 #define MEMCG_CACHES_MAX_SIZE 65535 +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); static void disarm_kmem_keys(struct mem_cgroup *memcg) { @@ -2989,9 +2996,219 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) void memcg_release_cache(struct kmem_cache *s) { + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: kfree(s->memcg_params); } +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor); + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + new_cachep->memcg_params->root_cache = cachep; + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. + */ +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. + */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to mare sure we're always seeing the up to date value. The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + /* * We need to verify if the allocation against current->mm->owner's memcg is * possible for the given order. But the page is not allocated yet, so we'll From 0e9d92f2d02d8c8320f0502307c688d07bdac2b3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:42 -0800 Subject: [PATCH 30/48] memcg: skip memcg kmem allocations in specified code regions Create a mechanism that skip memcg allocations during certain pieces of our core code. It basically works in the same way as preempt_disable()/preempt_enable(): By marking a region under which all allocations will be accounted to the root memcg. We need this to prevent races in early cache creation, when we allocate data using caches that are not necessarily created already. Signed-off-by: Glauber Costa yCc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9914c662ed7..f712465b05c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1597,6 +1597,7 @@ struct task_struct { unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; + unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index efd26620a60..65302a083d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3025,6 +3025,37 @@ out: kfree(s->memcg_params); } +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) { char *name; @@ -3084,7 +3115,6 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out; new_cachep = kmem_cache_dup(memcg, cachep); - if (new_cachep == NULL) { new_cachep = cachep; goto out; @@ -3125,8 +3155,8 @@ static void memcg_create_cache_work_func(struct work_struct *w) * Enqueue the creation of a per-memcg kmem_cache. * Called with rcu_read_lock. */ -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { struct create_work *cw; @@ -3147,6 +3177,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, schedule_work(&cw->work); } +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3169,6 +3217,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); rcu_read_unlock(); From b9ce5ef49f00daf2254c6953c8d31f79aabccd34 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:46 -0800 Subject: [PATCH 31/48] sl[au]b: always get the cache from its page in kmem_cache_free() struct page already has this information. If we start chaining caches, this information will always be more trustworthy than whatever is passed into the function. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ mm/slab.c | 6 +++++- mm/slab.h | 39 ++++++++++++++++++++++++++++++++++++++ mm/slob.c | 2 +- mm/slub.c | 15 +++------------ 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bd9b5d73bc2..2298122e71a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -554,6 +554,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) return __memcg_kmem_get_cache(cachep, gfp); } #else +static inline bool memcg_kmem_enabled(void) +{ + return false; +} + static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) { diff --git a/mm/slab.c b/mm/slab.c index c26ab9fbe1f..bab6fec765a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -87,7 +87,6 @@ */ #include -#include "slab.h" #include #include #include @@ -128,6 +127,8 @@ #include "internal.h" +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -3883,6 +3884,9 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); diff --git a/mm/slab.h b/mm/slab.h index abe582d20c7..c95e922b166 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -116,6 +116,13 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, return (is_root_cache(cachep) && !memcg) || (cachep->memcg_params->memcg == memcg); } + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -127,5 +134,37 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, { return true; } + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} #endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} #endif diff --git a/mm/slob.c b/mm/slob.c index 795bab7d391..a99fdf7a090 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -58,7 +58,6 @@ #include #include -#include "slab.h" #include #include /* struct reclaim_state */ @@ -73,6 +72,7 @@ #include +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). diff --git a/mm/slub.c b/mm/slub.c index 985332b3885..6d5f2305d7a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2611,19 +2611,10 @@ redo: void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - if (kmem_cache_debug(s) && page->slab_cache != s) { - pr_err("kmem_cache_free: Wrong slab cache. %s but object" - " is from %s\n", page->slab_cache->name, s->name); - WARN_ON_ONCE(1); + s = cache_from_obj(s, x); + if (!s) return; - } - - slab_free(s, page, x, _RET_IP_); - + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); From d79923fad95b0cdf7770e024677180c734cb7148 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:48 -0800 Subject: [PATCH 32/48] sl[au]b: allocate objects from memcg cache We are able to match a cache allocation to a particular memcg. If the task doesn't change groups during the allocation itself - a rare event, this will give us a good picture about who is the first group to touch a cache page. This patch uses the now available infrastructure by calling memcg_kmem_get_cache() before all the cache allocations. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 5 ++++- mm/memcontrol.c | 3 +++ mm/slab.c | 6 +++++- mm/slub.c | 7 ++++--- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 961e72eab90..364ba6c9fe2 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -225,7 +225,10 @@ void *__kmalloc(size_t size, gfp_t flags); static __always_inline void * kmalloc_order(size_t size, gfp_t flags, unsigned int order) { - void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + void *ret; + + flags |= (__GFP_COMP | __GFP_KMEMCG); + ret = (void *) __get_free_pages(flags, order); kmemleak_alloc(ret, size, 1, flags); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65302a083d2..cc13797d0fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3086,6 +3086,9 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor); + if (new) + new->allocflags |= __GFP_KMEMCG; + kfree(name); return new; } diff --git a/mm/slab.c b/mm/slab.c index bab6fec765a..e265865e870 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1933,7 +1933,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) } if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -3486,6 +3486,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3571,6 +3573,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); diff --git a/mm/slub.c b/mm/slub.c index 6d5f2305d7a..ef39e872b8e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1405,7 +1405,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -2323,6 +2323,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: /* @@ -3284,7 +3285,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3390,7 +3391,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - __free_pages(page, compound_order(page)); + __free_memcg_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); From 1f458cbf122288b23620ee822e19bcbb76c8d6ec Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:50 -0800 Subject: [PATCH 33/48] memcg: destroy memcg caches Implement destruction of memcg caches. Right now, only caches where our reference counter is the last remaining are deleted. If there are any other reference counters around, we just leave the caches lying around until they go away. When that happens, a destruction function is called from the cache code. Caches are only destroyed in process context, so we queue them up for later processing in the general case. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 ++ include/linux/slab.h | 10 +++++- mm/memcontrol.c | 63 ++++++++++++++++++++++++++++++++++++++ mm/slab.c | 3 ++ mm/slab.h | 23 ++++++++++++++ mm/slub.c | 7 ++++- 6 files changed, 106 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2298122e71a..79fcf0cd718 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -453,6 +453,8 @@ void memcg_update_array_size(int num_groups); struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); +void mem_cgroup_destroy_cache(struct kmem_cache *cachep); + /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. diff --git a/include/linux/slab.h b/include/linux/slab.h index c0fcf28c15b..869efb8d237 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -11,6 +11,8 @@ #include #include +#include + /* * Flags to pass to kmem_cache_create(). @@ -179,7 +181,6 @@ void kmem_cache_free(struct kmem_cache *, void *); #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif - /* * This is the main placeholder for memcg-related information in kmem caches. * struct kmem_cache will hold a pointer to it, so the memory cost while @@ -197,6 +198,10 @@ void kmem_cache_free(struct kmem_cache *, void *); * @memcg: pointer to the memcg this cache belongs to * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from + * @dead: set to true after the memcg dies; the cache may still be around. + * @nr_pages: number of pages that belongs to this cache. + * @destroy: worker to be called whenever we are ready, or believe we may be + * ready, to destroy this cache. */ struct memcg_cache_params { bool is_root_cache; @@ -206,6 +211,9 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; + bool dead; + atomic_t nr_pages; + struct work_struct destroy; }; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cc13797d0fb..270a3678985 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2779,6 +2779,19 @@ static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); } +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. + */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ + struct kmem_cache *cachep; + + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +} + static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; @@ -3056,6 +3069,31 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + if (!atomic_read(&cachep->memcg_params->nr_pages)) + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. + */ + schedule_work(&cachep->memcg_params->destroy); +} + static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) { char *name; @@ -3125,6 +3163,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mem_cgroup_get(memcg); new_cachep->memcg_params->root_cache = cachep; + atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; /* @@ -3143,6 +3182,25 @@ struct create_work { struct work_struct work; }; +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + static void memcg_create_cache_work_func(struct work_struct *w) { struct create_work *cw; @@ -3358,6 +3416,10 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) VM_BUG_ON(mem_cgroup_is_root(memcg)); memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -5975,6 +6037,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/mm/slab.c b/mm/slab.c index e265865e870..7467343f9fe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1895,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (page->pfmemalloc) SetPageSlabPfmemalloc(page + i); } + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1931,6 +1932,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) __ClearPageSlab(page); page++; } + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); diff --git a/mm/slab.h b/mm/slab.h index c95e922b166..43d8a38b534 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -117,6 +117,21 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, (cachep->memcg_params->memcg == memcg); } +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { @@ -135,6 +150,14 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, return true; } +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { diff --git a/mm/slub.c b/mm/slub.c index ef39e872b8e..692177bebdf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1344,6 +1344,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1352,7 +1353,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); + memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1361,7 +1364,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1402,6 +1405,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + + memcg_release_pages(s, order); reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; From 7cf2798240a2a2230cb16a391beef98d8a7ad362 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:55 -0800 Subject: [PATCH 34/48] memcg/sl[au]b: track all the memcg children of a kmem_cache This enables us to remove all the children of a kmem_cache being destroyed, if for example the kernel module it's being used in gets unloaded. Otherwise, the children will still point to the destroyed parent. Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++ mm/memcontrol.c | 49 ++++++++++++++++++++++++++++++++++++-- mm/slab_common.c | 3 +++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 79fcf0cd718..e119f3ef793 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -454,6 +454,7 @@ struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); void mem_cgroup_destroy_cache(struct kmem_cache *cachep); +void kmem_cache_destroy_memcg_children(struct kmem_cache *s); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. @@ -601,6 +602,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { return cachep; } + +static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 270a3678985..4b68ec2c8df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2772,6 +2772,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +static DEFINE_MUTEX(set_limit_mutex); + #ifdef CONFIG_MEMCG_KMEM static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { @@ -3176,6 +3178,51 @@ out: return new_cachep; } +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should caught this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. + * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still have active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. + */ + c->memcg_params->dead = false; + cancel_delayed_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + struct create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; @@ -4284,8 +4331,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 1c424b6511b..080a43804bf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -249,6 +249,9 @@ EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) { + /* Destroy all the children caches if we aren't a memcg cache */ + kmem_cache_destroy_memcg_children(s); + get_online_cpus(); mutex_lock(&slab_mutex); s->refcount--; From 22933152934f30de6f05b600c03f8a08f853a8d2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:59 -0800 Subject: [PATCH 35/48] memcg/sl[au]b: shrink dead caches This means that when we destroy a memcg cache that happened to be empty, those caches may take a lot of time to go away: removing the memcg reference won't destroy them - because there are pending references, and the empty pages will stay there, until a shrinker is called upon for any reason. In this patch, we will call kmem_cache_shrink() for all dead caches that cannot be destroyed because of remaining pages. After shrinking, it is possible that it could be freed. If this is not the case, we'll schedule a lazy worker to keep trying. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b68ec2c8df..7633e0d429e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3080,7 +3080,27 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) cachep = memcg_params_to_cache(p); - if (!atomic_read(&cachep->memcg_params->nr_pages)) + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. + * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. + * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else kmem_cache_destroy(cachep); } @@ -3089,6 +3109,26 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) if (!cachep->memcg_params->dead) return; + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; /* * We have to defer the actual destroying to a workqueue, because * we might currently be in a context that cannot sleep. @@ -3217,7 +3257,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) * set, so flip it down to guarantee we are in control. */ c->memcg_params->dead = false; - cancel_delayed_work_sync(&c->memcg_params->destroy); + cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); } mutex_unlock(&set_limit_mutex); @@ -3242,7 +3282,7 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) cachep = memcg_params_to_cache(params); cachep->memcg_params->dead = true; INIT_WORK(&cachep->memcg_params->destroy, - kmem_cache_destroy_work_func); + kmem_cache_destroy_work_func); schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); From 749c54151a6e5b229e4ae067dbc651e54b161fbc Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:01 -0800 Subject: [PATCH 36/48] memcg: aggregate memcg cache values in slabinfo When we create caches in memcgs, we need to display their usage information somewhere. We'll adopt a scheme similar to /proc/meminfo, with aggregate totals shown in the global file, and per-group information stored in the group itself. For the time being, only reads are allowed in the per-group cache. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 +++++++ include/linux/slab.h | 4 ++++ mm/memcontrol.c | 30 +++++++++++++++++++++++++- mm/slab.h | 27 +++++++++++++++++++++++ mm/slab_common.c | 44 ++++++++++++++++++++++++++++++++++---- 5 files changed, 108 insertions(+), 5 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e119f3ef793..8dc7c746b44 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -420,6 +420,11 @@ static inline void sock_release_memcg(struct sock *sk) #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; + +extern int memcg_limited_groups_array_size; +#define for_each_memcg_cache_index(_idx) \ + for ((_idx) = 0; i < memcg_limited_groups_array_size; (_idx)++) + static inline bool memcg_kmem_enabled(void) { return static_key_false(&memcg_kmem_enabled_key); @@ -557,6 +562,9 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) return __memcg_kmem_get_cache(cachep, gfp); } #else +#define for_each_memcg_cache_index(_idx) \ + for (; NULL; ) + static inline bool memcg_kmem_enabled(void) { return false; diff --git a/include/linux/slab.h b/include/linux/slab.h index 869efb8d237..b9278663f22 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -220,6 +220,10 @@ struct memcg_cache_params { int memcg_update_all_caches(int num_memcgs); +struct seq_file; +int cache_show(struct kmem_cache *s, struct seq_file *m); +void print_slabinfo_header(struct seq_file *m); + /* * Common kmalloc functions provided by all allocators */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7633e0d429e..a32d83c2e35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -572,7 +572,8 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) * increase it. */ static DEFINE_IDA(kmem_limited_groups); -static int memcg_limited_groups_array_size; +int memcg_limited_groups_array_size; + /* * MIN_SIZE is different than 1, because we would like to avoid going through * the alloc/free process all the time. In a small machine, 4 kmem-limited @@ -2794,6 +2795,27 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; } +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; + + if (!memcg_can_account_kmem(memcg)) + return -EIO; + + print_slabinfo_header(m); + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); + + return 0; +} +#endif + static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; @@ -5822,6 +5844,12 @@ static struct cftype mem_cgroup_files[] = { .trigger = mem_cgroup_reset, .read = mem_cgroup_read, }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif #endif { }, /* terminate */ }; diff --git a/mm/slab.h b/mm/slab.h index 43d8a38b534..ec5dae1c8e7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -138,6 +138,23 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, return (p == s) || (s->memcg_params && (p == s->memcg_params->root_cache)); } + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return s->memcg_params->memcg_caches[idx]; +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -163,6 +180,16 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, { return true; } + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 080a43804bf..081f1b8d9a7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -322,7 +322,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) +void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -366,16 +366,43 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&slab_mutex); } -static int s_show(struct seq_file *m, void *p) +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); struct slabinfo sinfo; memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(s, &sinfo); + memcg_accumulate_slabinfo(s, &sinfo); + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - s->name, sinfo.active_objs, sinfo.num_objs, s->size, + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, sinfo.objects_per_slab, (1 << sinfo.cache_order)); seq_printf(m, " : tunables %4u %4u %4u", @@ -387,6 +414,15 @@ static int s_show(struct seq_file *m, void *p) return 0; } +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + /* * slabinfo_op - iterator that generates /proc/slabinfo * From 943a451a87d229ca564a27274b58eaeae35fde5d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:03 -0800 Subject: [PATCH 37/48] slab: propagate tunable values SLAB allows us to tune a particular cache behavior with tunables. When creating a new memcg cache copy, we'd like to preserve any tunables the parent cache already had. This could be done by an explicit call to do_tune_cpucache() after the cache is created. But this is not very convenient now that the caches are created from common code, since this function is SLAB-specific. Another method of doing that is taking advantage of the fact that do_tune_cpucache() is always called from enable_cpucache(), which is called at cache initialization. We can just preset the values, and then things work as expected. It can also happen that a root cache has its tunables updated during normal system operation. In this case, we will propagate the change to all caches that are already active. This change will require us to move the assignment of root_cache in memcg_params a bit earlier. We need this to be already set - which memcg_kmem_register_cache will do - when we reach __kmem_cache_create() Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++--- include/linux/slab.h | 2 +- mm/memcontrol.c | 10 +++++---- mm/slab.c | 44 +++++++++++++++++++++++++++++++++++--- mm/slab.h | 12 +++++++++++ mm/slab_common.c | 7 +++--- 6 files changed, 69 insertions(+), 14 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8dc7c746b44..ea02ff97083 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -448,7 +448,8 @@ void __memcg_kmem_commit_charge(struct page *page, void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s); +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache); void memcg_release_cache(struct kmem_cache *cachep); void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep); @@ -590,8 +591,9 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } -static inline int memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *s) +static inline int +memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) { return 0; } diff --git a/include/linux/slab.h b/include/linux/slab.h index b9278663f22..5d168d7e0a2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -130,7 +130,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); struct kmem_cache * kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, - unsigned long, void (*)(void *)); + unsigned long, void (*)(void *), struct kmem_cache *); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a32d83c2e35..f3009b4bae5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3012,7 +3012,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) { size_t size = sizeof(struct memcg_cache_params); @@ -3026,8 +3027,10 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) if (!s->memcg_params) return -ENOMEM; - if (memcg) + if (memcg) { s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } return 0; } @@ -3186,7 +3189,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, return NULL; new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, - (s->flags & ~SLAB_PANIC), s->ctor); + (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; @@ -3226,7 +3229,6 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, } mem_cgroup_get(memcg); - new_cachep->memcg_params->root_cache = cachep; atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; diff --git a/mm/slab.c b/mm/slab.c index 7467343f9fe..4dcbf96a77b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4041,7 +4041,7 @@ static void do_ccupdate_local(void *info) } /* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4084,12 +4084,48 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(cachep, i); + if (c) + /* return value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + /* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4131,7 +4167,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); diff --git a/mm/slab.h b/mm/slab.h index ec5dae1c8e7..34a98d64219 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -155,6 +155,13 @@ static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) { return s->memcg_params->memcg_caches[idx]; } + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -190,6 +197,11 @@ static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) { return NULL; } + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 081f1b8d9a7..3f3cd97d3fd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -164,7 +164,8 @@ unsigned long calculate_alignment(unsigned long flags, struct kmem_cache * kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) + size_t align, unsigned long flags, void (*ctor)(void *), + struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; int err = 0; @@ -193,7 +194,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, s->align = calculate_alignment(flags, align, size); s->ctor = ctor; - if (memcg_register_cache(memcg, s)) { + if (memcg_register_cache(memcg, s, parent_cache)) { kmem_cache_free(kmem_cache, s); err = -ENOMEM; goto out_locked; @@ -243,7 +244,7 @@ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor); + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); } EXPORT_SYMBOL(kmem_cache_create); From 107dab5c92d5f9c3afe962036e47c207363255c7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:05 -0800 Subject: [PATCH 38/48] slub: slub-specific propagation changes SLUB allows us to tune a particular cache behavior with sysfs-based tunables. When creating a new memcg cache copy, we'd like to preserve any tunables the parent cache already had. This can be done by tapping into the store attribute function provided by the allocator. We of course don't need to mess with read-only fields. Since the attributes can have multiple types and are stored internally by sysfs, the best strategy is to issue a ->show() in the root cache, and then ->store() in the memcg cache. The drawback of that, is that sysfs can allocate up to a page in buffering for show(), that we are likely not to need, but also can't guarantee. To avoid always allocating a page for that, we can update the caches at store time with the maximum attribute size ever stored to the root cache. We will then get a buffer big enough to hold it. The corolary to this, is that if no stores happened, nothing will be propagated. It can also happen that a root cache has its tunables updated during normal system operation. In this case, we will propagate the change to all caches that are already active. [akpm@linux-foundation.org: tweak code to avoid __maybe_unused] Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 1 + mm/slub.c | 76 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 364ba6c9fe2..9db4825cd39 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -103,6 +103,7 @@ struct kmem_cache { #endif #ifdef CONFIG_MEMCG_KMEM struct memcg_cache_params *memcg_params; + int max_attr_size; /* for propagation, maximum size of a stored attr */ #endif #ifdef CONFIG_NUMA diff --git a/mm/slub.c b/mm/slub.c index 692177bebdf..21c94d9695e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -201,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -3865,6 +3866,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (slab_state <= UP) return 0; + memcg_propagate_slab_attrs(s); mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5098,10 +5100,82 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg(s, i); + /* + * This function's return value is determined by the + * parent cache only + */ + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + + if (!is_root_cache(s)) + return; + + /* + * This mean this cache had no attribute written. Therefore, no point + * in copying default values around + */ + if (!s->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. + */ + if (buffer) + buf = buffer; + else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(s->memcg_params->root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, From 92e793495597af4135d94314113bf13eafb0e663 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:08 -0800 Subject: [PATCH 39/48] kmem: add slab-specific documentation about the kmem controller Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 5b5b6314377..8b8c28b9864 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -301,6 +301,13 @@ to trigger slab reclaim when those limits are reached. kernel memory, we prevent new processes from being created when the kernel memory usage is too high. +* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy +of each kmem_cache is created everytime the cache is touched by the first time +from inside the memcg. The creation is done lazily, so some objects can still be +skipped while the cache is being created. All objects in a slab page should +belong to the same memcg. This only fails to hold when a task is migrated to a +different memcg during the page allocation by the cache. + * sockets memory pressure: some sockets protocols have memory pressure thresholds. The Memory Controller allows them to be controlled individually per cgroup, instead of globally. From ebe945c27628fca03723582eba138acc2e2f3d15 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:10 -0800 Subject: [PATCH 40/48] memcg: add comments clarifying aspects of cache attribute propagation This patch clarifies two aspects of cache attribute propagation. First, the expected context for the for_each_memcg_cache macro in memcontrol.h. The usages already in the codebase are safe. In mm/slub.c, it is trivially safe because the lock is acquired right before the loop. In mm/slab.c, it is less so: the lock is acquired by an outer function a few steps back in the stack, so a VM_BUG_ON() is added to make sure it is indeed safe. A comment is also added to detail why we are returning the value of the parent cache and ignoring the children's when we propagate the attributes. Signed-off-by: Glauber Costa Cc: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/slab.c | 1 + mm/slub.c | 21 +++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ea02ff97083..0108a56f814 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -422,6 +422,12 @@ static inline void sock_release_memcg(struct sock *sk) extern struct static_key memcg_kmem_enabled_key; extern int memcg_limited_groups_array_size; + +/* + * Helper macro to loop through all memcg-specific caches. Callers must still + * check if the cache is valid (it is either valid or NULL). + * the slab_mutex must be held when looping through those caches + */ #define for_each_memcg_cache_index(_idx) \ for ((_idx) = 0; i < memcg_limited_groups_array_size; (_idx)++) diff --git a/mm/slab.c b/mm/slab.c index 4dcbf96a77b..e7667a3584b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4099,6 +4099,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, if ((ret < 0) || !is_root_cache(cachep)) return ret; + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); for_each_memcg_cache_index(i) { c = cache_from_memcg(cachep, i); if (c) diff --git a/mm/slub.c b/mm/slub.c index 21c94d9695e..efe2cffc29b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5108,12 +5108,25 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (s->max_attr_size < len) s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ for_each_memcg_cache_index(i) { struct kmem_cache *c = cache_from_memcg(s, i); - /* - * This function's return value is determined by the - * parent cache only - */ if (c) attribute->store(c, buf, len); } From 5413dfba88d6f6090c8cdf181ab9172d24752f8f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:13 -0800 Subject: [PATCH 41/48] slub: drop mutex before deleting sysfs entry Sasha Levin recently reported a lockdep problem resulting from the new attribute propagation introduced by kmemcg series. In short, slab_mutex will be called from within the sysfs attribute store function. This will create a dependency, that will later be held backwards when a cache is destroyed - since destruction occurs with the slab_mutex held, and then calls in to the sysfs directory removal function. In this patch, I propose to adopt a strategy close to what __kmem_cache_create does before calling sysfs_slab_add, and release the lock before the call to sysfs_slab_remove. This is pretty much the last operation in the kmem_cache_shutdown() path, so we could do better by splitting this and moving this call alone to later on. This will fit nicely when sysfs handling is consistent between all caches, but will look weird now. Lockdep info: ====================================================== [ INFO: possible circular locking dependency detected ] 3.7.0-rc4-next-20121106-sasha-00008-g353b62f #117 Tainted: G W ------------------------------------------------------- trinity-child13/6961 is trying to acquire lock: (s_active#43){++++.+}, at: sysfs_addrm_finish+0x31/0x60 but task is already holding lock: (slab_mutex){+.+.+.}, at: kmem_cache_destroy+0x22/0xe0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (slab_mutex){+.+.+.}: lock_acquire+0x1aa/0x240 __mutex_lock_common+0x59/0x5a0 mutex_lock_nested+0x3f/0x50 slab_attr_store+0xde/0x110 sysfs_write_file+0xfa/0x150 vfs_write+0xb0/0x180 sys_pwrite64+0x60/0xb0 tracesys+0xe1/0xe6 -> #0 (s_active#43){++++.+}: __lock_acquire+0x14df/0x1ca0 lock_acquire+0x1aa/0x240 sysfs_deactivate+0x122/0x1a0 sysfs_addrm_finish+0x31/0x60 sysfs_remove_dir+0x89/0xd0 kobject_del+0x16/0x40 __kmem_cache_shutdown+0x40/0x60 kmem_cache_destroy+0x40/0xe0 mon_text_release+0x78/0xe0 __fput+0x122/0x2d0 ____fput+0x9/0x10 task_work_run+0xbe/0x100 do_exit+0x432/0xbd0 do_group_exit+0x84/0xd0 get_signal_to_deliver+0x81d/0x930 do_signal+0x3a/0x950 do_notify_resume+0x3e/0x90 int_signal+0x12/0x17 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slab_mutex); lock(s_active#43); lock(slab_mutex); lock(s_active#43); *** DEADLOCK *** 2 locks held by trinity-child13/6961: #0: (mon_lock){+.+.+.}, at: mon_text_release+0x25/0xe0 #1: (slab_mutex){+.+.+.}, at: kmem_cache_destroy+0x22/0xe0 stack backtrace: Pid: 6961, comm: trinity-child13 Tainted: G W 3.7.0-rc4-next-20121106-sasha-00008-g353b62f #117 Call Trace: print_circular_bug+0x1fb/0x20c __lock_acquire+0x14df/0x1ca0 lock_acquire+0x1aa/0x240 sysfs_deactivate+0x122/0x1a0 sysfs_addrm_finish+0x31/0x60 sysfs_remove_dir+0x89/0xd0 kobject_del+0x16/0x40 __kmem_cache_shutdown+0x40/0x60 kmem_cache_destroy+0x40/0xe0 mon_text_release+0x78/0xe0 __fput+0x122/0x2d0 ____fput+0x9/0x10 task_work_run+0xbe/0x100 do_exit+0x432/0xbd0 do_group_exit+0x84/0xd0 get_signal_to_deliver+0x81d/0x930 do_signal+0x3a/0x950 do_notify_resume+0x3e/0x90 int_signal+0x12/0x17 Signed-off-by: Glauber Costa Reported-by: Sasha Levin Cc: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index efe2cffc29b..ba2ca53f6c3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3153,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { int rc = kmem_cache_close(s); - if (!rc) + if (!rc) { + /* + * We do the same lock strategy around sysfs_slab_add, see + * __kmem_cache_create. Because this is pretty much the last + * operation we do and the lock will be released shortly after + * that in slab_common.c, we could just move sysfs_slab_remove + * to a later point in common code. We should do that when we + * have a common sysfs framework for all allocators. + */ + mutex_unlock(&slab_mutex); sysfs_slab_remove(s); + mutex_lock(&slab_mutex); + } return rc; } From 5bbe1ec11fcf08af086ae214cdd4f4da0f25ca46 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 18 Dec 2012 14:23:16 -0800 Subject: [PATCH 42/48] Documentation: ABI: /sys/devices/system/node/ Describe NUMA node sysfs files/attributes. Note that for the specific dates and contacts I couldn't find, I left it as default for Oct 2002 and linux-mm. Signed-off-by: Davidlohr Bueso Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/stable/sysfs-devices-node | 96 ++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 49b82cad700..ce259c13c36 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -1,7 +1,101 @@ +What: /sys/devices/system/node/possible +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that could be possibly become online at some point. + +What: /sys/devices/system/node/online +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that are online. + +What: /sys/devices/system/node/has_normal_memory +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have regular memory. + +What: /sys/devices/system/node/has_cpu +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have one or more CPUs. + +What: /sys/devices/system/node/has_high_memory +Date: October 2002 +Contact: Linux Memory Management list +Description: + Nodes that have regular or high memory. + Depends on CONFIG_HIGHMEM. + What: /sys/devices/system/node/nodeX Date: October 2002 Contact: Linux Memory Management list Description: When CONFIG_NUMA is enabled, this is a directory containing information on node X such as what CPUs are local to the - node. + node. Each file is detailed next. + +What: /sys/devices/system/node/nodeX/cpumap +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's cpumap. + +What: /sys/devices/system/node/nodeX/cpulist +Date: October 2002 +Contact: Linux Memory Management list +Description: + The CPUs associated to the node. + +What: /sys/devices/system/node/nodeX/meminfo +Date: October 2002 +Contact: Linux Memory Management list +Description: + Provides information about the node's distribution and memory + utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt + +What: /sys/devices/system/node/nodeX/numastat +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's hit/miss statistics, in units of pages. + See Documentation/numastat.txt + +What: /sys/devices/system/node/nodeX/distance +Date: October 2002 +Contact: Linux Memory Management list +Description: + Distance between the node and all the other nodes + in the system. + +What: /sys/devices/system/node/nodeX/vmstat +Date: October 2002 +Contact: Linux Memory Management list +Description: + The node's zoned virtual memory statistics. + This is a superset of numastat. + +What: /sys/devices/system/node/nodeX/compact +Date: February 2010 +Contact: Mel Gorman +Description: + When this file is written to, all memory within that node + will be compacted. When it completes, memory will be freed + into blocks which have as many contiguous pages as possible + +What: /sys/devices/system/node/nodeX/scan_unevictable_pages +Date: October 2008 +Contact: Lee Schermerhorn +Description: + When set, it triggers scanning the node's unevictable lists + and move any pages that have become evictable onto the respective + zone's inactive list. See mm/vmscan.c + +What: /sys/devices/system/node/nodeX/hugepages/hugepages-/ +Date: December 2009 +Contact: Lee Schermerhorn +Description: + The node's huge page size control/query attributes. + See Documentation/vm/hugetlbpage.txt \ No newline at end of file From 7d12efaea7e74bf8f4953412514e836313fa32ec Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Dec 2012 14:23:17 -0800 Subject: [PATCH 43/48] mm/mprotect.c: coding-style cleanups A few gremlins have recently crept in. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 3dca970367d..94722a4d6b4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_NUMA_BALANCING static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { spin_lock(&mm->page_table_lock); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); @@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { BUG(); } #endif /* CONFIG_NUMA_BALANCING */ -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -304,7 +305,8 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); @@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { From 7179e7bf4592ac5a7b30257a7df6259ee81e51da Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 18 Dec 2012 14:23:19 -0800 Subject: [PATCH 44/48] mm/hugetlb: create hugetlb cgroup file in hugetlb_init Build kernel with CONFIG_HUGETLBFS=y,CONFIG_HUGETLB_PAGE=y and CONFIG_CGROUP_HUGETLB=y, then specify hugepagesz=xx boot option, system will fail to boot. This failure is caused by following code path: setup_hugepagesz hugetlb_add_hstate hugetlb_cgroup_file_init cgroup_add_cftypes kzalloc <--slab is *not available* yet For this path, slab is not available yet, so memory allocated will be failed, and cause WARN_ON() in hugetlb_cgroup_file_init(). So I move hugetlb_cgroup_file_init() into hugetlb_init(). [akpm@linux-foundation.org: tweak coding-style, remove pointless __init on inlined function] [akpm@linux-foundation.org: fix warning] Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reviewed-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 5 ++--- mm/hugetlb.c | 11 +---------- mm/hugetlb_cgroup.c | 19 +++++++++++++++++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index d73878c694b..ce8217f7b5c 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -62,7 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); -extern int hugetlb_cgroup_file_init(int idx) __init; +extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage); @@ -111,9 +111,8 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static inline int __init hugetlb_cgroup_file_init(int idx) +static inline void hugetlb_cgroup_file_init(void) { - return 0; } static inline void hugetlb_cgroup_migrate(struct page *oldhpage, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5318c7793a..4f3ea0b1e57 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); return 0; } @@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgoup details. - */ - if (order >= HUGETLB_CGROUP_MIN_ORDER) - hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b5bde7a5c01..9cea7de22ff 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -int __init hugetlb_cgroup_file_init(int idx) +static void __init __hugetlb_cgroup_file_init(int idx) { char buf[32]; struct cftype *cft; @@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - return 0; + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgroup details. + */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } } /* From 79a4dcefd3256896416f5a94d2d1442a7f5aa9b8 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 18 Dec 2012 14:23:24 -0800 Subject: [PATCH 45/48] mm/memory_hotplug.c: improve comments Signed-off-by: Tang Chen Cc: Jiang Liu Cc: Lai Jiangshan Cc: Wen Congyang Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962e353aa86..d04ed87bfac 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ static bool can_online_high_movable(struct zone *zone) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. + */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, From dc053733ea44babedb20266300b984d6add8b9e5 Mon Sep 17 00:00:00 2001 From: Abhijit Pawar Date: Tue, 18 Dec 2012 14:23:27 -0800 Subject: [PATCH 46/48] mm/kmemleak.c: remove obsolete simple_strtoul Replace the obsolete simple_strtoul() with kstrtoul(). Signed-off-by: Abhijit Pawar Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a217cc54406..752a705c77c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); From d37dd5dcb955dd8c2cdd4eaef1f15d1b7ecbc379 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 18 Dec 2012 14:23:28 -0800 Subject: [PATCH 47/48] vmscan: comment too_many_isolated() Comment "Why it's doing so" rather than "What it does" as proposed by Andrew Morton. Signed-off-by: Wu Fengguang Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f3096137b8..e73d0206ddd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) From 3cf23841b4b76eb94d3f8d0fb3627690e4431413 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 18 Dec 2012 14:23:31 -0800 Subject: [PATCH 48/48] mm/vmscan.c: avoid possible deadlock caused by too_many_isolated() Neil found that if too_many_isolated() returns true while performing direct reclaim we can end up waiting for other threads to complete their direct reclaim. If those threads are allowed to enter the FS or IO to free memory, but this thread is not, then it is possible that those threads will be waiting on this thread and so we get a circular deadlock. some task enters direct reclaim with GFP_KERNEL => too_many_isolated() false => vmscan and run into dirty pages => pageout() => take some FS lock => fs/block code does GFP_NOIO allocation => enter direct reclaim again => too_many_isolated() true => waiting for others to progress, however the other tasks may be circular waiting for the FS lock.. The fix is to let !__GFP_IO and !__GFP_FS direct reclaims enjoy higher priority than normal ones, by lowering the throttle threshold for the latter. Allowing ~1/8 isolated pages in normal is large enough. For example, for a 1GB LRU list, that's ~128MB isolated pages, or 1k blocked tasks (each isolates 32 4KB pages), or 64 blocked tasks per logical CPU (assuming 16 logical CPUs per NUMA node). So it's not likely some CPU goes idle waiting (when it could make progress) because of this limit: there are much more sleeping reclaim tasks than the number of CPU, so the task may well be blocked by some low level queue/lock anyway. Now !GFP_IOFS reclaims won't be waiting for GFP_IOFS reclaims to progress. They will be blocked only when there are too many concurrent !GFP_IOFS reclaims, however that's very unlikely because the IO-less direct reclaims is able to progress much more faster, and they won't deadlock each other. The threshold is raised high enough for them, so that there can be sufficient parallel progress of !GFP_IOFS reclaims. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Wu Fengguang Cc: Torsten Kaiser Tested-by: NeilBrown Reviewed-by: Minchan Kim Acked-by: KOSAKI Motohiro Acked-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index e73d0206ddd..828530e2794 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1202,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; }