From a3437870160cf2caaac6bdd76c7377a5a4145a8c Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 23 Jul 2008 21:27:44 -0700 Subject: [PATCH] hugetlb: new sysfs interface Provide new hugepages user APIs that are more suited to multiple hstates in sysfs. There is a new directory, /sys/kernel/hugepages. Underneath that directory there will be a directory per-supported hugepage size, e.g.: /sys/kernel/hugepages/hugepages-64kB /sys/kernel/hugepages/hugepages-16384kB /sys/kernel/hugepages/hugepages-16777216kB corresponding to 64k, 16m and 16g respectively. Within each hugepages-size directory there are a number of files, corresponding to the tracked counters in the hstate, e.g.: /sys/kernel/hugepages/hugepages-64/nr_hugepages /sys/kernel/hugepages/hugepages-64/nr_overcommit_hugepages /sys/kernel/hugepages/hugepages-64/free_hugepages /sys/kernel/hugepages/hugepages-64/resv_hugepages /sys/kernel/hugepages/hugepages-64/surplus_hugepages Of these files, the first two are read-write and the latter three are read-only. The size of the hugepage being manipulated is trivially deducible from the enclosing directory and is always expressed in kB (to match meminfo). [dave@linux.vnet.ibm.com: fix build] [nacc@us.ibm.com: hugetlb: hang off of /sys/kernel/mm rather than /sys/kernel] [nacc@us.ibm.com: hugetlb: remove CONFIG_SYSFS dependency] Acked-by: Greg Kroah-Hartman Signed-off-by: Nishanth Aravamudan Signed-off-by: Nick Piggin Cc: Dave Hansen Signed-off-by: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-kernel-mm-hugepages | 15 + Documentation/vm/hugetlbpage.txt | 23 ++ include/linux/hugetlb.h | 2 + mm/hugetlb.c | 288 ++++++++++++++---- 4 files changed, 262 insertions(+), 66 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-hugepages diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages new file mode 100644 index 00000000000..e21c00571cf --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -0,0 +1,15 @@ +What: /sys/kernel/mm/hugepages/ +Date: June 2008 +Contact: Nishanth Aravamudan , hugetlb maintainers +Description: + /sys/kernel/mm/hugepages/ contains a number of subdirectories + of the form hugepages-kB, where is the page size + of the hugepages supported by the kernel/CPU combination. + + Under these directories are a number of files: + nr_hugepages + nr_overcommit_hugepages + free_hugepages + surplus_hugepages + resv_hugepages + See Documentation/vm/hugetlbpage.txt for details. diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 3102b81bef8..8a5b5763f0f 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -95,6 +95,29 @@ this condition holds, however, no more surplus huge pages will be allowed on the system until one of the two sysctls are increased sufficiently, or the surplus huge pages go out of use and are freed. +With support for multiple hugepage pools at run-time available, much of +the hugepage userspace interface has been duplicated in sysfs. The above +information applies to the default hugepage size (which will be +controlled by the proc interfaces for backwards compatibility). The root +hugepage control directory is + + /sys/kernel/mm/hugepages + +For each hugepage size supported by the running kernel, a subdirectory +will exist, of the form + + hugepages-${size}kB + +Inside each of these directories, the same set of files will exist: + + nr_hugepages + nr_overcommit_hugepages + free_hugepages + resv_hugepages + surplus_hugepages + +which function as described above for the default hugepage-sized case. + If the user applications are going to request hugepages using mmap system call, then it is required that system administrator mount a file system of type hugetlbfs: diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ba9263e631b..58c0de32e7f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -164,6 +164,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #ifdef CONFIG_HUGETLB_PAGE +#define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { int hugetlb_next_nid; @@ -179,6 +180,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + char name[HSTATE_NAME_LEN]; }; void __init hugetlb_add_hstate(unsigned order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4cf7a90e914..bb49ce5d006 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -942,72 +943,6 @@ static void __init report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; - } - default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; - - hugetlb_init_hstates(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_one_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". - */ - if (!max_hstate) - mhp = &default_hstate_max_huge_pages; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -1105,6 +1040,227 @@ out: return ret; } +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input); + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, + hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[h - hstates], + &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) { + kobject_put(hstate_kobjs[h - hstates]); + } + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + } + default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + + hugetlb_init_hstates(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + hugetlb_init_one_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos)