diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bbd2221923c..6a7621b2b12 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -147,6 +147,7 @@ extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
					const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_fix_fork_child_flag(struct task_struct *p);
 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 
 #ifdef CONFIG_CPUSET
@@ -248,6 +249,10 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+}
+
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0e37cfa09f..2cda439ece4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,6 +932,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/fork.c b/kernel/fork.c
index c21bae8c93b..a02063903aa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,7 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->mempolicy = NULL;
 		goto bad_fork_cleanup_cpuset;
 	}
+	mpol_fix_fork_child_flag(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22..4f71cfd29c6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (p->mempolicy)
+		p->flags |= PF_MEMPOLICY;
+	else
+		p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+	mpol_fix_fork_child_flag(current);
+}
+
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index de516658d3d..f80b52388a1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->mempolicy && !in_interrupt())) {
-		int nid = slab_node(current->mempolicy);
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
-	if (unlikely(cpuset_do_slab_mem_spread() &&
-			(cachep->flags & SLAB_MEM_SPREAD) &&
-			!in_interrupt())) {
-		int nid = cpuset_mem_spread_node();
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
+	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
+					PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cachep, flags);
+		if (objp != NULL)
+			return objp;
 	}
 #endif
 
@@ -2855,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 }
 
 #ifdef CONFIG_NUMA
+/*
+ * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_alloc, nid_here;
+
+	if (in_interrupt())
+		return NULL;
+	nid_alloc = nid_here = numa_node_id();
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+	if (nid_alloc != nid_here)
+		return __cache_alloc_node(cachep, flags, nid_alloc);
+	return NULL;
+}
+
 /*
  * A interface to enable slab creation on nodeid
  */