diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a73454aec33..20b51cab659 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +/* + * reading current mems_allowed and mempolicy in the fastpath must be protected + * by get_mems_allowed() + */ +static inline void get_mems_allowed(void) +{ + current->mems_allowed_change_disable++; + + /* + * ensure that reading mems_allowed and mempolicy happens after the + * update of ->mems_allowed_change_disable. + * + * the write-side task finds ->mems_allowed_change_disable is not 0, + * and knows the read-side task is reading mems_allowed or mempolicy, + * so it will clear old bits lazily. + */ + smp_mb(); +} + +static inline void put_mems_allowed(void) +{ + /* + * ensure that reading mems_allowed and mempolicy happens before + * reducing mems_allowed_change_disable. + * + * the write-side task will know that the read-side task is still + * reading mems_allowed or mempolicy, and won't clear old bits in the + * nodemask.
+ */ + smp_mb(); + --ACCESS_ONCE(current->mems_allowed_change_disable); +} + static inline void set_mems_allowed(nodemask_t nodemask) { + task_lock(current); current->mems_allowed = nodemask; + task_unlock(current); } #else /* !CONFIG_CPUSETS */ @@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } +static inline void get_mems_allowed(void) +{ +} + +static inline void put_mems_allowed(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b55e988988b..415b8f8a3f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1421,6 +1421,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ + int mems_allowed_change_disable; /* >0 inside get/put_mems_allowed() */ int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS diff --git a/kernel/cpuset.c b/kernel/cpuset.c index db0990ac3fa..61d6af7fa67 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * In order to avoid seeing no nodes if the old and new nodes are disjoint, * we structure updates as setting all new allowed nodes, then clearing newly * disallowed ones. - * - * Called with task's alloc_lock held */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere.
+ */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure ->mems_allowed_change_disable is checked after setting all + * new allowed nodes. + * + * the read-side task can then see a nodemask with both the new and the + * old allowed nodes, so if it allocates a page while cpuset is clearing + * the newly disallowed ones, it can still see the new allowed bits. + * + * If the new allowed nodes were set after the check, setting them and + * clearing the newly disallowed ones could happen back to back, and + * the read-side task might find no node to allocate a page from. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. + */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure ->mems_allowed_change_disable is checked before clearing all + * newly disallowed nodes. + * + * if the newly disallowed bits were cleared before the check, the + * read-side task might find no node to allocate a page from.
+ */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + task_unlock(tsk); } /* @@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p, cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, newmems); - task_lock(p); cpuset_change_task_nodemask(p, newmems); - task_unlock(p); NODEMASK_FREE(newmems); @@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - task_lock(tsk); cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); } diff --git a/kernel/exit.c b/kernel/exit.c index eabca5a73a8..019a2843bf9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) diff --git a/mm/filemap.c b/mm/filemap.c index d6f4f073836..88d719665a2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c9e6bbf377..54d42b009db 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; 
struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8a993db8802..721b2b33803 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1639,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist.
+ * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1684,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) if (!(mask && current->mempolicy)) return false; + task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: @@ -1703,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) default: BUG(); } + task_unlock(current); return true; } @@ -1750,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1766,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1796,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return 
alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 596180fedd3..f7da2a2934b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1990,10 +1990,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2003,6 +2006,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; diff --git a/mm/slab.c b/mm/slab.c index 50a73fca19c..02786e1a32d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_node_id(); + get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_mem_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + 
get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3302,6 +3305,7 @@ retry: } } } + put_mems_allowed(); return obj; } diff --git a/mm/slub.c b/mm/slub.c index e46e3129697..26f0cb9cc58 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ff3311447f..f2c367c9ec1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1774,6 +1774,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) @@ -1857,6 +1858,7 @@ out: mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); delayacct_freepages_end(); + put_mems_allowed(); return ret; }