diff --git a/include/linux/mm.h b/include/linux/mm.h index 40236290e2a..ffee2f74341 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -131,6 +131,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) +/* + * special vmas that are non-mergable, non-mlock()able + */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ec1a1baad34..b12f93a3c34 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -96,6 +96,7 @@ enum pageflags { PG_swapbacked, /* Page is backed by RAM/swap */ #ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ + PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR PG_uncached, /* Page has been mapped as uncached */ @@ -232,7 +233,17 @@ PAGEFLAG_FALSE(SwapCache) #ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) + +#define MLOCK_PAGES 1 +PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) + TESTSCFLAG(Mlocked, mlocked) + #else + +#define MLOCK_PAGES 0 +PAGEFLAG_FALSE(Mlocked) + SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) + PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) __CLEARPAGEFLAG_NOOP(Unevictable) @@ -354,15 +365,17 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) +#define __PG_UNEVICTABLE (1 << PG_unevictable) +#define __PG_MLOCKED (1 << PG_mlocked) #else -#define __PG_UNEVICTABLE 0 +#define __PG_UNEVICTABLE 0 +#define __PG_MLOCKED 0 #endif #define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ 1 << PG_buddy | 1 << PG_writeback | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE) + __PG_UNEVICTABLE | __PG_MLOCKED) /* * Flags checked in bad_page(). Pages on the free list should not have diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fed6f5e0b41..955667e6a52 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -117,6 +117,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * called in munlock()/munmap() path to check for other vmas holding + * the page mlocked. + */ +int try_to_munlock(struct page *); +#else +static inline int try_to_munlock(struct page *page) +{ + return 0; /* a.k.a. SWAP_SUCCESS */ +} +#endif + #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) @@ -140,5 +153,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_SUCCESS 0 #define SWAP_AGAIN 1 #define SWAP_FAIL 2 +#define SWAP_MLOCK 3 #endif /* _LINUX_RMAP_H */ diff --git a/mm/internal.h b/mm/internal.h index 3db17b2a1ac..4ebf0bef9a3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -61,6 +61,10 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +extern int mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +extern void munlock_vma_pages_all(struct vm_area_struct *vma); + #ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to @@ -79,6 +83,65 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) } #endif +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Called only in fault path via page_evictable() for a new page + * to determine if it's being mapped into a LOCKED vma. + * If so, mark page as mlocked. + */ +static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +{ + VM_BUG_ON(PageLRU(page)); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) + return 0; + + SetPageMlocked(page); + return 1; +} + +/* + * must be called with vma's mmap_sem held for read, and page locked. + */ +extern void mlock_vma_page(struct page *page); + +/* + * Clear the page's PageMlocked(). This can be useful in a situation where + * we want to unconditionally remove a page from the pagecache -- e.g., + * on truncation or freeing. + * + * It is legal to call this function for any page, mlocked or not. + * If called for a page that is still mapped by mlocked vmas, all we do + * is revert to lazy LRU behaviour -- semantics are not broken. + */ +extern void __clear_page_mlock(struct page *page); +static inline void clear_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) + __clear_page_mlock(page); +} + +/* + * mlock_migrate_page - called only from migrate_page_copy() to + * migrate the Mlocked page flag + */ +static inline void mlock_migrate_page(struct page *newpage, struct page *page) +{ + if (TestClearPageMlocked(page)) + SetPageMlocked(newpage); +} + + +#else /* CONFIG_UNEVICTABLE_LRU */ +static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +{ + return 0; +} +static inline void clear_page_mlock(struct page *page) { } +static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_migrate_page(struct page *new, struct page *old) { } + +#endif /* CONFIG_UNEVICTABLE_LRU */ /* * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, @@ -148,4 +211,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ +#define GUP_FLAGS_WRITE 0x1 +#define GUP_FLAGS_FORCE 0x2 +#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas); + #endif diff --git a/mm/memory.c b/mm/memory.c index 71cdefd1ef1..9fef7272fb9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -64,6 +64,8 @@ #include "internal.h" +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma) return !vma->vm_ops || !vma->vm_ops->fault; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); if (len <= 0) return 0; @@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pud_t *pud; pmd_t *pmd; pte_t *pte; - if (write) /* user gate pages are read-only */ + + /* user gate pages are read-only */ + if (!ignore && write) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) - || !(vm_flags & vma->vm_flags)) + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (len); return i; } + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + EXPORT_SYMBOL(get_user_pages); pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -1858,6 +1885,15 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2325,7 +2361,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_anon_rmap(page, vma, address); swap_free(entry); - if (vm_swap_full()) + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) remove_exclusive_swap_page(page); unlock_page(page); @@ -2465,6 +2501,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { diff --git a/mm/migrate.c b/mm/migrate.c index b10237d8b45..6802a7a3dfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -371,6 +371,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + mlock_migrate_page(newpage, page); + #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif diff --git a/mm/mlock.c b/mm/mlock.c index 01fbe93eff5..8746fe3f973 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -8,10 +8,18 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include +#include +#include + +#include "internal.h" int can_do_mlock(void) { @@ -23,17 +31,360 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). + */ + +/* + * LRU accounting for clear_page_mlock() + */ +void __clear_page_mlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + + if (!page->mapping) { /* truncated ? */ + return; + } + + if (!isolate_lru_page(page)) { + putback_lru_page(page); + } else { + /* + * Page not on the LRU yet. Flush all pagevecs and retry. + */ + lru_add_drain_all(); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } +} + +/* + * Mark page as mlocked if not already. + * If page on LRU, isolate and putback to move to unevictable list. + */ +void mlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (!TestSetPageMlocked(page) && !isolate_lru_page(page)) + putback_lru_page(page); +} + +/* + * called from munlock()/munmap() path with page supposedly on the LRU. + * + * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked + * [in try_to_munlock()] and then attempt to isolate the page. We must + * isolate the page to keep others from messing with its unevictable + * and mlocked state while trying to munlock. However, we pre-clear the + * mlocked state anyway as we might lose the isolation race and we might + * not get another chance to clear PageMlocked. If we successfully + * isolate the page and try_to_munlock() detects other VM_LOCKED vmas + * mapping the page, it will restore the PageMlocked state, unless the page + * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * perhaps redundantly. + * If we lose the isolation race, and the page is mapped by other VM_LOCKED + * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() + * either of which will restore the PageMlocked state by calling + * mlock_vma_page() above, if it can grab the vma's mmap sem. + */ +static void munlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (TestClearPageMlocked(page) && !isolate_lru_page(page)) { + try_to_munlock(page); + putback_lru_page(page); + } +} + +/* + * mlock a range of pages in the vma. + * + * This takes care of making the pages present too. + * + * vma->vm_mm->mmap_sem must be held for write. + */ +static int __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + struct page *pages[16]; /* 16 gives a reasonable batch */ + int write = !!(vma->vm_flags & VM_WRITE); + int nr_pages = (end - start) / PAGE_SIZE; + int ret; + + VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start || end > vma->vm_end); + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + + lru_add_drain_all(); /* push cached pages to LRU */ + + while (nr_pages > 0) { + int i; + + cond_resched(); + + /* + * get_user_pages makes pages present if we are + * setting mlock. and this extra reference count will + * disable migration of this page. However, page may + * still be truncated out from under us. + */ + ret = get_user_pages(current, mm, addr, + min_t(int, nr_pages, ARRAY_SIZE(pages)), + write, 0, pages, NULL); + /* + * This can happen for, e.g., VM_NONLINEAR regions before + * a page has been allocated and mapped at a given offset, + * or for addresses that map beyond end of a file. + * We'll mlock the the pages if/when they get faulted in. + */ + if (ret < 0) + break; + if (ret == 0) { + /* + * We know the vma is there, so the only time + * we cannot get a single page should be an + * error (ret < 0) case. + */ + WARN_ON(1); + break; + } + + lru_add_drain(); /* push cached pages to LRU */ + + for (i = 0; i < ret; i++) { + struct page *page = pages[i]; + + lock_page(page); + /* + * Because we lock page here and migration is blocked + * by the elevated reference, we need only check for + * page truncation (file-cache only). + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + put_page(page); /* ref from get_user_pages() */ + + /* + * here we assume that get_user_pages() has given us + * a list of virtually contiguous pages. + */ + addr += PAGE_SIZE; /* for next get_user_pages() */ + nr_pages--; + } + } + + lru_add_drain_all(); /* to update stats */ + + return 0; /* count entire vma as locked_vm */ +} + +/* + * private structure for munlock page table walk + */ +struct munlock_page_walk { + struct vm_area_struct *vma; + pmd_t *pmd; /* for migration_entry_wait() */ +}; + +/* + * munlock normal pages for present ptes + */ +static int __munlock_pte_handler(pte_t *ptep, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct munlock_page_walk *mpw = walk->private; + swp_entry_t entry; + struct page *page; + pte_t pte; + +retry: + pte = *ptep; + /* + * If it's a swap pte, we might be racing with page migration. + */ + if (unlikely(!pte_present(pte))) { + if (!is_swap_pte(pte)) + goto out; + entry = pte_to_swp_entry(pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr); + goto retry; + } + goto out; + } + + page = vm_normal_page(mpw->vma, addr, pte); + if (!page) + goto out; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + goto retry; + } + munlock_vma_page(page); + unlock_page(page); + +out: + return 0; +} + +/* + * Save pmd for pte handler for waiting on migration entries + */ +static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct munlock_page_walk *mpw = walk->private; + + mpw->pmd = pmd; + return 0; +} + + +/* + * munlock a range of pages in the vma using standard page table walk. + * + * vma->vm_mm->mmap_sem must be held for write. + */ +static void __munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct munlock_page_walk mpw = { + .vma = vma, + }; + struct mm_walk munlock_page_walk = { + .pmd_entry = __munlock_pmd_handler, + .pte_entry = __munlock_pte_handler, + .private = &mpw, + .mm = mm, + }; + + VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK); + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + + lru_add_drain_all(); /* push cached pages to LRU */ + walk_page_range(start, end, &munlock_page_walk); + lru_add_drain_all(); /* to update stats */ +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +/* + * Just make pages present if VM_LOCKED. No-op if unlocking. + */ +static int __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_flags & VM_LOCKED) + make_pages_present(start, end); + return 0; +} + +/* + * munlock a range of pages in the vma -- no-op. + */ +static void __munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} +#endif /* CONFIG_UNEVICTABLE_LRU */ + +/* + * mlock all pages in this vma range. For mmap()/mremap()/... + */ +int mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int nr_pages = (end - start) / PAGE_SIZE; + BUG_ON(!(vma->vm_flags & VM_LOCKED)); + + /* + * filter unlockable vmas + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto no_mlock; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current))) + return __mlock_vma_pages_range(vma, start, end); + + /* + * User mapped kernel pages or huge pages: + * make these pages present to populate the ptes, but + * fall thru' to reset VM_LOCKED--no need to unlock, and + * return nr_pages so these don't get counted against task's + * locked limit. huge pages are already counted against + * locked vm limit. + */ + make_pages_present(start, end); + +no_mlock: + vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ + return nr_pages; /* pages NOT mlocked */ +} + + +/* + * munlock all pages in vma. For munmap() and exit(). + */ +void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_LOCKED; + __munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes via make_pages_present(). + * + * For vmas that pass the filters, merge/split as appropriate. + */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned int newflags) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; - int pages; + int nr_pages; int ret = 0; + int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags) { - *prev = vma; - goto out; + if (newflags == vma->vm_flags || + (vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; /* don't set VM_LOCKED, don't count */ + + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current)) { + if (lock) + make_pages_present(start, end); + goto out; /* don't set VM_LOCKED, don't count */ } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -44,8 +395,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto success; } - *prev = vma; - if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) @@ -59,25 +408,32 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, } success: + /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; + /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, make_pages_present below will bring it back. + * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ vma->vm_flags = newflags; - /* - * Keep track of amount of locked VM. - */ - pages = (end - start) >> PAGE_SHIFT; - if (newflags & VM_LOCKED) { - pages = -pages; - if (!(newflags & VM_IO)) - ret = make_pages_present(start, end); - } + if (lock) { + ret = __mlock_vma_pages_range(vma, start, end); + if (ret > 0) { + mm->locked_vm -= ret; + ret = 0; + } + } else + __munlock_vma_pages_range(vma, start, end); - mm->locked_vm -= pages; out: + *prev = vma; return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index e7a5a68a9c2..7bdfd2661f1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) - static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { diff --git a/mm/nommu.c b/mm/nommu.c index ed75bc962fb..2696b24f2bb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -34,6 +34,8 @@ #include #include +#include "internal.h" + void *high_memory; struct page *mem_map; unsigned long max_mapnr; @@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. * - if 'force' is set, we only require the "MAY" flags. @@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* protect what we can, including chardevs */ if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - !(vm_flags & vma->vm_flags)) + (!ignore && !(vm_flags & vma->vm_flags))) goto finish_or_fault; if (pages) { @@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, finish_or_fault: return i ? : -EFAULT; } + + +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4125230a1b2..5886586fde6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -616,7 +616,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk +#ifdef CONFIG_UNEVICTABLE_LRU + | 1 << PG_mlocked +#endif + ); set_page_private(page, 0); set_page_refcounted(page); diff --git a/mm/rmap.c b/mm/rmap.c index e8d639b16c6..7e60df99018 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -53,6 +53,8 @@ #include +#include "internal.h" + struct kmem_cache *anon_vma_cachep; /** @@ -290,6 +292,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + + address = vma_address(page, vma); + if (address == -EFAULT) /* out of vma range */ + return 0; + pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); + if (!pte) /* the page is not in this mm */ + return 0; + pte_unmap_unlock(pte, ptl); + + return 1; +} + /* * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. @@ -311,10 +339,17 @@ static int page_referenced_one(struct page *page, if (!pte) goto out; + /* + * Don't want to elevate referenced for mlocked page that gets this far, + * in order that it progresses to try_to_unmap and is moved to the + * unevictable list. + */ if (vma->vm_flags & VM_LOCKED) { - referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young_notify(vma, address, pte)) + goto out_unmap; + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -323,6 +358,7 @@ static int page_referenced_one(struct page *page, rwsem_is_locked(&mm->mmap_sem)) referenced++; +out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: @@ -412,11 +448,6 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) - == (VM_LOCKED|VM_MAYSHARE)) { - referenced++; - break; - } referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; @@ -739,11 +770,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young_notify(vma, address, pte)))) { - ret = SWAP_FAIL; - goto out_unmap; - } + if (!migration) { + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_MLOCK; + goto out_unmap; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); @@ -824,12 +860,17 @@ out: * For very sparsely populated VMAs this is a little inefficient - chances are * there there won't be many ptes located within the scan cluster. In this case * maybe we could scan further - to the end of the pte page, perhaps. + * + * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can + * acquire it without blocking. If vma locked, mlock the pages in the cluster, + * rather than unmapping them. If we encounter the "check_page" that vmscan is + * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. */ #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) -static void try_to_unmap_cluster(unsigned long cursor, - unsigned int *mapcount, struct vm_area_struct *vma) +static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, + struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -841,6 +882,8 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; + int ret = SWAP_AGAIN; + int locked_vma = 0; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -851,15 +894,26 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - return; + return ret; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - return; + return ret; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) - return; + return ret; + + /* + * MLOCK_PAGES => feature is configured. + * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * keep the sem while scanning the cluster for mlocking pages. + */ + if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + locked_vma = (vma->vm_flags & VM_LOCKED); + if (!locked_vma) + up_read(&vma->vm_mm->mmap_sem); /* don't need it */ + } pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -872,6 +926,13 @@ static void try_to_unmap_cluster(unsigned long cursor, page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); + if (locked_vma) { + mlock_vma_page(page); /* no-op if already mlocked */ + if (page == check_page) + ret = SWAP_MLOCK; + continue; /* don't unmap */ + } + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; @@ -893,39 +954,104 @@ static void try_to_unmap_cluster(unsigned long cursor, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + if (locked_vma) + up_read(&vma->vm_mm->mmap_sem); + return ret; } -static int try_to_unmap_anon(struct page *page, int migration) +/* + * common handling for pages mapped in VM_LOCKED vmas + */ +static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) +{ + int mlocked = 0; + + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + mlocked++; /* really mlocked the page */ + } + up_read(&vma->vm_mm->mmap_sem); + } + return mlocked; +} + +/** + * try_to_unmap_anon - unmap or unlock anonymous page using the object-based + * rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * This function is only called from try_to_unmap/try_to_munlock for + * anonymous pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. + */ +static int try_to_unmap_anon(struct page *page, int unlock, int migration) { struct anon_vma *anon_vma; struct vm_area_struct *vma; + unsigned int mlocked = 0; int ret = SWAP_AGAIN; + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) + continue; /* must visit all unlocked vmas */ + ret = SWAP_MLOCK; /* saw at least one mlocked vma */ + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + break; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } page_unlock_anon_vma(anon_vma); + + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + return ret; } /** - * try_to_unmap_file - unmap file page using the object-based rmap method - * @page: the page to unmap - * @migration: migration flag + * try_to_unmap_file - unmap/unlock file page using the object-based rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. * - * This function is only called from try_to_unmap for object-based pages. + * This function is only called from try_to_unmap/try_to_munlock for + * object-based pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int migration) +static int try_to_unmap_file(struct page *page, int unlock, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -936,20 +1062,44 @@ static int try_to_unmap_file(struct page *page, int migration) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; + unsigned int mlocked = 0; + + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + goto out; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } + if (mlocked) + goto out; + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; /* leave mlocked == 0 */ + goto out; /* no need to look further */ + } + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -959,7 +1109,7 @@ static int try_to_unmap_file(struct page *page, int migration) max_nl_size = cursor; } - if (max_nl_size == 0) { /* any nonlinears locked or reserved */ + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ ret = SWAP_FAIL; goto out; } @@ -983,12 +1133,16 @@ static int try_to_unmap_file(struct page *page, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (!MLOCK_PAGES && !migration && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - try_to_unmap_cluster(cursor, &mapcount, vma); + ret = try_to_unmap_cluster(cursor, &mapcount, + vma, page); + if (ret == SWAP_MLOCK) + mlocked = 2; /* to return below */ cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -1009,6 +1163,10 @@ static int try_to_unmap_file(struct page *page, int migration) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } @@ -1024,6 +1182,7 @@ out: * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable + * SWAP_MLOCK - page is mlocked. */ int try_to_unmap(struct page *page, int migration) { @@ -1032,12 +1191,36 @@ int try_to_unmap(struct page *page, int migration) BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, migration); + ret = try_to_unmap_anon(page, 0, migration); else - ret = try_to_unmap_file(page, migration); - - if (!page_mapped(page)) + ret = try_to_unmap_file(page, 0, migration); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; } +#ifdef CONFIG_UNEVICTABLE_LRU +/** + * try_to_munlock - try to munlock a page + * @page: the page to be munlocked + * + * Called from munlock code. Checks all of the VMAs mapping the page + * to make sure nobody else has this page mlocked. The page will be + * returned with PG_mlocked cleared if no other vmas have it mlocked. + * + * Return values are: + * + * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_MLOCK - page is now mlocked. + */ +int try_to_munlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + + if (PageAnon(page)) + return try_to_unmap_anon(page, 1, 0); + else + return try_to_unmap_file(page, 1, 0); +} +#endif diff --git a/mm/swap.c b/mm/swap.c index fee6b973f14..bc58c1369dd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -278,7 +278,7 @@ void lru_add_drain(void) put_cpu(); } -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); diff --git a/mm/vmscan.c b/mm/vmscan.c index dfb342e0db9..e5aaaad159e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -582,11 +582,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) { - unlock_page(page); - putback_lru_page(page); - continue; - } + if (unlikely(!page_evictable(page, NULL))) + goto cull_mlocked; if (!sc->may_swap && page_mapped(page)) goto keep_locked; @@ -624,9 +621,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) + if (PageAnon(page) && !PageSwapCache(page)) { + switch (try_to_munlock(page)) { + case SWAP_FAIL: /* shouldn't happen */ + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* fall thru'; add to swap cache */ + } if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; + } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -641,6 +648,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -731,6 +740,11 @@ free_it: } continue; +cull_mlocked: + unlock_page(page); + putback_lru_page(page); + continue; + activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) @@ -742,7 +756,7 @@ keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) @@ -2329,12 +2343,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. + * lists vs unevictable list. The vma argument is !NULL when called from the + * fault path to determine how to instantate a new page. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA * - * TODO - later patches */ int page_evictable(struct page *page, struct vm_area_struct *vma) { @@ -2342,7 +2357,8 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) if (mapping_unevictable(page_mapping(page))) return 0; - /* TODO: test page [!]evictable conditions */ + if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + return 0; return 1; }