diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 34cbefd2ebd..93a849f742d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -149,14 +149,16 @@ struct zone { unsigned long pages_scanned; /* since last reclaim */ int all_unreclaimable; /* All pages pinned */ - /* - * Does the allocator try to reclaim pages from the zone as soon - * as it fails a watermark_ok() in __alloc_pages? - */ - int reclaim_pages; /* A count of how many reclaimers are scanning this zone */ atomic_t reclaim_in_progress; + /* + * timestamp (in jiffies) of the last zone reclaim that did not + * result in freeing of pages. This is used to avoid repeated scans + * if all memory in the zone is in use. + */ + unsigned long last_unsuccessful_zone_reclaim; + /* * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim diff --git a/include/linux/swap.h b/include/linux/swap.h index d01f7efb0f2..4a99e4a7fbf 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t); extern int shrink_all_memory(int); extern int vm_swappiness; +#ifdef CONFIG_NUMA +extern int zone_reclaim_mode; +extern int zone_reclaim(struct zone *, gfp_t, unsigned int); +#else +#define zone_reclaim_mode 0 +static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) +{ + return 0; +} +#endif + #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); diff --git a/include/linux/topology.h b/include/linux/topology.h index 315a5163d6a..e8eb0040ce3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -56,6 +56,14 @@ #define REMOTE_DISTANCE 20 #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif +#ifndef RECLAIM_DISTANCE +/* + * If the distance between nodes in a system is larger than RECLAIM_DISTANCE + * (in whatever arch specific measurement units returned by node_distance()) + * then switch on zone reclaim on boot. + */ +#define RECLAIM_DISTANCE 20 +#endif #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2e29743a8d..df54e2fc8ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, mark = (*z)->pages_high; if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) - continue; + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; } page = buffered_rmqueue(zonelist, *z, order, gfp_mask); @@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + + if (distance != node_distance(local_node, prev_node)) node_load[node] += load; prev_node = node; load--; diff --git a/mm/vmscan.c b/mm/vmscan.c index e5117b6897a..2e34b61a70c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1572,3 +1572,71 @@ static int __init kswapd_init(void) } module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Zone reclaim mode + * + * If non-zero call zone_reclaim when the number of free pages falls below + * the watermarks. + * + * In the future we may add flags to the mode. However, the page allocator + * should only have to check that zone_reclaim_mode != 0 before calling + * zone_reclaim(). + */ +int zone_reclaim_mode __read_mostly; + +/* + * Mininum time between zone reclaim scans + */ +#define ZONE_RECLAIM_INTERVAL HZ/2 +/* + * Try to free up some pages from this zone through reclaim. + */ +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + int nr_pages = 1 << order; + struct task_struct *p = current; + struct reclaim_state reclaim_state; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = 0, + .may_swap = 0, + .nr_mapped = read_page_state(nr_mapped), + .nr_scanned = 0, + .nr_reclaimed = 0, + .priority = 0 + }; + + if (!(gfp_mask & __GFP_WAIT) || + zone->zone_pgdat->node_id != numa_node_id() || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0) + return 0; + + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + return 0; + + disable_swap_token(); + + if (nr_pages > SWAP_CLUSTER_MAX) + sc.swap_cluster_max = nr_pages; + else + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + + cond_resched(); + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + shrink_zone(zone, &sc); + p->reclaim_state = NULL; + current->flags &= ~PF_MEMALLOC; + + if (sc.nr_reclaimed == 0) + zone->last_unsuccessful_zone_reclaim = jiffies; + + return sc.nr_reclaimed > nr_pages; +} +#endif +