mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance, which is a bit fragile.

This patch throttles direct reclaimers if half of the PF_MEMALLOC
reserves are in use. If the system is routinely getting throttled, the
system administrator can increase min_free_kbytes so degradation is
smoother, but the system will keep running.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 7f338fe454
commit 5515061d22

3 changed files with 122 additions and 8 deletions
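As a rough illustration of the throttling decision this patch introduces (the pfmemalloc_watermark_ok() check added to mm/vmscan.c below), here is a minimal userspace sketch of the "more than half the reserve in use" test. It is not kernel code; the zone figures and the struct name are made up for the example.

/*
 * Simplified illustration (not kernel code): a direct reclaimer is
 * throttled once the free pages of the lower zones fall below half of
 * their combined min watermarks (the PF_MEMALLOC reserve).
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_sample {
        unsigned long min_wmark_pages;  /* zone min watermark, in pages */
        unsigned long free_pages;       /* current free pages in the zone */
};

static bool pfmemalloc_watermark_ok(const struct zone_sample *zones, int nr)
{
        unsigned long reserve = 0, free = 0;

        for (int i = 0; i < nr; i++) {
                reserve += zones[i].min_wmark_pages;
                free += zones[i].free_pages;
        }
        /* Throttle once more than half of the reserve is consumed. */
        return free > reserve / 2;
}

int main(void)
{
        /* Hypothetical ZONE_DMA and ZONE_NORMAL values on a small machine. */
        struct zone_sample zones[] = { { 128, 40 }, { 4096, 1500 } };

        printf("throttle direct reclaim: %s\n",
               pfmemalloc_watermark_ok(zones, 2) ? "no" : "yes");
        return 0;
}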
include/linux/mmzone.h

@@ -705,6 +705,7 @@ typedef struct pglist_data {
                                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
+       wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd;     /* Protected by lock_memory_hotplug() */
        int kswapd_max_order;
        enum zone_type classzone_idx;
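The new per-node pfmemalloc_wait queue is where throttled direct reclaimers sleep; kswapd wakes it once the watermarks recover, as the mm/vmscan.c hunks below show.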
mm/page_alloc.c

@@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
        pgdat_resize_init(pgdat);
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait);
+       init_waitqueue_head(&pgdat->pfmemalloc_wait);
        pgdat->kswapd_max_order = 0;
        pgdat_page_cgroup_init(pgdat);
mm/vmscan.c (128 lines changed)
@@ -2112,6 +2112,80 @@ out:
        return 0;
 }

+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+       struct zone *zone;
+       unsigned long pfmemalloc_reserve = 0;
+       unsigned long free_pages = 0;
+       int i;
+       bool wmark_ok;
+
+       for (i = 0; i <= ZONE_NORMAL; i++) {
+               zone = &pgdat->node_zones[i];
+               pfmemalloc_reserve += min_wmark_pages(zone);
+               free_pages += zone_page_state(zone, NR_FREE_PAGES);
+       }
+
+       wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+       /* kswapd must be awake if processes are being throttled */
+       if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+               pgdat->classzone_idx = min(pgdat->classzone_idx,
+                                               (enum zone_type)ZONE_NORMAL);
+               wake_up_interruptible(&pgdat->kswapd_wait);
+       }
+
+       return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+                                       nodemask_t *nodemask)
+{
+       struct zone *zone;
+       int high_zoneidx = gfp_zone(gfp_mask);
+       pg_data_t *pgdat;
+
+       /*
+        * Kernel threads should not be throttled as they may be indirectly
+        * responsible for cleaning pages necessary for reclaim to make forward
+        * progress. kjournald for example may enter direct reclaim while
+        * committing a transaction where throttling it could force other
+        * processes to block on log_wait_commit().
+        */
+       if (current->flags & PF_KTHREAD)
+               return;
+
+       /* Check if the pfmemalloc reserves are ok */
+       first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+       pgdat = zone->zone_pgdat;
+       if (pfmemalloc_watermark_ok(pgdat))
+               return;
+
+       /*
+        * If the caller cannot enter the filesystem, it's possible that it
+        * is due to the caller holding an FS lock or performing a journal
+        * transaction in the case of a filesystem like ext[3|4]. In this case,
+        * it is not safe to block on pfmemalloc_wait as kswapd could be
+        * blocked waiting on the same lock. Instead, throttle for up to a
+        * second before continuing.
+        */
+       if (!(gfp_mask & __GFP_FS)) {
+               wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+                       pfmemalloc_watermark_ok(pgdat), HZ);
+               return;
+       }
+
+       /* Throttle until kswapd wakes the process */
+       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+               pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                gfp_t gfp_mask, nodemask_t *nodemask)
 {
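The throttling above is built on the standard kernel waitqueue primitives used in the hunk (wait_event_killable(), wait_event_interruptible_timeout(), waitqueue_active(), wake_up()). As a schematic sketch of that sleep/wake pattern, outside the context of this patch and with illustrative names:

/*
 * Schematic sketch only: a reclaimer blocks on a wait_queue_head_t until
 * a condition becomes true, and kswapd wakes the waiters once the
 * watermark is restored. The "example_" names are made up.
 */
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_pfmemalloc_wait);
static bool example_watermark_ok;

/* Reclaimer side: block killably until the condition becomes true. */
static void example_throttle(void)
{
        wait_event_killable(example_pfmemalloc_wait, example_watermark_ok);
}

/* kswapd side: update the condition, then wake any throttled tasks. */
static void example_release(void)
{
        example_watermark_ok = true;
        if (waitqueue_active(&example_pfmemalloc_wait))
                wake_up(&example_pfmemalloc_wait);
}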
@@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .gfp_mask = sc.gfp_mask,
        };

+       throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+       /*
+        * Do not enter reclaim if fatal signal is pending. 1 is returned so
+        * that the page allocator does not consider triggering OOM
+        */
+       if (fatal_signal_pending(current))
+               return 1;
+
        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
                                gfp_mask);
@@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
        return balanced_pages >= (present_pages >> 2);
 }

-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                                        int classzone_idx)
 {
        int i;
@@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,

        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
-               return true;
+               return false;
+
+       /*
+        * There is a potential race between when kswapd checks its watermarks
+        * and a process gets throttled. There is also a potential race if
+        * processes get throttled, kswapd wakes, a large process exits thereby
+        * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+        * is going to sleep, no process should be sleeping on pfmemalloc_wait
+        * so wake them now if necessary. If necessary, processes will wake
+        * kswapd and get throttled again
+        */
+       if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+               wake_up(&pgdat->pfmemalloc_wait);
+               return false;
+       }

        /* Check the watermark levels */
        for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
         * must be balanced
         */
        if (order)
-               return !pgdat_balanced(pgdat, balanced, classzone_idx);
+               return pgdat_balanced(pgdat, balanced, classzone_idx);
        else
-               return !all_zones_ok;
+               return all_zones_ok;
 }

 /*
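Note that the rename from sleeping_prematurely() to prepare_kswapd_sleep() also inverts the return value: the helper now answers "is kswapd ready to sleep?" rather than "is this sleep premature?". As the kswapd_try_to_sleep() hunks further down show, callers simply drop the negation:

        /* before */
        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { ... }

        /* after */
        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { ... }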
@@ -2546,6 +2648,16 @@ loop_again:
                        }

                }
+
+               /*
+                * If the low watermark is met there is no need for processes
+                * to be throttled on pfmemalloc_wait as they should now be
+                * able to safely make forward progress. Wake them
+                */
+               if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+                               pfmemalloc_watermark_ok(pgdat))
+                       wake_up(&pgdat->pfmemalloc_wait);
+
                if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                        break;          /* kswapd: all done */
                /*
@@ -2647,7 +2759,7 @@ out:
        }

        /*
-        * Return the order we were reclaiming at so sleeping_prematurely()
+        * Return the order we were reclaiming at so prepare_kswapd_sleep()
         * makes a decision on the order we were last reclaiming at. However,
         * if another caller entered the allocator slow path while kswapd
         * was awake, order will remain at the higher level
@@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

        /* Try to sleep for a short interval */
-       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

                /*