diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f72370b440b1..3c2e3c372782 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -30,6 +30,7 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold +- extra_free_kbytes - hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode @@ -236,6 +237,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500. ============================================================== +extra_free_kbytes + +This parameter tells the VM to keep extra free memory between the threshold +where background reclaim (kswapd) kicks in, and the threshold where direct +reclaim (by allocating processes) kicks in. + +This is useful for workloads that require low latency memory allocations +and have a bounded burstiness in memory allocations, for example a +realtime application that receives and transmits network traffic +(causing in-kernel memory allocations) with a maximum total message burst +size of 200MB may need 200MB of extra free memory to avoid direct reclaim +related latencies. + +============================================================== + hugepages_treat_as_movable This parameter controls whether we can allocate hugepages from ZONE_MOVABLE diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..81fe3bde9fee 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -104,6 +104,7 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; +extern int extra_free_kbytes; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; extern int compat_log; @@ -1392,6 +1393,14 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "extra_free_kbytes", + .data = &extra_free_kbytes, + .maxlen = sizeof(extra_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = &zero, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d666df5ef95..89ef8347900c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -238,9 +238,21 @@ compound_page_dtor * const compound_page_dtors[] = { #endif }; +/* + * Try to keep at least this much lowmem free. Do not allow normal + * allocations below this point, only high priority ones. Automatically + * tuned according to the amount of memory in the system. + */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +/* + * Extra memory for the system to try freeing. Used to temporarily + * free memory, to make space for new workloads. Anyone can allocate + * down to the min watermarks controlled by min_free_kbytes above. + */ +int extra_free_kbytes = 0; + static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; @@ -6015,6 +6027,7 @@ static void setup_per_zone_lowmem_reserve(void) static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -6026,11 +6039,14 @@ static void __setup_per_zone_wmarks(void) } for_each_zone(zone) { - u64 tmp; + u64 min, low; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; - do_div(tmp, lowmem_pages); + min = (u64)pages_min * zone->managed_pages; + do_div(min, lowmem_pages); + low = (u64)pages_low * zone->managed_pages; + do_div(low, vm_total_pages); + if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -6051,11 +6067,13 @@ static void __setup_per_zone_wmarks(void) * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = tmp; + zone->watermark[WMARK_MIN] = min; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + low + (min >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + low + (min >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6178,7 +6196,7 @@ module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes - * changes. + * or extra_free_kbytes changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)