android_kernel_oneplus_msm8998/mm
Miles Chen 3d29e6420b mm/memcontrol.c: fix use after free in mem_cgroup_iter()
commit 54a83d6bcbf8f4700013766b974bf9190d40b689 upstream.

This patch is sent to report an use after free in mem_cgroup_iter()
after merging commit be2657752e9e ("mm: memcg: fix use after free in
mem_cgroup_iter()").

I work with android kernel tree (4.9 & 4.14), and commit be2657752e9e
("mm: memcg: fix use after free in mem_cgroup_iter()") has been merged
to the trees.  However, I can still observe use after free issues
addressed in the commit be2657752e9e.  (on low-end devices, a few times
this month)

backtrace:
        css_tryget <- crash here
        mem_cgroup_iter
        shrink_node
        shrink_zones
        do_try_to_free_pages
        try_to_free_pages
        __perform_reclaim
        __alloc_pages_direct_reclaim
        __alloc_pages_slowpath
        __alloc_pages_nodemask

To debug, I poisoned mem_cgroup before freeing it:

  static void __mem_cgroup_free(struct mem_cgroup *memcg)
        for_each_node(node)
        free_mem_cgroup_per_node_info(memcg, node);
        free_percpu(memcg->stat);
  +     /* poison memcg before freeing it */
  +     memset(memcg, 0x78, sizeof(struct mem_cgroup));
        kfree(memcg);
  }

The coredump shows the position=0xdbbc2a00 is freed.

  (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
  $13 = {position = 0xdbbc2a00, generation = 0x2efd}

  0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
  0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
  0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
  0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
  0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
  0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
  0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
  0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
  0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
  0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
  0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
  0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
  0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
  0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
  0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
  0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
  0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878

In the reclaim path, try_to_free_pages() does not setup
sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
shrink_node().

In mem_cgroup_iter(), root is set to root_mem_cgroup because
sc->target_mem_cgroup is NULL.  It is possible to assign a memcg to
root_mem_cgroup.nodeinfo.iter in mem_cgroup_iter().

        try_to_free_pages
        	struct scan_control sc = {...}, target_mem_cgroup is 0x0;
        do_try_to_free_pages
        shrink_zones
        shrink_node
        	 mem_cgroup *root = sc->target_mem_cgroup;
        	 memcg = mem_cgroup_iter(root, NULL, &reclaim);
        mem_cgroup_iter()
        	if (!root)
        		root = root_mem_cgroup;
        	...

        	css = css_next_descendant_pre(css, &root->css);
        	memcg = mem_cgroup_from_css(css);
        	cmpxchg(&iter->position, pos, memcg);

My device uses memcg non-hierarchical mode.  When we release a memcg:
invalidate_reclaim_iterators() reaches only dead_memcg and its parents.
If non-hierarchical mode is used, invalidate_reclaim_iterators() never
reaches root_mem_cgroup.

  static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  {
        struct mem_cgroup *memcg = dead_memcg;

        for (; memcg; memcg = parent_mem_cgroup(memcg)
        ...
  }

So the use after free scenario looks like:

  CPU1						CPU2

  try_to_free_pages
  do_try_to_free_pages
  shrink_zones
  shrink_node
  mem_cgroup_iter()
      if (!root)
      	root = root_mem_cgroup;
      ...
      css = css_next_descendant_pre(css, &root->css);
      memcg = mem_cgroup_from_css(css);
      cmpxchg(&iter->position, pos, memcg);

        				invalidate_reclaim_iterators(memcg);
        				...
        				__mem_cgroup_free()
        					kfree(memcg);

  try_to_free_pages
  do_try_to_free_pages
  shrink_zones
  shrink_node
  mem_cgroup_iter()
      if (!root)
      	root = root_mem_cgroup;
      ...
      mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
      iter = &mz->iter[reclaim->priority];
      pos = READ_ONCE(iter->position);
      css_tryget(&pos->css) <- use after free

To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter
in invalidate_reclaim_iterators().

[cai@lca.pw: fix -Wparentheses compilation warning]
  Link: http://lkml.kernel.org/r/1564580753-17531-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190730015729.4406-1-miles.chen@mediatek.com
Fixes: 5ac8fb31ad ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Miles Chen <miles.chen@mediatek.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-08-25 10:52:56 +02:00
..
kasan kasan: fix shadow_size calculation error in kasan_module_alloc 2018-08-24 13:26:58 +02:00
backing-dev.c writeback: synchronize sync(2) against cgroup writeback membership switches 2019-06-11 12:23:41 +02:00
balloon_compaction.c virtio_balloon: fix race between migration and ballooning 2016-03-03 15:07:18 -08:00
bootmem.c bootmem: avoid freeing to bootmem after bootmem is done 2015-09-08 15:35:28 -07:00
cleancache.c cleancache: remove limit on the number of cleancache enabled filesystems 2015-04-14 16:49:03 -07:00
cma.c mm/cma.c: fail if fixed declaration can't be honored 2019-08-06 18:28:27 +02:00
cma.h mm: cma: mark cma_bitmap_maxno() inline in header 2015-08-14 15:56:32 -07:00
cma_debug.c mm/cma_debug.c: fix the break condition in cma_maxchunk_get() 2019-06-22 08:18:18 +02:00
compaction.c mm/compaction: pass only pageblock aligned range to pageblock_pfn_to_page 2018-01-17 09:35:26 +01:00
debug-pagealloc.c mm, hwpoison: fixup "mm: check the return value of lookup_page_ext for all call sites" 2017-11-24 11:26:29 +01:00
debug.c mm: get rid of vmacache_flush_all() entirely 2018-09-19 22:49:00 +02:00
dmapool.c mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd 2015-11-06 17:50:42 -08:00
early_ioremap.c mm/early_ioremap: Fix boot hang with earlyprintk=efi,keep 2018-02-25 11:03:41 +01:00
fadvise.c mm/fadvise.c: fix signed overflow UBSAN complaint 2018-09-15 09:40:38 +02:00
failslab.c mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM 2015-11-06 17:50:42 -08:00
filemap.c mm: filemap: avoid unnecessary calls to lock_page when waiting for IO to complete during a read 2018-05-26 08:48:54 +02:00
frame_vector.c mm: replace get_vaddr_frames() write/force parameters with gup_flags 2018-12-17 21:55:16 +01:00
frontswap.c frontswap: allow multiple backends 2015-06-24 17:49:45 -07:00
gup.c proc: do not access cmdline nor environ from file-backed areas 2018-12-17 21:55:17 +01:00
highmem.c
huge_memory.c mremap: properly flush TLB before releasing the page 2018-11-10 07:41:42 -08:00
hugetlb.c hugetlbfs: on restore reserve error path retain subpool reservation 2019-06-22 08:18:17 +02:00
hugetlb_cgroup.c mm: make compound_head() robust 2015-11-06 17:50:42 -08:00
hwpoison-inject.c hwpoison: use page_cgroup_ino for filtering by memcg 2015-09-10 13:29:01 -07:00
init-mm.c mm: Add a user_ns owner to mm_struct and fix ptrace permission checks 2017-01-06 11:16:11 +01:00
internal.h mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries 2017-08-11 09:08:50 -07:00
interval_tree.c
Kconfig mm: don't allow deferred pages with NEED_PER_CPU_KM 2018-05-26 08:48:55 +02:00
Kconfig.debug
kmemcheck.c
kmemleak-test.c
kmemleak.c mm/kmemleak.c: fix check for softirq context 2019-08-04 09:34:59 +02:00
ksm.c mm/ksm: fix interaction with THP 2018-05-30 07:49:08 +02:00
list_lru.c mm/list_lru.c: fix memory leak in __memcg_init_list_lru_node 2019-06-22 08:18:22 +02:00
maccess.c mm/maccess.c: actually return -EFAULT from strncpy_from_unsafe 2015-11-05 19:34:48 -08:00
madvise.c mm: madvise(MADV_DODUMP): allow hugetlbfs pages 2018-10-10 08:52:11 +02:00
Makefile media updates for v4.3-rc1 2015-09-11 16:42:39 -07:00
memblock.c mm: consider memblock reservations for deferred memory initialization sizing 2017-06-14 13:16:26 +02:00
memcontrol.c mm/memcontrol.c: fix use after free in mem_cgroup_iter() 2019-08-25 10:52:56 +02:00
memory-failure.c hwpoison, memcg: forcibly uncharge LRU pages 2018-01-31 12:06:09 +01:00
memory.c mm: replace access_remote_vm() write parameter with gup_flags 2018-12-17 21:55:17 +01:00
memory_hotplug.c mm, memory_hotplug: test_pages_in_a_zone do not pass the end of zone 2019-03-23 08:44:26 +01:00
mempolicy.c mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified 2019-04-27 09:33:47 +02:00
mempool.c mm/mempool: avoid KASAN marking mempool poison checks as use-after-free 2017-08-12 19:29:09 -07:00
memtest.c memtest: remove unused header files 2015-09-08 15:35:28 -07:00
migrate.c hugetlbfs: fix races and page leaks during migration 2019-03-23 08:44:23 +01:00
mincore.c mm/mincore.c: make mincore() more conservative 2019-06-11 12:23:36 +02:00
mlock.c mm: mlock: avoid increase mm->locked_vm on mlock() when already mlock2(,MLOCK_ONFAULT) 2018-12-13 09:21:33 +01:00
mm_init.c mm: meminit: remove mminit_verify_page_links 2015-06-30 19:44:56 -07:00
mmap.c coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping 2019-06-22 08:18:27 +02:00
mmu_context.c mm/mmu_context, sched/core: Fix mmu_context.h assumption 2017-12-25 14:22:09 +01:00
mmu_notifier.c mm/mmu_notifier: use hlist_add_head_rcu() 2019-08-04 09:34:59 +02:00
mmzone.c
mprotect.c x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings 2018-08-15 17:42:10 +02:00
mremap.c mremap: properly flush TLB before releasing the page 2018-11-10 07:41:42 -08:00
msync.c mm/msync: use offset_in_page macro 2015-11-05 19:34:48 -08:00
nobootmem.c mm: page_alloc: pass PFN to __free_pages_bootmem 2015-06-30 19:44:55 -07:00
nommu.c mm: replace access_remote_vm() write parameter with gup_flags 2018-12-17 21:55:17 +01:00
oom_kill.c mm, oom: fix use-after-free in oom_kill_process 2019-02-06 19:43:07 +01:00
page-writeback.c mm/page-writeback.c: don't break integrity writeback on ->writepage() error 2019-01-26 09:42:55 +01:00
page_alloc.c mm, page_alloc: do not break __GFP_THISNODE by zonelist reset 2018-07-11 16:03:51 +02:00
page_counter.c mm: page_counter: let page_counter_try_charge() return bool 2015-11-05 19:34:48 -08:00
page_ext.c mm/page_ext.c: fix an imbalance with kmemleak 2019-04-27 09:33:48 +02:00
page_idle.c mm/page_idle.c: fix oops because end_pfn is larger than max_pfn 2019-07-10 09:56:30 +02:00
page_io.c fs: use helper bio_add_page() instead of open coding on bi_io_vec 2015-08-13 12:32:00 -06:00
page_isolation.c mm: fix invalid node in alloc_migrate_target() 2016-04-20 15:41:53 +09:00
page_owner.c mm: check the return value of lookup_page_ext for all call sites 2017-11-24 08:32:25 +01:00
pagewalk.c mm/pagewalk.c: report holes in hugetlb ranges 2017-11-24 08:32:25 +01:00
percpu-km.c
percpu-vm.c
percpu.c percpu: include linux/sched.h for cond_resched() 2018-05-16 10:06:46 +02:00
pgtable-generic.c mm,thp: khugepaged: call pte flush at the time of collapse 2016-02-25 12:01:23 -08:00
process_vm_access.c mm: replace get_user_pages_unlocked() write/force parameters with gup_flags 2018-12-17 21:55:16 +01:00
quicklist.c
readahead.c mm, fs: introduce mapping_gfp_constraint() 2015-11-06 17:50:42 -08:00
rmap.c mm/rmap: replace BUG_ON(anon_vma->degree) with VM_WARN_ON 2019-04-03 06:23:18 +02:00
shmem.c tmpfs: fix uninitialized return value in shmem_link 2019-03-23 08:44:34 +01:00
slab.c mm/slab.c: kmemleak no scan alien caches 2019-04-27 09:33:48 +02:00
slab.h slab/slub: adjust kmem_cache_alloc_bulk API 2015-11-22 11:58:44 -08:00
slab_common.c slub: do not merge cache if slub_debug contains a never-merge flag 2017-10-21 17:09:05 +02:00
slob.c slab/slub: adjust kmem_cache_alloc_bulk API 2015-11-22 11:58:44 -08:00
slub.c slub: make ->cpu_partial unsigned int 2018-10-10 08:52:08 +02:00
sparse-vmemmap.c
sparse.c
swap.c mm: make compound_head() robust 2015-11-06 17:50:42 -08:00
swap_cgroup.c mm, swap_cgroup: reschedule when neeed in swap_cgroup_swapoff() 2017-07-05 14:37:15 +02:00
swap_state.c mm: swap: zswap: maybe_preload & refactoring 2015-09-08 15:35:28 -07:00
swapfile.c x86/speculation/l1tf: Limit swap file size to MAX_PA/2 2018-08-15 17:42:10 +02:00
truncate.c mm: cleancache: fix corruption on missed inode invalidation 2018-12-13 09:21:33 +01:00
userfaultfd.c userfaultfd: avoid mmap_sem read recursion in mcopy_atomic 2015-09-04 16:54:41 -07:00
util.c mm: replace get_user_pages_unlocked() write/force parameters with gup_flags 2018-12-17 21:55:16 +01:00
vmacache.c mm: get rid of vmacache_flush_all() entirely 2018-09-19 22:49:00 +02:00
vmalloc.c mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy() 2019-08-25 10:52:44 +02:00
vmpressure.c mm: vmpressure: fix sending wrong events on underflow 2017-03-12 06:37:25 +01:00
vmscan.c mm: fix the NULL mapping case in __isolate_lru_page() 2018-06-06 16:46:23 +02:00
vmstat.c mm, vmstat: make quiet_vmstat lighter 2019-08-04 09:35:01 +02:00
workingset.c mm: workingset: fix crash in shadow node shrinker caused by replace_page_cache_page() 2016-10-28 03:01:34 -04:00
zbud.c mm: zsmalloc: constify struct zs_pool name 2015-11-06 17:50:42 -08:00
zpool.c mm: zsmalloc: constify struct zs_pool name 2015-11-06 17:50:42 -08:00
zsmalloc.c zsmalloc: fix zs_can_compact() integer overflow 2016-05-18 17:06:44 -07:00
zswap.c zswap: re-check zswap_is_full() after do zswap_shrink() 2018-09-05 09:18:36 +02:00