android_kernel_oneplus_msm8998/mm/mprotect.c
Colin Cross 586278d78b mm: add a field to store names for private anonymous memory
Userspace processes often have multiple allocators that each do
anonymous mmaps to get memory.  When examining memory usage of
individual processes or systems as a whole, it is useful to be
able to break down the various heaps that were allocated by
each layer and examine their size, RSS, and physical memory
usage.

This patch adds a user pointer to the shared union in
vm_area_struct that points to a null terminated string inside
the user process containing a name for the vma.  vmas that
point to the same address will be merged, but vmas that
point to equivalent strings at different addresses will
not be merged.

Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it.

The names of named anonymous vmas are shown in /proc/pid/maps
as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
that is only present for named vmas.  If the userspace pointer
is no longer valid all or part of the name will be replaced
with "<fault>".

The idea to store a userspace pointer to reduce the complexity
within mm (at the expense of the complexity of reading
/proc/pid/mem) came from Dave Hansen.  This results in no
runtime overhead in the mm subsystem other than comparing
the anon_name pointers when considering vma merging.  The pointer
is stored in a union with fieds that are only used on file-backed
mappings, so it does not increase memory usage.

Includes fix from Jed Davis <jld@mozilla.com> for typo in
prctl_set_vma_anon_name, which could attempt to set the name
across two vmas at the same time due to a typo, which might
corrupt the vma list.  Fix it to use tmp instead of end to limit
the name setting to a single vma at a time.

Change-Id: I9aa7b6b5ef536cd780599ba4e2fba8ceebe8b59f
Signed-off-by: Dmitry Shmidt <dimitrysh@google.com>
2016-02-16 13:54:13 -08:00

445 lines
11 KiB
C

/*
* mm/mprotect.c
*
* (C) Copyright 1994 Linus Torvalds
* (C) Copyright 2002 Christoph Hellwig
*
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include "internal.h"
/*
* For a prot_numa update we only hold mmap_sem for read so there is a
* potential race with faulting where a pmd was temporarily none. This
* function checks for a transhuge pmd under the appropriate lock. It
* returns a pte if it was successfully locked or NULL if it raced with
* a transhuge insertion.
*/
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, int prot_numa, spinlock_t **ptl)
{
pte_t *pte;
spinlock_t *pmdl;
/* !prot_numa is protected by mmap_sem held for write */
if (!prot_numa)
return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
pmdl = pmd_lock(vma->vm_mm, pmd);
if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
spin_unlock(pmdl);
return NULL;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
spin_unlock(pmdl);
return pte;
}
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
if (!pte)
return 0;
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
bool preserve_write = prot_numa && pte_write(oldpte);
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
if (!page || PageKsm(page))
continue;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
}
ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
if (preserve_write)
ptent = pte_mkwrite(ptent);
/* Avoid taking write faults for known dirty pages */
if (dirty_accountable && pte_dirty(ptent) &&
(pte_soft_dirty(ptent) ||
!(vma->vm_flags & VM_SOFTDIRTY))) {
ptent = pte_mkwrite(ptent);
}
ptep_modify_prot_commit(mm, addr, pte, ptent);
pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
pte_t newpte;
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
set_pte_at(mm, addr, pte, newpte);
pages++;
}
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return pages;
}
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
unsigned long mni_start = 0;
pmd = pmd_offset(pud, addr);
do {
unsigned long this_pages;
next = pmd_addr_end(addr, end);
if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
continue;
/* invoke the mmu notifier if the pmd is populated */
if (!mni_start) {
mni_start = addr;
mmu_notifier_invalidate_range_start(mm, mni_start, end);
}
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
if (nr_ptes) {
if (nr_ptes == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
/* huge pmd was handled */
continue;
}
}
/* fall through, the trans huge pmd just split */
}
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
dirty_accountable, prot_numa);
pages += this_pages;
} while (pmd++, addr = next, addr != end);
if (mni_start)
mmu_notifier_invalidate_range_end(mm, mni_start, end);
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
}
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
pgd_t *pgd, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(vma, pud, addr, next, newprot,
dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
return pages;
}
static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_pud_range(vma, pgd, addr, next, newprot,
dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
clear_tlb_flush_pending(mm);
return pages;
}
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
unsigned long pages;
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
return pages;
}
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
pgoff_t pgoff;
int error;
int dirty_accountable = 0;
if (newflags == oldflags) {
*pprev = vma;
return 0;
}
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again. hugetlb mapping were accounted for
* even if read-only so there is no need to account for them here
*/
if (newflags & VM_WRITE) {
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
newflags |= VM_ACCOUNT;
}
}
/*
* First try to merge with previous and/or next vma.
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*pprev) {
vma = *pprev;
goto success;
}
*pprev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
goto fail;
}
if (end != vma->vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
goto fail;
}
success:
/*
* vm_flags and vm_page_prot are protected by the mmap_sem
* held in write mode.
*/
vma->vm_flags = newflags;
dirty_accountable = vma_wants_writenotify(vma);
vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
* fault on access.
*/
if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
(newflags & VM_WRITE)) {
populate_vma_page_range(vma, start, end, NULL);
}
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
return 0;
fail:
vm_unacct_memory(charged);
return error;
}
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
unsigned long vm_flags, nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
if (start & ~PAGE_MASK)
return -EINVAL;
if (!len)
return 0;
len = PAGE_ALIGN(len);
end = start + len;
if (end <= start)
return -ENOMEM;
if (!arch_validate_prot(prot))
return -EINVAL;
reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC:
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, start);
error = -ENOMEM;
if (!vma)
goto out;
prev = vma->vm_prev;
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
start = vma->vm_start;
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
} else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
end = vma->vm_end;
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSUP))
goto out;
}
}
if (start > vma->vm_start)
prev = vma;
for (nstart = start ; ; ) {
unsigned long newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = vm_flags;
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
error = -EACCES;
goto out;
}
error = security_file_mprotect(vma, reqprot, prot);
if (error)
goto out;
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
nstart = tmp;
if (nstart < prev->vm_end)
nstart = prev->vm_end;
if (nstart >= end)
goto out;
vma = prev->vm_next;
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
goto out;
}
}
out:
up_write(&current->mm->mmap_sem);
return error;
}