android_kernel_oneplus_msm8998/arch/x86/include/asm/pgtable_64.h
Andi Kleen 9ee2d2da67 x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation
commit 6b28baca9b1f0d4a42b865da7a05b1c81424bd5c upstream

When PTEs are set to PROT_NONE the kernel just clears the Present bit and
preserves the PFN, which creates attack surface for L1TF speculation
speculation attacks.

This is important inside guests, because L1TF speculation bypasses physical
page remapping. While the host has its own migitations preventing leaking
data from other VMs into the guest, this would still risk leaking the wrong
page inside the current guest.

This uses the same technique as Linus' swap entry patch: while an entry is
is in PROTNONE state invert the complete PFN part part of it. This ensures
that the the highest bit will point to non existing memory.

The invert is done by pte/pmd_modify and pfn/pmd/pud_pte for PROTNONE and
pte/pmd/pud_pfn undo it.

This assume that no code path touches the PFN part of a PTE directly
without using these primitives.

This doesn't handle the case that MMIO is on the top of the CPU physical
memory. If such an MMIO region was exposed by an unpriviledged driver for
mmap it would be possible to attack some real memory.  However this
situation is all rather unlikely.

For 32bit non PAE the inversion is not done because there are really not
enough bits to protect anything.

Q: Why does the guest need to be protected when the HyperVisor already has
   L1TF mitigations?

A: Here's an example:

   Physical pages 1 2 get mapped into a guest as
   GPA 1 -> PA 2
   GPA 2 -> PA 1
   through EPT.

   The L1TF speculation ignores the EPT remapping.

   Now the guest kernel maps GPA 1 to process A and GPA 2 to process B, and
   they belong to different users and should be isolated.

   A sets the GPA 1 PA 2 PTE to PROT_NONE to bypass the EPT remapping and
   gets read access to the underlying physical page. Which in this case
   points to PA 2, so it can read process B's data, if it happened to be in
   L1, so isolation inside the guest is broken.

   There's nothing the hypervisor can do about this. This mitigation has to
   be done in the guest itself.

[ tglx: Massaged changelog ]
[ dwmw2: backported to 4.9 ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-08-15 17:42:09 +02:00

242 lines
6.3 KiB
C

#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H
#include <linux/const.h>
#include <asm/pgtable_64_types.h>
#ifndef __ASSEMBLY__
/*
* This file contains the functions and defines necessary to modify and use
* the x86-64 page table tree.
*/
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
extern void paging_init(void);
#define pte_ERROR(e) \
pr_err("%s:%d: bad pte %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
struct mm_struct;
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
*ptep = native_make_pte(0);
}
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
native_set_pte(ptep, pte);
}
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
*pmdp = pmd;
}
static inline void native_pmd_clear(pmd_t *pmd)
{
native_set_pmd(pmd, native_make_pmd(0));
}
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
return native_make_pte(xchg(&xp->pte, 0));
#else
/* native_local_ptep_get_and_clear,
but duplicated because of cyclic dependency */
pte_t ret = *xp;
native_pte_clear(NULL, 0, xp);
return ret;
#endif
}
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
return native_make_pmd(xchg(&xp->pmd, 0));
#else
/* native_local_pmdp_get_and_clear,
but duplicated because of cyclic dependency */
pmd_t ret = *xp;
native_pmd_clear(xp);
return ret;
#endif
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
*pudp = pud;
}
static inline void native_pud_clear(pud_t *pud)
{
native_set_pud(pud, native_make_pud(0));
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
#ifdef CONFIG_DEBUG_VM
/* linux/mmdebug.h may not have been included at this point */
BUG_ON(!kaiser_enabled);
#endif
return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
{
native_set_pgd(pgd, native_make_pgd(0));
}
extern void sync_global_pgds(unsigned long start, unsigned long end,
int removed);
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
/*
* Level 4 access.
*/
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
/* PMD - Level 2 access */
/* PTE - Level 1 access. */
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))/* NOP */
/*
* Encode and de-code a swap entry
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
* | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
* there. We also need to avoid using A and D because of an
* erratum where they can be incorrectly set by hardware on
* non-present PTEs.
*
* SD (1) in swp entry is used to store soft dirty bit, which helps us
* remember soft dirty over page migration
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
*
* The offset is inverted by a binary not operation to make the high
* physical bits set.
*/
#define SWP_TYPE_BITS 5
#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
/* Shift up (to get rid of type), then down to get value */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
/*
* Shift the offset up "too far" by TYPE bits, then down again
* The offset is inverted by a binary not operation to make the high
* physical bits set.
*/
#define __swp_entry(type, offset) ((swp_entry_t) { \
(~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
| ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#define pgtable_cache_init() do { } while (0)
#define check_pgt_cache() do { } while (0)
#define PAGE_AGP PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
#define vmemmap ((struct page *)VMEMMAP_START)
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
#include <asm/pgtable-invert.h>
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */