In entry_64.S we have code like this:
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
2:
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
With this instruction:
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
we unconditionally switch from whatever CR3 we had to the kernel page table.
But in arch/x86/platform/efi/efi_64.c we temporarily load a different page
table, one that does not have the kernel page table at a 0x1000 offset from
it; see efi_thunk() and efi_thunk_set_virtual_address_map().
So, if an NMI arrives while CR3 points to that other page table, we clear
the 0x1000 bit from CR3 and end up with a bogus CR3 whenever that bit
happened to be set.
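To make the failure concrete, here is a small user-space sketch of that
arithmetic (the trampoline_pgd address is made up; the constant values follow
the comment in the excerpt above: 12 PCID bits plus the 0x1000 "user" bit):

#include <stdint.h>
#include <stdio.h>

#define X86_CR3_PCID_ASID_MASK    0xfffULL   /* low 12 PCID/ASID bits */
#define KAISER_SHADOW_PGD_OFFSET  0x1000ULL  /* "user" bit of the pgd address */

int main(void)
{
	/* hypothetical physical address of the EFI trampoline_pgd, bit 12 set */
	uint64_t efi_cr3 = 0x9b000;

	/* the same masking the NMI entry path applies to %cr3 */
	uint64_t after = efi_cr3 & ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);

	/* prints 0x9b000 -> 0x9a000: CR3 now points one page before the real pgd */
	printf("%#llx -> %#llx\n",
	       (unsigned long long)efi_cr3, (unsigned long long)after);
	return 0;
}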
The EFI page table comes from arch/x86/realmode/rm/trampoline_64.S:
141 .bss
142 .balign PAGE_SIZE
143 GLOBAL(trampoline_pgd) .space PAGE_SIZE
Notice: the alignment is PAGE_SIZE, so after masking off
KAISER_SHADOW_PGD_OFFSET, which equals PAGE_SIZE, we can end up pointing at a
different page table.
But even if we fix the alignment here, the trampoline binary is later copied
into dynamically allocated memory in reserve_real_mode(), so we need to fix
that place as well.
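The required alignment is what KAISER_KERNEL_PGD_ALIGNMENT in kaiser.h below
expresses: KAISER_SHADOW_PGD_OFFSET << 1, i.e. 0x2000. A minimal sketch of the
invariant, with another made-up address: a pgd aligned to 0x2000 has bit 12
clear, so the NMI-entry mask cannot move it; both the static trampoline_pgd
placement and the copy allocated in reserve_real_mode() need that alignment.

#include <assert.h>
#include <stdint.h>

#define KAISER_SHADOW_PGD_OFFSET     0x1000ULL
#define KAISER_KERNEL_PGD_ALIGNMENT  (KAISER_SHADOW_PGD_OFFSET << 1)  /* 0x2000 */

int main(void)
{
	/* hypothetical pgd physical address, aligned to 0x2000 */
	uint64_t pgd = 0x9a000;

	/* 0x2000 alignment guarantees that bit 12 (0x1000) is clear ... */
	assert((pgd & (KAISER_KERNEL_PGD_ALIGNMENT - 1)) == 0);

	/* ... so masking off KAISER_SHADOW_PGD_OFFSET leaves the pgd unchanged */
	assert((pgd & ~KAISER_SHADOW_PGD_OFFSET) == pgd);
	return 0;
}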
Fixes: 8a43ddfb93 ("KAISER: Kernel Address Isolation")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
arch/x86/include/asm/kaiser.h:

#ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H

#include <uapi/asm/processor-flags.h> /* For PCID constants */

/*
 * This file includes the definitions for the KAISER feature.
 * KAISER is a counter measure against x86_64 side channel attacks on
 * the kernel virtual memory. It has a shadow pgd for every process: the
 * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
 * user memory. Within a kernel context switch, or when an interrupt is handled,
 * the pgd is switched to the normal one. When the system switches to user mode,
 * the shadow pgd is enabled. By this, the virtual memory caches are freed,
 * and the user may not attack the whole kernel memory.
 *
 * A minimalistic kernel mapping holds the parts needed to be mapped in user
 * mode, such as the entry/exit functions of the user space, or the stacks.
 */

#define KAISER_SHADOW_PGD_OFFSET 0x1000

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * A page table address must have this alignment to stay the same when
 * KAISER_SHADOW_PGD_OFFSET mask is applied
 */
#define KAISER_KERNEL_PGD_ALIGNMENT	(KAISER_SHADOW_PGD_OFFSET << 1)
#else
#define KAISER_KERNEL_PGD_ALIGNMENT	PAGE_SIZE
#endif

#ifdef __ASSEMBLY__
#ifdef CONFIG_PAGE_TABLE_ISOLATION

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg regb
/*
 * regb must be the low byte portion of reg: because we have arranged
 * for the low byte of the user PCID to serve as the high byte of NOFLUSH
 * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
 * not enabled): so that the one register can update both memory and cr3.
 */
movq %cr3, \reg
orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
js 9f
/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
9:
movq \reg, %cr3
.endm

.macro SWITCH_KERNEL_CR3
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
popq %rax
8:
.endm

.macro SWITCH_USER_CR3
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_USER_CR3 %rax %al
popq %rax
8:
.endm

.macro SWITCH_KERNEL_CR3_NO_STACK
ALTERNATIVE "jmp 8f", \
	__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
	X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
8:
.endm

#else /* CONFIG_PAGE_TABLE_ISOLATION */

.macro SWITCH_KERNEL_CR3
.endm
.macro SWITCH_USER_CR3
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm

#endif /* CONFIG_PAGE_TABLE_ISOLATION */

#else /* __ASSEMBLY__ */

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * Upon kernel/user mode switch, it may happen that the address
 * space has to be switched before the registers have been
 * stored. To change the address space, another register is
 * needed. A register therefore has to be stored/restored.
 */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);

extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];

extern int kaiser_enabled;
extern void __init kaiser_check_boottime_disable(void);
#else
#define kaiser_enabled	0
static inline void __init kaiser_check_boottime_disable(void) {}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */

/*
 * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
 * so as to build with tests on kaiser_enabled instead of #ifdefs.
 */

/**
 * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
 * @addr: the start address of the range
 * @size: the size of the range
 * @flags: The mapping flags of the pages
 *
 * The mapping is done on a global scope, so no bigger
 * synchronization has to be done. the pages have to be
 * manually unmapped again when they are not needed any longer.
 */
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);

/**
 * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
 * @addr: the start address of the range
 * @size: the size of the range
 */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
 * kaiser_init - Initialize the shadow mapping
 *
 * Most parts of the shadow mapping can be mapped upon boot
 * time. Only per-process things like the thread stacks
 * or a new LDT have to be mapped at runtime. These boot-
 * time mappings are permanent and never unmapped.
 */
extern void kaiser_init(void);

#endif /* __ASSEMBLY */

#endif /* _ASM_X86_KAISER_H */
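For reference, a hypothetical example (not part of this patch) of how the C
interface above is meant to be used: a page-aligned kernel object that must
stay reachable while the shadow pgd is active gets mirrored into the user
mapping with kaiser_add_mapping() and dropped again with
kaiser_remove_mapping(). The object name here is made up; __PAGE_KERNEL is the
normal kernel page protection.

#include <linux/init.h>
#include <asm/kaiser.h>
#include <asm/pgtable_types.h>	/* __PAGE_KERNEL */

/* hypothetical page-aligned object the entry code must reach in user mode */
static char example_area[PAGE_SIZE] __aligned(PAGE_SIZE);

static int __init example_map(void)
{
	if (!kaiser_enabled)
		return 0;

	/* mirror the page into every process's shadow (user) page tables */
	return kaiser_add_mapping((unsigned long)example_area, PAGE_SIZE,
				  __PAGE_KERNEL);
}

static void example_unmap(void)
{
	if (kaiser_enabled)
		kaiser_remove_mapping((unsigned long)example_area, PAGE_SIZE);
}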