In entry_64.S we have code like this:
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
2:
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
With this instruction:
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
we unconditionally switch from whatever our CR3 was to the kernel page table.
But in arch/x86/platform/efi/efi_64.c we temporarily install a different
page table, one that does not have a kernel page table at a 0x1000 offset
from it. See efi_thunk() and efi_thunk_set_virtual_address_map().
So, while CR3 points to the other page table, we get an NMI interrupt,
and clear 0x1000 from CR3, resulting in a bogus CR3 if the 0x1000 bit was
set.
The efi page table comes from realmode/rm/trampoline_64.S:
arch/x86/realmode/rm/trampoline_64.S
141 .bss
142 .balign PAGE_SIZE
143 GLOBAL(trampoline_pgd) .space PAGE_SIZE
Notice: the alignment is PAGE_SIZE, so after applying KAISER_SHADOW_PGD_OFFSET,
which is equal to PAGE_SIZE, we can get a different page table.
But even if we fix the alignment here, the trampoline binary is later copied
into memory that is dynamically allocated in reserve_real_mode(), so we need
to fix that place as well.
Fixes: 8a43ddfb93 ("KAISER: Kernel Address Isolation")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
124 lines
3.4 KiB
C
124 lines
3.4 KiB
C
#include <linux/io.h>
|
|
#include <linux/memblock.h>
|
|
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/realmode.h>
|
|
#include <asm/kaiser.h>
|
|
|
|
/*
 * Kernel-virtual pointer to the start of the low-memory region holding the
 * real-mode blob; assigned in reserve_real_mode().
 */
struct real_mode_header *real_mode_header;
|
|
/*
 * Points at the cr4 field of the trampoline header; assigned in
 * setup_real_mode() on the non-CONFIG_X86_32 path only.
 */
u32 *trampoline_cr4_features;
|
|
|
|
/*
 * Reserve low memory for the real-mode trampoline blob and publish its
 * kernel-virtual address through real_mode_header.
 *
 * The region is as large as the page-aligned blob, is searched for in the
 * physical range [0, 1M) and is requested with KAISER_KERNEL_PGD_ALIGNMENT
 * alignment. The blob carries trampoline_pgd, which can end up in CR3
 * (e.g. around EFI thunk calls); the NMI entry path masks
 * KAISER_SHADOW_PGD_OFFSET out of CR3 unconditionally, so the copy of the
 * blob must not be placed where that masking would yield a different page.
 *
 * Panics if no suitable range can be found.
 */
void __init reserve_real_mode(void)
{
	const size_t blob_size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
	phys_addr_t phys;
	unsigned char *virt;

	/* Real-mode AP code can only be executed from below the 1M boundary. */
	phys = memblock_find_in_range(0, 1 << 20, blob_size,
				      KAISER_KERNEL_PGD_ALIGNMENT);
	if (phys == 0)
		panic("Cannot allocate trampoline\n");

	memblock_reserve(phys, blob_size);

	virt = __va(phys);
	real_mode_header = (struct real_mode_header *)virt;

	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
	       virt, (unsigned long long)phys, blob_size);
}
|
|
|
|
/*
 * Copy the real-mode blob into the area set up by reserve_real_mode(),
 * apply its relocations and fill in the trampoline header.
 *
 * Steps, in order:
 *   1. memcpy() the page-aligned blob to the address in real_mode_header.
 *   2. Walk real_mode_relocs, which is read as: a count followed by that
 *      many offsets of 16-bit slots (each receives the real-mode segment
 *      of the copy), then a count followed by that many offsets of 32-bit
 *      slots (each has the physical base of the copy added to it).
 *   3. Fill in the trampoline header: start address and GDT limit/base on
 *      CONFIG_X86_32; otherwise EFER (with LMA masked), start address,
 *      CR4 and entries 0 and 511 of the trampoline PGD.
 */
void __init setup_real_mode(void)
{
	size_t blob_size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
	unsigned char *dst = (unsigned char *)real_mode_header;
	unsigned long dst_phys;
	u16 dst_seg;
	const u32 *reloc;
	u32 nr, i;
	struct trampoline_header *th;
#ifdef CONFIG_X86_64
	u64 *pgd;
	u64 efer;
#endif

	memcpy(dst, real_mode_blob, blob_size);

	dst_phys = __pa(dst);
	dst_seg = dst_phys >> 4;

	reloc = (u32 *)real_mode_relocs;

	/* 16-bit segment relocations: store the segment at every listed offset. */
	nr = *reloc++;
	for (i = 0; i < nr; i++) {
		u16 *seg = (u16 *)(dst + *reloc++);

		*seg = dst_seg;
	}

	/* 32-bit linear relocations: rebase every listed slot by the physical base. */
	nr = *reloc++;
	for (i = 0; i < nr; i++) {
		u32 *lin = (u32 *)(dst + *reloc++);

		*lin += dst_phys;
	}

	/*
	 * Must be performed *after* relocation: the header fields read below
	 * are only meaningful once the relocations have been applied.
	 */
	th = (struct trampoline_header *)
		__va(real_mode_header->trampoline_header);

#ifdef CONFIG_X86_32
	th->start = __pa_symbol(startup_32_smp);
	th->gdt_limit = __BOOT_DS + 7;
	th->gdt_base = __pa_symbol(boot_gdt);
#else
	/*
	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR,
	 * so hand the trampoline an EFER value with LMA masked out.
	 */
	rdmsrl(MSR_EFER, efer);
	th->efer = efer & ~EFER_LMA;

	th->start = (u64)secondary_startup_64;

	/* Publish the header's cr4 slot and seed it with the current CR4. */
	trampoline_cr4_features = &th->cr4;
	*trampoline_cr4_features = __read_cr4();

	/*
	 * Populate two trampoline PGD entries from init_level4_pgt: slot 0
	 * gets the entry that covers __PAGE_OFFSET, slot 511 is copied as is.
	 */
	pgd = (u64 *)__va(real_mode_header->trampoline_pgd);
	pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
	pgd[511] = init_level4_pgt[511].pgd;
#endif
}
|
|
|
|
/*
|
|
* reserve_real_mode() gets called very early, to guarantee the
|
|
* availability of low memory. This is before the proper kernel page
|
|
* tables are set up, so we cannot set page permissions in that
|
|
* function. Also trampoline code will be executed by APs so we
|
|
* need to mark it executable at do_pre_smp_initcalls() at least,
|
|
* thus run it as a early_initcall().
|
|
*/
|
|
static int __init set_real_mode_permissions(void)
|
|
{
|
|
unsigned char *base = (unsigned char *) real_mode_header;
|
|
size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
|
|
|
|
size_t ro_size =
|
|
PAGE_ALIGN(real_mode_header->ro_end) -
|
|
__pa(base);
|
|
|
|
size_t text_size =
|
|
PAGE_ALIGN(real_mode_header->ro_end) -
|
|
real_mode_header->text_start;
|
|
|
|
unsigned long text_start =
|
|
(unsigned long) __va(real_mode_header->text_start);
|
|
|
|
set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
|
|
set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
|
|
set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
|
|
|
|
return 0;
|
|
}
|
|
/* Register set_real_mode_permissions() to run as an early initcall. */
early_initcall(set_real_mode_permissions);
|