This resolves a crash if loaded under qemu + haxm under windows. See https://www.spinics.net/lists/kernel/msg2689835.html for details. Here is a boot log (the log is from chromeos-4.4, but Tao Wu says that the same log is also seen with vanilla v4.4.110-rc1). [ 0.712750] Freeing unused kernel memory: 552K [ 0.721821] init: Corrupted page table at address 57b029b332e0 [ 0.722761] PGD 80000000bb238067 PUD bc36a067 PMD bc369067 PTE 45d2067 [ 0.722761] Bad pagetable: 000b [#1] PREEMPT SMP [ 0.722761] Modules linked in: [ 0.722761] CPU: 1 PID: 1 Comm: init Not tainted 4.4.96 #31 [ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 0.722761] task: ffff8800bc290000 ti: ffff8800bc28c000 task.ti: ffff8800bc28c000 [ 0.722761] RIP: 0010:[<ffffffff83f4129e>] [<ffffffff83f4129e>] __clear_user+0x42/0x67 [ 0.722761] RSP: 0000:ffff8800bc28fcf8 EFLAGS: 00010202 [ 0.722761] RAX: 0000000000000000 RBX: 00000000000001a4 RCX: 00000000000001a4 [ 0.722761] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000057b029b332e0 [ 0.722761] RBP: ffff8800bc28fd08 R08: ffff8800bc290000 R09: ffff8800bb2f4000 [ 0.722761] R10: ffff8800bc290000 R11: ffff8800bb2f4000 R12: 000057b029b332e0 [ 0.722761] R13: 0000000000000000 R14: 000057b029b33340 R15: ffff8800bb1e2a00 [ 0.722761] FS: 0000000000000000(0000) GS:ffff8800bfb00000(0000) knlGS:0000000000000000 [ 0.722761] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 0.722761] CR2: 000057b029b332e0 CR3: 00000000bb2f8000 CR4: 00000000000006e0 [ 0.722761] Stack: [ 0.722761] 000057b029b332e0 ffff8800bb95fa80 ffff8800bc28fd18 ffffffff83f4120c [ 0.722761] ffff8800bc28fe18 ffffffff83e9e7a1 ffff8800bc28fd68 0000000000000000 [ 0.722761] ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 [ 0.722761] Call Trace: [ 0.722761] [<ffffffff83f4120c>] clear_user+0x2e/0x30 [ 0.722761] [<ffffffff83e9e7a1>] load_elf_binary+0xa7f/0x18f7 [ 0.722761] 
[<ffffffff83de2088>] search_binary_handler+0x86/0x19c [ 0.722761] [<ffffffff83de389e>] do_execveat_common.isra.26+0x909/0xf98 [ 0.722761] [<ffffffff844febe0>] ? rest_init+0x87/0x87 [ 0.722761] [<ffffffff83de40be>] do_execve+0x23/0x25 [ 0.722761] [<ffffffff83c002e3>] run_init_process+0x2b/0x2d [ 0.722761] [<ffffffff844fec4d>] kernel_init+0x6d/0xda [ 0.722761] [<ffffffff84505b2f>] ret_from_fork+0x3f/0x70 [ 0.722761] [<ffffffff844febe0>] ? rest_init+0x87/0x87 [ 0.722761] Code: 86 84 be 12 00 00 00 e8 87 0d e8 ff 66 66 90 48 89 d8 48 c1 eb 03 4c 89 e7 83 e0 07 48 89 d9 be 08 00 00 00 31 d2 48 85 c9 74 0a <48> 89 17 48 01 f7 ff c9 75 f6 48 89 c1 85 c9 74 09 88 17 48 ff [ 0.722761] RIP [<ffffffff83f4129e>] __clear_user+0x42/0x67 [ 0.722761] RSP <ffff8800bc28fcf8> [ 0.722761] ---[ end trace def703879b4ff090 ]--- [ 0.722761] BUG: sleeping function called from invalid context at /mnt/host/source/src/third_party/kernel/v4.4/kernel/locking/rwsem.c:21 [ 0.722761] in_atomic(): 0, irqs_disabled(): 1, pid: 1, name: init [ 0.722761] CPU: 1 PID: 1 Comm: init Tainted: G D 4.4.96 #31 [ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 0.722761] 0000000000000086 dcb5d76098c89836 ffff8800bc28fa30 ffffffff83f34004 [ 0.722761] ffffffff84839dc2 0000000000000015 ffff8800bc28fa40 ffffffff83d57dc9 [ 0.722761] ffff8800bc28fa68 ffffffff83d57e6a ffffffff84a53640 0000000000000000 [ 0.722761] Call Trace: [ 0.722761] [<ffffffff83f34004>] dump_stack+0x4d/0x63 [ 0.722761] [<ffffffff83d57dc9>] ___might_sleep+0x13a/0x13c [ 0.722761] [<ffffffff83d57e6a>] __might_sleep+0x9f/0xa6 [ 0.722761] [<ffffffff84502788>] down_read+0x20/0x31 [ 0.722761] [<ffffffff83cc5d9b>] __blocking_notifier_call_chain+0x35/0x63 [ 0.722761] [<ffffffff83cc5ddd>] blocking_notifier_call_chain+0x14/0x16 [ 0.800374] usb 1-1: new full-speed USB device number 2 using uhci_hcd [ 0.722761] [<ffffffff83cefe97>] profile_task_exit+0x1a/0x1c [ 
0.802309] [<ffffffff83cac84e>] do_exit+0x39/0xe7f [ 0.802309] [<ffffffff83ce5938>] ? vprintk_default+0x1d/0x1f [ 0.802309] [<ffffffff83d7bb95>] ? printk+0x57/0x73 [ 0.802309] [<ffffffff83c46e25>] oops_end+0x80/0x85 [ 0.802309] [<ffffffff83c7b747>] pgtable_bad+0x8a/0x95 [ 0.802309] [<ffffffff83ca7f4a>] __do_page_fault+0x8c/0x352 [ 0.802309] [<ffffffff83eefba5>] ? file_has_perm+0xc4/0xe5 [ 0.802309] [<ffffffff83ca821c>] do_page_fault+0xc/0xe [ 0.802309] [<ffffffff84507682>] page_fault+0x22/0x30 [ 0.802309] [<ffffffff83f4129e>] ? __clear_user+0x42/0x67 [ 0.802309] [<ffffffff83f4127f>] ? __clear_user+0x23/0x67 [ 0.802309] [<ffffffff83f4120c>] clear_user+0x2e/0x30 [ 0.802309] [<ffffffff83e9e7a1>] load_elf_binary+0xa7f/0x18f7 [ 0.802309] [<ffffffff83de2088>] search_binary_handler+0x86/0x19c [ 0.802309] [<ffffffff83de389e>] do_execveat_common.isra.26+0x909/0xf98 [ 0.802309] [<ffffffff844febe0>] ? rest_init+0x87/0x87 [ 0.802309] [<ffffffff83de40be>] do_execve+0x23/0x25 [ 0.802309] [<ffffffff83c002e3>] run_init_process+0x2b/0x2d [ 0.802309] [<ffffffff844fec4d>] kernel_init+0x6d/0xda [ 0.802309] [<ffffffff84505b2f>] ret_from_fork+0x3f/0x70 [ 0.802309] [<ffffffff844febe0>] ? rest_init+0x87/0x87 [ 0.830559] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 [ 0.830559] [ 0.831305] Kernel Offset: 0x2c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 0.831305] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 The crash part of this problem may be solved with the following patch (thanks to Hugh for the hint). There is still another problem, though - with this patch applied, the qemu session aborts with "VCPU Shutdown request", whatever that means. Cc: lepton <ytht.net@gmail.com> Signed-off-by: Guenter Roeck <groeck@chromium.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
456 lines
12 KiB
C
456 lines
12 KiB
C
#include <linux/bug.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
#include <linux/bug.h>
|
|
#include <linux/init.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/ftrace.h>
|
|
|
|
#undef pr_fmt
|
|
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
|
|
|
|
#include <asm/kaiser.h>
|
|
#include <asm/tlbflush.h> /* to verify its kaiser declarations */
|
|
#include <asm/pgtable.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/desc.h>
|
|
#include <asm/cmdline.h>
|
|
|
|
/* Runtime knob: non-zero when the KAISER shadow page tables are in use. */
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
|
|
|
|
/*
 * Per-cpu scratch slot, placed in the user-mapped per-cpu section so the
 * entry code can stash a register value while switching CR3.
 */
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
|
|
|
|
/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3. It would take
 * another register. So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
/* Per-cpu user CR3 payload (shadow pgd offset plus any PCID bits). */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
|
|
|
|
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes. No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte(). We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
|
|
|
|
/*
|
|
* Returns -1 on error.
|
|
*/
|
|
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
|
|
{
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
pte_t *pte;
|
|
|
|
pgd = pgd_offset_k(vaddr);
|
|
/*
|
|
* We made all the kernel PGDs present in kaiser_init().
|
|
* We expect them to stay that way.
|
|
*/
|
|
BUG_ON(pgd_none(*pgd));
|
|
/*
|
|
* PGDs are either 512GB or 128TB on all x86_64
|
|
* configurations. We don't handle these.
|
|
*/
|
|
BUG_ON(pgd_large(*pgd));
|
|
|
|
pud = pud_offset(pgd, vaddr);
|
|
if (pud_none(*pud)) {
|
|
WARN_ON_ONCE(1);
|
|
return -1;
|
|
}
|
|
|
|
if (pud_large(*pud))
|
|
return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
|
|
|
|
pmd = pmd_offset(pud, vaddr);
|
|
if (pmd_none(*pmd)) {
|
|
WARN_ON_ONCE(1);
|
|
return -1;
|
|
}
|
|
|
|
if (pmd_large(*pmd))
|
|
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
|
|
|
|
pte = pte_offset_kernel(pmd, vaddr);
|
|
if (pte_none(*pte)) {
|
|
WARN_ON_ONCE(1);
|
|
return -1;
|
|
}
|
|
|
|
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
|
|
}
|
|
|
|
/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page tables pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address)
{
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

        if (pgd_none(*pgd)) {
                WARN_ONCE(1, "All shadow pgds should have been populated");
                return NULL;
        }
        BUILD_BUG_ON(pgd_large(*pgd) != 0);

        pud = pud_offset(pgd, address);
        /* The shadow page tables do not use large mappings: */
        if (pud_large(*pud)) {
                WARN_ON(1);
                return NULL;
        }
        if (pud_none(*pud)) {
                unsigned long new_pmd_page = __get_free_page(gfp);
                if (!new_pmd_page)
                        return NULL;
                /*
                 * Allocate outside the lock, then re-check under it:
                 * another CPU may have installed a pmd page meanwhile,
                 * in which case ours is freed again.
                 */
                spin_lock(&shadow_table_allocation_lock);
                if (pud_none(*pud)) {
                        set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pmd_page), NR_KAISERTABLE);
                } else
                        free_page(new_pmd_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        pmd = pmd_offset(pud, address);
        /* The shadow page tables do not use large mappings: */
        if (pmd_large(*pmd)) {
                WARN_ON(1);
                return NULL;
        }
        if (pmd_none(*pmd)) {
                unsigned long new_pte_page = __get_free_page(gfp);
                if (!new_pte_page)
                        return NULL;
                /* Same allocate-then-recheck dance as at the pud level. */
                spin_lock(&shadow_table_allocation_lock);
                if (pmd_none(*pmd)) {
                        set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pte_page), NR_KAISERTABLE);
                } else
                        free_page(new_pte_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        return pte_offset_kernel(pmd, address);
}
|
|
|
|
/*
 * Map [__start_addr, __start_addr + size) into the shadow (user) page
 * tables with the given protection flags, one page at a time.  Each page
 * must already be present in the kernel page tables, from which the
 * physical address is taken.  Returns 0 on success, -EIO if a page is
 * not mapped in the kernel tables, -ENOMEM on allocation failure.
 */
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
                               unsigned long flags)
{
        int ret = 0;
        pte_t *pte;
        unsigned long start_addr = (unsigned long )__start_addr;
        unsigned long address = start_addr & PAGE_MASK;
        unsigned long end_addr = PAGE_ALIGN(start_addr + size);
        unsigned long target_address;

        /*
         * It is convenient for callers to pass in __PAGE_KERNEL etc,
         * and there is no actual harm from setting _PAGE_GLOBAL, so
         * long as CR4.PGE is not set. But it is nonetheless troubling
         * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
         * requires that not to be #defined to 0): so mask it off here.
         */
        flags &= ~_PAGE_GLOBAL;

        for (; address < end_addr; address += PAGE_SIZE) {
                target_address = get_pa_from_mapping(address);
                if (target_address == -1) {
                        ret = -EIO;
                        break;
                }
                pte = kaiser_pagetable_walk(address);
                if (!pte) {
                        ret = -ENOMEM;
                        break;
                }
                if (pte_none(*pte)) {
                        set_pte(pte, __pte(flags | target_address));
                } else {
                        /*
                         * Already mapped: build the pte we would have
                         * installed and warn if it differs from what is
                         * there (conflicting double mapping).
                         */
                        pte_t tmp;
                        set_pte(&tmp, __pte(flags | target_address));
                        WARN_ON_ONCE(!pte_same(*pte, tmp));
                }
        }
        return ret;
}
|
|
|
|
/*
 * Convenience wrapper: map the half-open range [start, end) into the
 * shadow page tables with the given flags.
 */
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
        return kaiser_add_user_map(start,
                                   (unsigned long)end - (unsigned long)start,
                                   flags);
}
|
|
|
|
/*
|
|
* Ensure that the top level of the (shadow) page tables are
|
|
* entirely populated. This ensures that all processes that get
|
|
* forked have the same entries. This way, we do not have to
|
|
* ever go set up new entries in older processes.
|
|
*
|
|
* Note: we never free these, so there are no updates to them
|
|
* after this.
|
|
*/
|
|
static void __init kaiser_init_all_pgds(void)
|
|
{
|
|
pgd_t *pgd;
|
|
int i = 0;
|
|
|
|
pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
|
|
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
|
|
pgd_t new_pgd;
|
|
pud_t *pud = pud_alloc_one(&init_mm,
|
|
PAGE_OFFSET + i * PGDIR_SIZE);
|
|
if (!pud) {
|
|
WARN_ON(1);
|
|
break;
|
|
}
|
|
inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
|
|
new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
|
|
/*
|
|
* Make sure not to stomp on some other pgd entry.
|
|
*/
|
|
if (!pgd_none(pgd[i])) {
|
|
WARN_ON(1);
|
|
continue;
|
|
}
|
|
set_pgd(pgd + i, new_pgd);
|
|
}
|
|
}
|
|
|
|
/*
 * Early-boot wrappers: warn (but do not fail the boot) if a shadow
 * mapping cannot be established.
 */
#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)
|
|
|
|
/*
 * Parse "pti=on/off/auto" and "nopti" from the kernel command line and
 * decide whether KAISER is enabled.  Under Xen PV the feature is
 * disabled silently (no "disabled" message is printed).
 */
void __init kaiser_check_boottime_disable(void)
{
        bool enable = true;
        char arg[5];
        int ret;

        if (boot_cpu_has(X86_FEATURE_XENPV))
                goto silent_disable;

        ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (ret > 0) {
                if (!strncmp(arg, "on", 2))
                        goto enable;

                if (!strncmp(arg, "off", 3))
                        goto disable;

                if (!strncmp(arg, "auto", 4))
                        goto skip;
        }

        if (cmdline_find_option_bool(boot_command_line, "nopti"))
                goto disable;

skip:
        /* "auto": AMD CPUs are not affected by Meltdown, so skip KAISER. */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                goto disable;

enable:
        if (enable)
                setup_force_cpu_cap(X86_FEATURE_KAISER);

        return;

disable:
        pr_info("disabled\n");

silent_disable:
        kaiser_enabled = 0;
        setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
|
|
|
|
/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die. But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it. If we BUG_ON() here, we run
 * the risk of being before we have good console output.
 */
void __init kaiser_init(void)
{
        int cpu;

        if (!kaiser_enabled)
                return;

        kaiser_init_all_pgds();

        /* Map each CPU's user-mapped per-cpu region into the shadow tables. */
        for_each_possible_cpu(cpu) {
                void *percpu_vaddr = __per_cpu_user_mapped_start +
                                     per_cpu_offset(cpu);
                unsigned long percpu_sz = __per_cpu_user_mapped_end -
                                          __per_cpu_user_mapped_start;
                kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
                                          __PAGE_KERNEL);
        }

        /*
         * Map the entry/exit text section, which is needed at
         * switches from user to and from kernel.
         */
        kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
                                       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
        kaiser_add_user_map_ptrs_early(__irqentry_text_start,
                                       __irqentry_text_end,
                                       __PAGE_KERNEL_RX);
#endif
        /* The IDT must be reachable while still running on the user CR3. */
        kaiser_add_user_map_early((void *)idt_descr.address,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
        kaiser_add_user_map_early(&trace_idt_descr,
                                  sizeof(trace_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&trace_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);
#endif
        kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&debug_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);

        pr_info("enabled\n");
}
|
|
|
|
/* Add a mapping to the shadow mapping, and synchronize the mappings */
|
|
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
|
|
{
|
|
if (!kaiser_enabled)
|
|
return 0;
|
|
return kaiser_add_user_map((const void *)addr, size, flags);
|
|
}
|
|
|
|
void kaiser_remove_mapping(unsigned long start, unsigned long size)
|
|
{
|
|
extern void unmap_pud_range_nofree(pgd_t *pgd,
|
|
unsigned long start, unsigned long end);
|
|
unsigned long end = start + size;
|
|
unsigned long addr, next;
|
|
pgd_t *pgd;
|
|
|
|
if (!kaiser_enabled)
|
|
return;
|
|
pgd = native_get_shadow_pgd(pgd_offset_k(start));
|
|
for (addr = start; addr < end; pgd++, addr = next) {
|
|
next = pgd_addr_end(addr, end);
|
|
unmap_pud_range_nofree(pgd, addr, next);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Page table pages are page-aligned. The lower half of the top
|
|
* level is used for userspace and the top half for the kernel.
|
|
* This returns true for user pages that need to get copied into
|
|
* both the user and kernel copies of the page tables, and false
|
|
* for kernel pages that should only be in the kernel copy.
|
|
*/
|
|
static inline bool is_userspace_pgd(pgd_t *pgdp)
|
|
{
|
|
return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
|
|
}
|
|
|
|
/*
 * Mirror a kernel-pgd write into the shadow pgd when it maps userspace.
 * Called on the set_pgd() path; returns the (possibly NX-adjusted)
 * value that should be written into the kernel pgd.
 */
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
        if (!kaiser_enabled)
                return pgd;
        /*
         * Do we need to also populate the shadow pgd? Check _PAGE_USER to
         * skip cases like kexec and EFI which make temporary low mappings.
         */
        if (pgd.pgd & _PAGE_USER) {
                if (is_userspace_pgd(pgdp)) {
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
                        /*
                         * Even if the entry is *mapping* userspace, ensure
                         * that userspace can not use it. This way, if we
                         * get out to userspace running on the kernel CR3,
                         * userspace will crash instead of running.
                         */
                        if (__supported_pte_mask & _PAGE_NX)
                                pgd.pgd |= _PAGE_NX;
                }
        } else if (!pgd.pgd) {
                /*
                 * pgd_clear() cannot check _PAGE_USER, and is even used to
                 * clear corrupted pgd entries: so just rely on cases like
                 * kexec and EFI never to be using pgd_clear().
                 */
                if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
                    is_userspace_pgd(pgdp))
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
        }
        return pgd;
}
|
|
|
|
void kaiser_setup_pcid(void)
|
|
{
|
|
unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
|
|
|
|
if (this_cpu_has(X86_FEATURE_PCID))
|
|
user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
|
|
/*
|
|
* These variables are used by the entry/exit
|
|
* code to change PCID and pgd and TLB flushing.
|
|
*/
|
|
this_cpu_write(x86_cr3_pcid_user, user_cr3);
|
|
}
|
|
|
|
/*
|
|
* Make a note that this cpu will need to flush USER tlb on return to user.
|
|
* If cpu does not have PCID, then the NOFLUSH bit will never have been set.
|
|
*/
|
|
void kaiser_flush_tlb_on_return_to_user(void)
|
|
{
|
|
if (this_cpu_has(X86_FEATURE_PCID))
|
|
this_cpu_write(x86_cr3_pcid_user,
|
|
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
|
|
}
|
|
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
|