This is the 4.4.110 stable release
-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAlpPj0wACgkQONu9yGCS aT5QOhAAu3PoT3472I7zuWDUG0KQo5r0wdUO+YPW31VIHrxQ2H3sxR44rSHc5jW/ tTg2TIYNBkNoj4jJDJ9J7f6PSnN1vGFglFW4GzxE3cr2+W7u5M5ex8yCYMcBIY9U 56hbyqX5lf5KjGWJiQThwYsMBokrBJW2igAFN3cW39nNABhl0W39kiysGA9vbNrV +QMA4+ZADA2EeIRcdJmj8uc/cez/7sGAfrSktvATkI+HFamnTs0mrx9cl0eQKvjm y5PCxYUCbi4kqD4WM+UCYO3zpUD+r4iMDXwXBwLWkFvbumY4mVTItP+gq5M4Fb1g MSauGUGH7BDsT9gspricCMcAmjcTn6hth7/7/ZhlNq3NZv89pOquhpE0JOSAmYbA P4WaIRRWwpVrRt+THU7vZpAQWpFSwGmtE7tBfPMt2J7zqY3lMYmO3DoA+gejw3CV igbvmV0UY2uYSFnjawUUJ+k+ggYfGyRkUl2DfcllPhZFqE1XEi3NyjI0wi8vtXTd UlrU55TqsldCw1bjXH3lWrpoNybWvqUD2a249ZVs/h06Q5NKwNL8mTye+2BBQtCP QzAqHYbkBKv/f8M6Kg+HtTzgqUbWxVCeQTWFXHMAPVo4bCwGvVGrXbGJIj15lBuQ GWqc3dt69zxpn1tlcRHKH0P3KnkC67dARtY+8F8+D+HAHVY71Bg= =Kpwd -----END PGP SIGNATURE----- Merge 4.4.110 into android-4.4 Changes in 4.4.110 x86/boot: Add early cmdline parsing for options with arguments KAISER: Kernel Address Isolation kaiser: merged update kaiser: do not set _PAGE_NX on pgd_none kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE kaiser: fix build and FIXME in alloc_ldt_struct() kaiser: KAISER depends on SMP kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER kaiser: fix perf crashes kaiser: ENOMEM if kaiser_pagetable_walk() NULL kaiser: tidied up asm/kaiser.h somewhat kaiser: tidied up kaiser_add/remove_mapping slightly kaiser: kaiser_remove_mapping() move along the pgd kaiser: cleanups while trying for gold link kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET kaiser: delete KAISER_REAL_SWITCH option kaiser: vmstat show NR_KAISERTABLE as nr_overhead kaiser: enhanced by kernel and user PCIDs kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user kaiser: PCID 0 for kernel and 128 for user kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user kaiser: paranoid_entry pass cr3 need to paranoid_exit kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls kaiser: fix unlikely error in alloc_ldt_struct() kaiser: add 
"nokaiser" boot option, using ALTERNATIVE x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling x86/kaiser: Check boottime cmdline params kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush kaiser: drop is_atomic arg to kaiser_pagetable_walk() kaiser: asm/tlbflush.h handle noPGE at lower level kaiser: kaiser_flush_tlb_on_return_to_user() check PCID x86/paravirt: Dont patch flush_tlb_single x86/kaiser: Reenable PARAVIRT kaiser: disabled on Xen PV x86/kaiser: Move feature detection up KPTI: Rename to PAGE_TABLE_ISOLATION KPTI: Report when enabled x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap x86/kasan: Clear kasan_zero_page after TLB flush kaiser: Set _PAGE_NX only if supported Linux 4.4.110 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
commit
5cc8c2ec61
51 changed files with 1470 additions and 148 deletions
|
@ -2529,6 +2529,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
|
||||
nojitter [IA-64] Disables jitter checking for ITC timers.
|
||||
|
||||
nopti [X86-64] Disable KAISER isolation of kernel from user.
|
||||
|
||||
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
|
||||
|
||||
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
|
||||
|
@ -3060,6 +3062,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
pt. [PARIDE]
|
||||
See Documentation/blockdev/paride.txt.
|
||||
|
||||
pti= [X86_64]
|
||||
Control KAISER user/kernel address space isolation:
|
||||
on - enable
|
||||
off - disable
|
||||
auto - default setting
|
||||
|
||||
pty.legacy_count=
|
||||
[KNL] Number of legacy pty's. Overwrites compiled-in
|
||||
default number.
|
||||
|
|
2
Makefile
2
Makefile
|
@ -1,6 +1,6 @@
|
|||
VERSION = 4
|
||||
PATCHLEVEL = 4
|
||||
SUBLEVEL = 109
|
||||
SUBLEVEL = 110
|
||||
EXTRAVERSION =
|
||||
NAME = Blurry Fish Butt
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
*/
|
||||
#undef CONFIG_PARAVIRT
|
||||
#undef CONFIG_PARAVIRT_SPINLOCKS
|
||||
#undef CONFIG_PAGE_TABLE_ISOLATION
|
||||
#undef CONFIG_KASAN
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
#include <asm/asm.h>
|
||||
#include <asm/smap.h>
|
||||
#include <asm/pgtable_types.h>
|
||||
#include <asm/kaiser.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
|
||||
|
@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
|
|||
* it is too small to ever cause noticeable irq latency.
|
||||
*/
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
/*
|
||||
* A hypervisor implementation might want to use a label
|
||||
* after the swapgs, so that it can do the swapgs
|
||||
|
@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
|
|||
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
|
||||
jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
|
||||
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
movq RIP(%rsp), %rcx
|
||||
movq EFLAGS(%rsp), %r11
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
/*
|
||||
* This opens a window where we have a user CR3, but are
|
||||
* running in the kernel. This makes using the CS
|
||||
* register useless for telling whether or not we need to
|
||||
* switch CR3 in NMIs. Normal interrupts are OK because
|
||||
* they are off here.
|
||||
*/
|
||||
SWITCH_USER_CR3
|
||||
movq RSP(%rsp), %rsp
|
||||
/*
|
||||
* 64-bit SYSRET restores rip from rcx,
|
||||
|
@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
|
|||
syscall_return_via_sysret:
|
||||
/* rcx and r11 are already restored (see code above) */
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
/*
|
||||
* This opens a window where we have a user CR3, but are
|
||||
* running in the kernel. This makes using the CS
|
||||
* register useless for telling whether or not we need to
|
||||
* switch CR3 in NMIs. Normal interrupts are OK because
|
||||
* they are off here.
|
||||
*/
|
||||
SWITCH_USER_CR3
|
||||
movq RSP(%rsp), %rsp
|
||||
USERGS_SYSRET64
|
||||
|
||||
opportunistic_sysret_failed:
|
||||
/*
|
||||
* This opens a window where we have a user CR3, but are
|
||||
* running in the kernel. This makes using the CS
|
||||
* register useless for telling whether or not we need to
|
||||
* switch CR3 in NMIs. Normal interrupts are OK because
|
||||
* they are off here.
|
||||
*/
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
jmp restore_c_regs_and_iret
|
||||
END(entry_SYSCALL_64)
|
||||
|
@ -509,6 +535,7 @@ END(irq_entries_start)
|
|||
* tracking that we're in kernel mode.
|
||||
*/
|
||||
SWAPGS
|
||||
SWITCH_KERNEL_CR3
|
||||
|
||||
/*
|
||||
* We need to tell lockdep that IRQs are off. We can't do this until
|
||||
|
@ -568,6 +595,7 @@ GLOBAL(retint_user)
|
|||
mov %rsp,%rdi
|
||||
call prepare_exit_to_usermode
|
||||
TRACE_IRQS_IRETQ
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
jmp restore_regs_and_iret
|
||||
|
||||
|
@ -625,6 +653,7 @@ native_irq_return_ldt:
|
|||
pushq %rax
|
||||
pushq %rdi
|
||||
SWAPGS
|
||||
SWITCH_KERNEL_CR3
|
||||
movq PER_CPU_VAR(espfix_waddr), %rdi
|
||||
movq %rax, (0*8)(%rdi) /* RAX */
|
||||
movq (2*8)(%rsp), %rax /* RIP */
|
||||
|
@ -640,6 +669,7 @@ native_irq_return_ldt:
|
|||
andl $0xffff0000, %eax
|
||||
popq %rdi
|
||||
orq PER_CPU_VAR(espfix_stack), %rax
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
movq %rax, %rsp
|
||||
popq %rax
|
||||
|
@ -1001,7 +1031,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
|
|||
/*
|
||||
* Save all registers in pt_regs, and switch gs if needed.
|
||||
* Use slow, but surefire "are we in kernel?" check.
|
||||
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
|
||||
*
|
||||
* Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
|
||||
* ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
|
||||
* ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
|
||||
* ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
|
||||
*/
|
||||
ENTRY(paranoid_entry)
|
||||
cld
|
||||
|
@ -1014,7 +1048,26 @@ ENTRY(paranoid_entry)
|
|||
js 1f /* negative -> in kernel */
|
||||
SWAPGS
|
||||
xorl %ebx, %ebx
|
||||
1: ret
|
||||
1:
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* We might have come in between a swapgs and a SWITCH_KERNEL_CR3
|
||||
* on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
|
||||
* Do a conditional SWITCH_KERNEL_CR3: this could safely be done
|
||||
* unconditionally, but we need to find out whether the reverse
|
||||
* should be done on return (conveyed to paranoid_exit in %ebx).
|
||||
*/
|
||||
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
|
||||
testl $KAISER_SHADOW_PGD_OFFSET, %eax
|
||||
jz 2f
|
||||
orl $2, %ebx
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
|
||||
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
|
||||
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
|
||||
movq %rax, %cr3
|
||||
2:
|
||||
#endif
|
||||
ret
|
||||
END(paranoid_entry)
|
||||
|
||||
/*
|
||||
|
@ -1027,19 +1080,26 @@ END(paranoid_entry)
|
|||
* be complicated. Fortunately, there's no good reason
|
||||
* to try to handle preemption here.
|
||||
*
|
||||
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
|
||||
* On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
|
||||
* ebx=1: needs neither swapgs nor SWITCH_USER_CR3
|
||||
* ebx=2: needs both swapgs and SWITCH_USER_CR3
|
||||
* ebx=3: needs SWITCH_USER_CR3 but not swapgs
|
||||
*/
|
||||
ENTRY(paranoid_exit)
|
||||
DISABLE_INTERRUPTS(CLBR_NONE)
|
||||
TRACE_IRQS_OFF_DEBUG
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
jnz paranoid_exit_no_swapgs
|
||||
TRACE_IRQS_IRETQ
|
||||
SWAPGS_UNSAFE_STACK
|
||||
jmp paranoid_exit_restore
|
||||
paranoid_exit_no_swapgs:
|
||||
TRACE_IRQS_IRETQ_DEBUG
|
||||
paranoid_exit_restore:
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
|
||||
testl $2, %ebx /* SWITCH_USER_CR3 needed? */
|
||||
jz paranoid_exit_no_switch
|
||||
SWITCH_USER_CR3
|
||||
paranoid_exit_no_switch:
|
||||
#endif
|
||||
testl $1, %ebx /* swapgs needed? */
|
||||
jnz paranoid_exit_no_swapgs
|
||||
SWAPGS_UNSAFE_STACK
|
||||
paranoid_exit_no_swapgs:
|
||||
RESTORE_EXTRA_REGS
|
||||
RESTORE_C_REGS
|
||||
REMOVE_PT_GPREGS_FROM_STACK 8
|
||||
|
@ -1054,6 +1114,13 @@ ENTRY(error_entry)
|
|||
cld
|
||||
SAVE_C_REGS 8
|
||||
SAVE_EXTRA_REGS 8
|
||||
/*
|
||||
* error_entry() always returns with a kernel gsbase and
|
||||
* CR3. We must also have a kernel CR3/gsbase before
|
||||
* calling TRACE_IRQS_*. Just unconditionally switch to
|
||||
* the kernel CR3 here.
|
||||
*/
|
||||
SWITCH_KERNEL_CR3
|
||||
xorl %ebx, %ebx
|
||||
testb $3, CS+8(%rsp)
|
||||
jz .Lerror_kernelspace
|
||||
|
@ -1216,6 +1283,10 @@ ENTRY(nmi)
|
|||
*/
|
||||
|
||||
SWAPGS_UNSAFE_STACK
|
||||
/*
|
||||
* percpu variables are mapped with user CR3, so no need
|
||||
* to switch CR3 here.
|
||||
*/
|
||||
cld
|
||||
movq %rsp, %rdx
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
@ -1249,12 +1320,34 @@ ENTRY(nmi)
|
|||
|
||||
movq %rsp, %rdi
|
||||
movq $-1, %rsi
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/* Unconditionally use kernel CR3 for do_nmi() */
|
||||
/* %rax is saved above, so OK to clobber here */
|
||||
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
|
||||
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
|
||||
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
|
||||
pushq %rax
|
||||
/* mask off "user" bit of pgd address and 12 PCID bits: */
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
|
||||
movq %rax, %cr3
|
||||
2:
|
||||
#endif
|
||||
call do_nmi
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Unconditionally restore CR3. I know we return to
|
||||
* kernel code that needs user CR3, but do we ever return
|
||||
* to "user mode" where we need the kernel CR3?
|
||||
*/
|
||||
ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Return back to user mode. We must *not* do the normal exit
|
||||
* work, because we don't want to enable interrupts. Fortunately,
|
||||
* do_nmi doesn't modify pt_regs.
|
||||
* work, because we don't want to enable interrupts. Do not
|
||||
* switch to user CR3: we might be going back to kernel code
|
||||
* that had a user CR3 set.
|
||||
*/
|
||||
SWAPGS
|
||||
jmp restore_c_regs_and_iret
|
||||
|
@ -1451,22 +1544,55 @@ end_repeat_nmi:
|
|||
ALLOC_PT_GPREGS_ON_STACK
|
||||
|
||||
/*
|
||||
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
|
||||
* as we should not be calling schedule in NMI context.
|
||||
* Even with normal interrupts enabled. An NMI should not be
|
||||
* setting NEED_RESCHED or anything that normal interrupts and
|
||||
* exceptions might do.
|
||||
* Use the same approach as paranoid_entry to handle SWAPGS, but
|
||||
* without CR3 handling since we do that differently in NMIs. No
|
||||
* need to use paranoid_exit as we should not be calling schedule
|
||||
* in NMI context. Even with normal interrupts enabled. An NMI
|
||||
* should not be setting NEED_RESCHED or anything that normal
|
||||
* interrupts and exceptions might do.
|
||||
*/
|
||||
call paranoid_entry
|
||||
|
||||
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
|
||||
cld
|
||||
SAVE_C_REGS
|
||||
SAVE_EXTRA_REGS
|
||||
movl $1, %ebx
|
||||
movl $MSR_GS_BASE, %ecx
|
||||
rdmsr
|
||||
testl %edx, %edx
|
||||
js 1f /* negative -> in kernel */
|
||||
SWAPGS
|
||||
xorl %ebx, %ebx
|
||||
1:
|
||||
movq %rsp, %rdi
|
||||
movq $-1, %rsi
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/* Unconditionally use kernel CR3 for do_nmi() */
|
||||
/* %rax is saved above, so OK to clobber here */
|
||||
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
|
||||
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
|
||||
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
|
||||
pushq %rax
|
||||
/* mask off "user" bit of pgd address and 12 PCID bits: */
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
|
||||
movq %rax, %cr3
|
||||
2:
|
||||
#endif
|
||||
|
||||
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
|
||||
call do_nmi
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Unconditionally restore CR3. We might be returning to
|
||||
* kernel code that needs user CR3, like just before
|
||||
* a sysret.
|
||||
*/
|
||||
ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
|
||||
#endif
|
||||
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
jnz nmi_restore
|
||||
nmi_swapgs:
|
||||
/* We fixed up CR3 above, so no need to switch it here */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
nmi_restore:
|
||||
RESTORE_EXTRA_REGS
|
||||
|
|
|
@ -13,6 +13,8 @@
|
|||
#include <asm/irqflags.h>
|
||||
#include <asm/asm.h>
|
||||
#include <asm/smap.h>
|
||||
#include <asm/pgtable_types.h>
|
||||
#include <asm/kaiser.h>
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
|
@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
|
|||
ENTRY(entry_SYSENTER_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
/*
|
||||
|
@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
|
|||
ENTRY(entry_SYSCALL_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
|
||||
/* Stash user ESP and switch to the kernel stack. */
|
||||
movl %esp, %r8d
|
||||
|
@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
|
|||
/* Opportunistic SYSRET */
|
||||
sysret32_from_system_call:
|
||||
TRACE_IRQS_ON /* User mode traces as IRQs on. */
|
||||
SWITCH_USER_CR3
|
||||
movq RBX(%rsp), %rbx /* pt_regs->rbx */
|
||||
movq RBP(%rsp), %rbp /* pt_regs->rbp */
|
||||
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
|
||||
|
@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
|
|||
PARAVIRT_ADJUST_EXCEPTION_FRAME
|
||||
ASM_CLAC /* Do this early to minimize exposure */
|
||||
SWAPGS
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
|
||||
/*
|
||||
* User tracing code (ptrace or signal handlers) might assume that
|
||||
|
@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
|
|||
|
||||
/* Go back to user mode. */
|
||||
TRACE_IRQS_ON
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
jmp restore_regs_and_iret
|
||||
END(entry_INT80_compat)
|
||||
|
|
|
@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PARAVIRT_CLOCK
|
||||
extern u8 pvclock_page
|
||||
__attribute__((visibility("hidden")));
|
||||
#endif
|
||||
|
||||
#ifndef BUILD_VDSO32
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
|
|||
|
||||
#ifdef CONFIG_PARAVIRT_CLOCK
|
||||
|
||||
static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
|
||||
static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
|
||||
{
|
||||
const struct pvclock_vsyscall_time_info *pvti_base;
|
||||
int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
|
||||
int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
|
||||
|
||||
BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
|
||||
|
||||
pvti_base = (struct pvclock_vsyscall_time_info *)
|
||||
__fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
|
||||
|
||||
return &pvti_base[offset];
|
||||
return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
|
||||
}
|
||||
|
||||
static notrace cycle_t vread_pvclock(int *mode)
|
||||
{
|
||||
const struct pvclock_vsyscall_time_info *pvti;
|
||||
const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
|
||||
cycle_t ret;
|
||||
u64 last;
|
||||
u32 version;
|
||||
u8 flags;
|
||||
unsigned cpu, cpu1;
|
||||
|
||||
u64 tsc, pvti_tsc;
|
||||
u64 last, delta, pvti_system_time;
|
||||
u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
|
||||
|
||||
/*
|
||||
* Note: hypervisor must guarantee that:
|
||||
* 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
|
||||
* 2. that per-CPU pvclock time info is updated if the
|
||||
* underlying CPU changes.
|
||||
* 3. that version is increased whenever underlying CPU
|
||||
* changes.
|
||||
* Note: The kernel and hypervisor must guarantee that cpu ID
|
||||
* number maps 1:1 to per-CPU pvclock time info.
|
||||
*
|
||||
* Because the hypervisor is entirely unaware of guest userspace
|
||||
* preemption, it cannot guarantee that per-CPU pvclock time
|
||||
* info is updated if the underlying CPU changes or that the
|
||||
* version is increased whenever underlying CPU changes.
|
||||
*
|
||||
* On KVM, we are guaranteed that pvti updates for any vCPU are
|
||||
* atomic as seen by *all* vCPUs. This is an even stronger
|
||||
* guarantee than we get with a normal seqlock.
|
||||
*
|
||||
* On Xen, we don't appear to have that guarantee, but Xen still
|
||||
* supplies a valid seqlock using the version field.
|
||||
|
||||
* We only do pvclock vdso timing at all if
|
||||
* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
|
||||
* mean that all vCPUs have matching pvti and that the TSC is
|
||||
* synced, so we can just look at vCPU 0's pvti.
|
||||
*/
|
||||
do {
|
||||
cpu = __getcpu() & VGETCPU_CPU_MASK;
|
||||
/* TODO: We can put vcpu id into higher bits of pvti.version.
|
||||
* This will save a couple of cycles by getting rid of
|
||||
* __getcpu() calls (Gleb).
|
||||
*/
|
||||
|
||||
pvti = get_pvti(cpu);
|
||||
|
||||
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
|
||||
|
||||
/*
|
||||
* Test we're still on the cpu as well as the version.
|
||||
* We could have been migrated just after the first
|
||||
* vgetcpu but before fetching the version, so we
|
||||
* wouldn't notice a version change.
|
||||
*/
|
||||
cpu1 = __getcpu() & VGETCPU_CPU_MASK;
|
||||
} while (unlikely(cpu != cpu1 ||
|
||||
(pvti->pvti.version & 1) ||
|
||||
pvti->pvti.version != version));
|
||||
|
||||
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
|
||||
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
|
||||
*mode = VCLOCK_NONE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
do {
|
||||
version = pvti->version;
|
||||
|
||||
/* This is also a read barrier, so we'll read version first. */
|
||||
tsc = rdtsc_ordered();
|
||||
|
||||
pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
|
||||
pvti_tsc_shift = pvti->tsc_shift;
|
||||
pvti_system_time = pvti->system_time;
|
||||
pvti_tsc = pvti->tsc_timestamp;
|
||||
|
||||
/* Make sure that the version double-check is last. */
|
||||
smp_rmb();
|
||||
} while (unlikely((version & 1) || version != pvti->version));
|
||||
|
||||
delta = tsc - pvti_tsc;
|
||||
ret = pvti_system_time +
|
||||
pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
|
||||
pvti_tsc_shift);
|
||||
|
||||
/* refer to tsc.c read_tsc() comment for rationale */
|
||||
last = gtod->cycle_last;
|
||||
|
|
|
@ -25,7 +25,7 @@ SECTIONS
|
|||
* segment.
|
||||
*/
|
||||
|
||||
vvar_start = . - 2 * PAGE_SIZE;
|
||||
vvar_start = . - 3 * PAGE_SIZE;
|
||||
vvar_page = vvar_start;
|
||||
|
||||
/* Place all vvars at the offsets in asm/vvar.h. */
|
||||
|
@ -36,6 +36,7 @@ SECTIONS
|
|||
#undef EMIT_VVAR
|
||||
|
||||
hpet_page = vvar_start + PAGE_SIZE;
|
||||
pvclock_page = vvar_start + 2 * PAGE_SIZE;
|
||||
|
||||
. = SIZEOF_HEADERS;
|
||||
|
||||
|
|
|
@ -73,6 +73,7 @@ enum {
|
|||
sym_vvar_start,
|
||||
sym_vvar_page,
|
||||
sym_hpet_page,
|
||||
sym_pvclock_page,
|
||||
sym_VDSO_FAKE_SECTION_TABLE_START,
|
||||
sym_VDSO_FAKE_SECTION_TABLE_END,
|
||||
};
|
||||
|
@ -80,6 +81,7 @@ enum {
|
|||
const int special_pages[] = {
|
||||
sym_vvar_page,
|
||||
sym_hpet_page,
|
||||
sym_pvclock_page,
|
||||
};
|
||||
|
||||
struct vdso_sym {
|
||||
|
@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
|
|||
[sym_vvar_start] = {"vvar_start", true},
|
||||
[sym_vvar_page] = {"vvar_page", true},
|
||||
[sym_hpet_page] = {"hpet_page", true},
|
||||
[sym_pvclock_page] = {"pvclock_page", true},
|
||||
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
|
||||
"VDSO_FAKE_SECTION_TABLE_START", false
|
||||
},
|
||||
|
|
|
@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
|||
.name = "[vvar]",
|
||||
.pages = no_pages,
|
||||
};
|
||||
struct pvclock_vsyscall_time_info *pvti;
|
||||
|
||||
if (calculate_addr) {
|
||||
addr = vdso_addr(current->mm->start_stack,
|
||||
|
@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
|||
}
|
||||
#endif
|
||||
|
||||
pvti = pvclock_pvti_cpu0_va();
|
||||
if (pvti && image->sym_pvclock_page) {
|
||||
ret = remap_pfn_range(vma,
|
||||
text_start + image->sym_pvclock_page,
|
||||
__pa(pvti) >> PAGE_SHIFT,
|
||||
PAGE_SIZE,
|
||||
PAGE_READONLY);
|
||||
|
||||
if (ret)
|
||||
goto up_fail;
|
||||
}
|
||||
|
||||
up_fail:
|
||||
if (ret)
|
||||
current->mm->context.vdso = NULL;
|
||||
|
|
|
@ -2,5 +2,7 @@
|
|||
#define _ASM_X86_CMDLINE_H
|
||||
|
||||
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
|
||||
int cmdline_find_option(const char *cmdline_ptr, const char *option,
|
||||
char *buffer, int bufsize);
|
||||
|
||||
#endif /* _ASM_X86_CMDLINE_H */
|
||||
|
|
|
@ -187,6 +187,7 @@
|
|||
#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
|
||||
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
|
||||
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
|
||||
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
|
||||
#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
|
||||
#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
|
||||
#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
|
||||
|
@ -199,6 +200,9 @@
|
|||
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
|
||||
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
|
||||
|
||||
/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
|
||||
#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
|
||||
|
||||
/* Virtualization flags: Linux defined, word 8 */
|
||||
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
|
||||
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
|
||||
|
|
|
@ -43,7 +43,7 @@ struct gdt_page {
|
|||
struct desc_struct gdt[GDT_ENTRIES];
|
||||
} __attribute__((aligned(PAGE_SIZE)));
|
||||
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
|
||||
|
||||
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
|
||||
{
|
||||
|
|
|
@ -187,7 +187,7 @@ extern char irq_entries_start[];
|
|||
#define VECTOR_RETRIGGERED ((void *)~0UL)
|
||||
|
||||
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
|
||||
DECLARE_PER_CPU(vector_irq_t, vector_irq);
|
||||
DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
|
||||
|
||||
#endif /* !ASSEMBLY_ */
|
||||
|
||||
|
|
141
arch/x86/include/asm/kaiser.h
Normal file
141
arch/x86/include/asm/kaiser.h
Normal file
|
@ -0,0 +1,141 @@
|
|||
#ifndef _ASM_X86_KAISER_H
|
||||
#define _ASM_X86_KAISER_H
|
||||
|
||||
#include <uapi/asm/processor-flags.h> /* For PCID constants */
|
||||
|
||||
/*
|
||||
* This file includes the definitions for the KAISER feature.
|
||||
* KAISER is a countermeasure against x86_64 side channel attacks on
|
||||
* the kernel virtual memory. It has a shadow pgd for every process: the
|
||||
* shadow pgd has a minimalistic kernel-set mapped, but includes the whole
|
||||
* user memory. Within a kernel context switch, or when an interrupt is handled,
|
||||
* the pgd is switched to the normal one. When the system switches to user mode,
|
||||
* the shadow pgd is enabled. By this, the virtual memory caches are freed,
|
||||
* and the user may not attack the whole kernel memory.
|
||||
*
|
||||
* A minimalistic kernel mapping holds the parts needed to be mapped in user
|
||||
* mode, such as the entry/exit functions of the user space, or the stacks.
|
||||
*/
|
||||
|
||||
#define KAISER_SHADOW_PGD_OFFSET 0x1000
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
|
||||
.macro _SWITCH_TO_KERNEL_CR3 reg
|
||||
movq %cr3, \reg
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
|
||||
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
|
||||
ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
|
||||
movq \reg, %cr3
|
||||
.endm
|
||||
|
||||
.macro _SWITCH_TO_USER_CR3 reg regb
|
||||
/*
|
||||
* regb must be the low byte portion of reg: because we have arranged
|
||||
* for the low byte of the user PCID to serve as the high byte of NOFLUSH
|
||||
* (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
|
||||
* not enabled): so that the one register can update both memory and cr3.
|
||||
*/
|
||||
movq %cr3, \reg
|
||||
orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
|
||||
js 9f
|
||||
/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
|
||||
movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
|
||||
9:
|
||||
movq \reg, %cr3
|
||||
.endm
|
||||
|
||||
.macro SWITCH_KERNEL_CR3
|
||||
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
|
||||
_SWITCH_TO_KERNEL_CR3 %rax
|
||||
popq %rax
|
||||
8:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_USER_CR3
|
||||
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
|
||||
_SWITCH_TO_USER_CR3 %rax %al
|
||||
popq %rax
|
||||
8:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_KERNEL_CR3_NO_STACK
|
||||
ALTERNATIVE "jmp 8f", \
|
||||
__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
|
||||
X86_FEATURE_KAISER
|
||||
_SWITCH_TO_KERNEL_CR3 %rax
|
||||
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
|
||||
8:
|
||||
.endm
|
||||
|
||||
#else /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
.macro SWITCH_KERNEL_CR3
|
||||
.endm
|
||||
.macro SWITCH_USER_CR3
|
||||
.endm
|
||||
.macro SWITCH_KERNEL_CR3_NO_STACK
|
||||
.endm
|
||||
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
#else /* __ASSEMBLY__ */
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Upon kernel/user mode switch, it may happen that the address
|
||||
* space has to be switched before the registers have been
|
||||
* stored. To change the address space, another register is
|
||||
* needed. A register therefore has to be stored/restored.
|
||||
*/
|
||||
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
|
||||
|
||||
DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
|
||||
|
||||
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
|
||||
|
||||
extern int kaiser_enabled;
|
||||
extern void __init kaiser_check_boottime_disable(void);
|
||||
#else
|
||||
#define kaiser_enabled 0
|
||||
static inline void __init kaiser_check_boottime_disable(void) {}
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
/*
|
||||
* Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
|
||||
* so as to build with tests on kaiser_enabled instead of #ifdefs.
|
||||
*/
|
||||
|
||||
/**
|
||||
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
|
||||
* @addr: the start address of the range
|
||||
* @size: the size of the range
|
||||
* @flags: The mapping flags of the pages
|
||||
*
|
||||
* The mapping is done on a global scope, so no bigger
|
||||
* synchronization has to be done. The pages have to be
|
||||
* manually unmapped again when they are not needed any longer.
|
||||
*/
|
||||
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
|
||||
|
||||
/**
|
||||
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
|
||||
* @start: the start address of the range
|
||||
* @size: the size of the range
|
||||
*/
|
||||
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
|
||||
|
||||
/**
|
||||
* kaiser_init - Initialize the shadow mapping
|
||||
*
|
||||
* Most parts of the shadow mapping can be mapped upon boot
|
||||
* time. Only per-process things like the thread stacks
|
||||
* or a new LDT have to be mapped at runtime. These boot-
|
||||
* time mappings are permanent and never unmapped.
|
||||
*/
|
||||
extern void kaiser_init(void);
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_KAISER_H */
|
|
@ -18,6 +18,12 @@
|
|||
#ifndef __ASSEMBLY__
|
||||
#include <asm/x86_init.h>
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
extern int kaiser_enabled;
|
||||
#else
|
||||
#define kaiser_enabled 0
|
||||
#endif
|
||||
|
||||
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
|
||||
void ptdump_walk_pgd_level_checkwx(void);
|
||||
|
||||
|
@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
|
|||
|
||||
static inline int pgd_bad(pgd_t pgd)
|
||||
{
|
||||
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
|
||||
pgdval_t ignore_flags = _PAGE_USER;
|
||||
/*
|
||||
* We set NX on KAISER pgds that map userspace memory so
|
||||
* that userspace can not meaningfully use the kernel
|
||||
* page table by accident; it will fault on the first
|
||||
* instruction it tries to run. See native_set_pgd().
|
||||
*/
|
||||
if (kaiser_enabled)
|
||||
ignore_flags |= _PAGE_NX;
|
||||
|
||||
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
|
||||
}
|
||||
|
||||
static inline int pgd_none(pgd_t pgd)
|
||||
|
@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
|
|||
*/
|
||||
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	/* Copy @count top-level entries from @src to @dst. */
	memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (kaiser_enabled) {
		/*
		 * Clone the shadow pgd part as well: the same @count
		 * entries, taken from and written to the locations that
		 * native_get_shadow_pgd() derives from @src and @dst.
		 */
		memcpy(native_get_shadow_pgd(dst),
		       native_get_shadow_pgd(src),
		       count * sizeof(pgd_t));
	}
#endif
}
|
||||
|
||||
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
|
||||
|
|
|
@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
|
|||
native_set_pud(pud, native_make_pud(0));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
|
||||
|
||||
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
|
||||
{
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
/* linux/mmdebug.h may not have been included at this point */
|
||||
BUG_ON(!kaiser_enabled);
|
||||
#endif
|
||||
return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
|
||||
}
|
||||
#else
|
||||
static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
return pgd;
|
||||
}
|
||||
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
|
||||
{
|
||||
BUILD_BUG_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
/*
 * native_set_pgd - install a top-level page table entry.
 * @pgdp: slot to write
 * @pgd:  requested entry
 *
 * The value actually stored is whatever kaiser_set_shadow_pgd() returns
 * for (@pgdp, @pgd); without CONFIG_PAGE_TABLE_ISOLATION that helper is
 * an inline stub returning @pgd unchanged.  The slot is written exactly
 * once, so the unfiltered @pgd is never visible in *@pgdp.
 */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
	*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
|
||||
|
||||
static inline void native_pgd_clear(pgd_t *pgd)
|
||||
|
|
|
@ -89,7 +89,7 @@
|
|||
#define _PAGE_NX (_AT(pteval_t, 0))
|
||||
#endif
|
||||
|
||||
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
|
||||
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
|
||||
|
||||
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
|
||||
_PAGE_ACCESSED | _PAGE_DIRTY)
|
||||
|
@ -102,6 +102,33 @@
|
|||
_PAGE_SOFT_DIRTY)
|
||||
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
|
||||
|
||||
/* The ASID is the lower 12 bits of CR3 */
|
||||
#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
|
||||
|
||||
/* Mask for all the PCID-related bits in CR3: */
|
||||
#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
|
||||
#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
|
||||
|
||||
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
|
||||
/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
|
||||
#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
|
||||
|
||||
#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
|
||||
#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
|
||||
#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
|
||||
#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
|
||||
#else
|
||||
#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
|
||||
/*
|
||||
* PCIDs are unsupported on 32-bit and none of these bits can be
|
||||
* set in CR3:
|
||||
*/
|
||||
#define X86_CR3_PCID_KERN_FLUSH (0)
|
||||
#define X86_CR3_PCID_USER_FLUSH (0)
|
||||
#define X86_CR3_PCID_KERN_NOFLUSH (0)
|
||||
#define X86_CR3_PCID_USER_NOFLUSH (0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The cache modes defined here are used to translate between pure SW usage
|
||||
* and the HW defined cache mode bits and/or PAT entries.
|
||||
|
|
|
@ -305,7 +305,7 @@ struct tss_struct {
|
|||
|
||||
} ____cacheline_aligned;
|
||||
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
|
||||
|
|
|
@ -4,6 +4,15 @@
|
|||
#include <linux/clocksource.h>
|
||||
#include <asm/pvclock-abi.h>
|
||||
|
||||
/*
 * pvclock_pvti_cpu0_va() hands out a pointer to a
 * struct pvclock_vsyscall_time_info.  With CONFIG_PARAVIRT_CLOCK it is
 * provided out of line; otherwise this inline stub always returns NULL,
 * so callers need no #ifdef of their own.
 */
#ifdef CONFIG_PARAVIRT_CLOCK
extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
#else
static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
	return NULL;
}
#endif
|
||||
|
||||
/* some helper functions for xen and kvm pv clock sources */
|
||||
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
|
||||
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
|
||||
|
|
|
@ -131,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
|
|||
cr4_set_bits(mask);
|
||||
}
|
||||
|
||||
/*
 * Declare a couple of kaiser interfaces here for convenience,
 * to avoid the need for asm/kaiser.h in unexpected places.
 *
 * Without CONFIG_PAGE_TABLE_ISOLATION, kaiser_enabled is the constant 0
 * and both helpers are empty inline stubs, so callers may use all three
 * names unconditionally.
 */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern int kaiser_enabled;
extern void kaiser_setup_pcid(void);
extern void kaiser_flush_tlb_on_return_to_user(void);
#else
#define kaiser_enabled 0
static inline void kaiser_setup_pcid(void)
{
}
static inline void kaiser_flush_tlb_on_return_to_user(void)
{
}
#endif
|
||||
|
||||
static inline void __native_flush_tlb(void)
|
||||
{
|
||||
/*
|
||||
|
@ -139,6 +157,8 @@ static inline void __native_flush_tlb(void)
|
|||
* back:
|
||||
*/
|
||||
preempt_disable();
|
||||
if (kaiser_enabled)
|
||||
kaiser_flush_tlb_on_return_to_user();
|
||||
native_write_cr3(native_read_cr3());
|
||||
preempt_enable();
|
||||
}
|
||||
|
@ -148,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
|
|||
unsigned long cr4;
|
||||
|
||||
cr4 = this_cpu_read(cpu_tlbstate.cr4);
|
||||
/* clear PGE */
|
||||
native_write_cr4(cr4 & ~X86_CR4_PGE);
|
||||
/* write old PGE again and flush TLBs */
|
||||
native_write_cr4(cr4);
|
||||
if (cr4 & X86_CR4_PGE) {
|
||||
/* clear PGE and flush TLB of all entries */
|
||||
native_write_cr4(cr4 & ~X86_CR4_PGE);
|
||||
/* restore PGE as it was before */
|
||||
native_write_cr4(cr4);
|
||||
} else {
|
||||
/* do it with cr3, letting kaiser flush user PCID */
|
||||
__native_flush_tlb();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void __native_flush_tlb_global(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (static_cpu_has(X86_FEATURE_INVPCID)) {
|
||||
if (this_cpu_has(X86_FEATURE_INVPCID)) {
|
||||
/*
|
||||
* Using INVPCID is considerably faster than a pair of writes
|
||||
* to CR4 sandwiched inside an IRQ flag save/restore.
|
||||
*
|
||||
* Note, this works with CR4.PCIDE=0 or 1.
|
||||
*/
|
||||
invpcid_flush_all();
|
||||
return;
|
||||
|
@ -173,24 +200,45 @@ static inline void __native_flush_tlb_global(void)
|
|||
* be called from deep inside debugging code.)
|
||||
*/
|
||||
raw_local_irq_save(flags);
|
||||
|
||||
__native_flush_tlb_global_irq_disabled();
|
||||
|
||||
raw_local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static inline void __native_flush_tlb_single(unsigned long addr)
|
||||
{
|
||||
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
||||
/*
|
||||
* SIMICS #GP's if you run INVPCID with type 2/3
|
||||
* and X86_CR4_PCIDE clear. Shame!
|
||||
*
|
||||
* The ASIDs used below are hard-coded. But, we must not
|
||||
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call
|
||||
* invlpg in the case we are called early.
|
||||
*/
|
||||
|
||||
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
|
||||
if (kaiser_enabled)
|
||||
kaiser_flush_tlb_on_return_to_user();
|
||||
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
||||
return;
|
||||
}
|
||||
/* Flush the address out of both PCIDs. */
|
||||
/*
|
||||
* An optimization here might be to determine addresses
|
||||
* that are only kernel-mapped and only flush the kernel
|
||||
* ASID. But, userspace flushes are probably much more
|
||||
* important performance-wise.
|
||||
*
|
||||
* Make sure to do only a single invpcid when KAISER is
|
||||
* disabled and we have only a single ASID.
|
||||
*/
|
||||
if (kaiser_enabled)
|
||||
invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
|
||||
invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
|
||||
}
|
||||
|
||||
static inline void __flush_tlb_all(void)
|
||||
{
|
||||
if (cpu_has_pge)
|
||||
__flush_tlb_global();
|
||||
else
|
||||
__flush_tlb();
|
||||
|
||||
__flush_tlb_global();
|
||||
/*
|
||||
* Note: if we somehow had PCID but not PGE, then this wouldn't work --
|
||||
* we'd end up flushing kernel translations for the current ASID but
|
||||
|
|
|
@ -22,6 +22,7 @@ struct vdso_image {
|
|||
|
||||
long sym_vvar_page;
|
||||
long sym_hpet_page;
|
||||
long sym_pvclock_page;
|
||||
long sym_VDSO32_NOTE_MASK;
|
||||
long sym___kernel_sigreturn;
|
||||
long sym___kernel_rt_sigreturn;
|
||||
|
|
|
@ -77,7 +77,8 @@
|
|||
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
|
||||
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
|
||||
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
|
||||
#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
|
||||
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
|
||||
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
|
||||
|
||||
/*
|
||||
* Intel CPU features in CR4
|
||||
|
|
|
@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
|
|||
|
||||
static const struct cpu_dev *this_cpu = &default_cpu;
|
||||
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* We need valid kernel segments for data and code in long mode too
|
||||
|
@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
|
|||
static void setup_pcid(struct cpuinfo_x86 *c)
|
||||
{
|
||||
if (cpu_has(c, X86_FEATURE_PCID)) {
|
||||
if (cpu_has(c, X86_FEATURE_PGE)) {
|
||||
if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
|
||||
cr4_set_bits(X86_CR4_PCIDE);
|
||||
/*
|
||||
* INVPCID has two "groups" of types:
|
||||
* 1/2: Invalidate an individual address
|
||||
* 3/4: Invalidate all contexts
|
||||
*
|
||||
* 1/2 take a PCID, but 3/4 do not. So, 3/4
|
||||
* ignore the PCID argument in the descriptor.
|
||||
* But, we have to be careful not to call 1/2
|
||||
* with an actual non-zero PCID in them before
|
||||
* we do the above cr4_set_bits().
|
||||
*/
|
||||
if (cpu_has(c, X86_FEATURE_INVPCID))
|
||||
set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
|
||||
} else {
|
||||
/*
|
||||
* flush_tlb_all(), as currently implemented, won't
|
||||
|
@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
|
|||
clear_cpu_cap(c, X86_FEATURE_PCID);
|
||||
}
|
||||
}
|
||||
kaiser_setup_pcid();
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
|
|||
[DEBUG_STACK - 1] = DEBUG_STKSZ
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
|
||||
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
|
||||
|
||||
/* May not be marked __init: used by software suspend */
|
||||
|
@ -1392,6 +1406,14 @@ void cpu_init(void)
|
|||
* try to read it.
|
||||
*/
|
||||
cr4_init_shadow();
|
||||
if (!kaiser_enabled) {
|
||||
/*
|
||||
* secondary_startup_64() deferred setting PGE in cr4:
|
||||
* probe_page_size_mask() sets it on the boot cpu,
|
||||
* but it needs to be set on each secondary cpu.
|
||||
*/
|
||||
cr4_set_bits(X86_CR4_PGE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load microcode on this cpu if a valid microcode is available.
|
||||
|
|
|
@ -2,11 +2,15 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <asm/kaiser.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/insn.h>
|
||||
|
||||
#include "perf_event.h"
|
||||
|
||||
static
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
|
||||
|
||||
/* The size of a BTS record in bytes: */
|
||||
#define BTS_RECORD_SIZE 24
|
||||
|
||||
|
@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
|
|||
|
||||
static DEFINE_PER_CPU(void *, insn_buffer);
|
||||
|
||||
/*
 * dsalloc - allocate a zeroed buffer on @node.
 * @size:  number of bytes required
 * @flags: gfp flags; __GFP_ZERO is always added
 * @node:  node to allocate on
 *
 * With CONFIG_PAGE_TABLE_ISOLATION the buffer is made of whole pages
 * (order get_order(@size)) and is additionally registered through
 * kaiser_add_mapping() with __PAGE_KERNEL; if that fails the pages are
 * released again.  Otherwise this is a plain zeroing kmalloc_node().
 *
 * Returns the buffer, or NULL on failure.  Release with dsfree(),
 * passing the same @size.
 */
static void *dsalloc(size_t size, gfp_t flags, int node)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	unsigned int order = get_order(size);
	struct page *page;
	unsigned long addr;

	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
	if (!page)
		return NULL;
	addr = (unsigned long)page_address(page);
	if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
		/* Could not map it: undo the allocation, report failure. */
		__free_pages(page, order);
		addr = 0;
	}
	return (void *)addr;
#else
	return kmalloc_node(size, flags | __GFP_ZERO, node);
#endif
}
|
||||
|
||||
/*
 * dsfree - release a buffer obtained from dsalloc().
 * @buffer: buffer to free; NULL is accepted
 * @size:   the size that was passed to dsalloc()
 *
 * With CONFIG_PAGE_TABLE_ISOLATION the kaiser mapping is removed before
 * the pages are freed; @size determines both the range unmapped and the
 * page order freed.  Otherwise @size is unused and the buffer is kfree()d.
 */
static void dsfree(const void *buffer, size_t size)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (!buffer)
		return;
	kaiser_remove_mapping((unsigned long)buffer, size);
	free_pages((unsigned long)buffer, get_order(size));
#else
	kfree(buffer);
#endif
}
|
||||
|
||||
static int alloc_pebs_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
|
@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
|
|||
if (!x86_pmu.pebs)
|
||||
return 0;
|
||||
|
||||
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
|
||||
buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
|
||||
if (unlikely(!buffer))
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
|
|||
if (x86_pmu.intel_cap.pebs_format < 2) {
|
||||
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
|
||||
if (!ibuffer) {
|
||||
kfree(buffer);
|
||||
dsfree(buffer, x86_pmu.pebs_buffer_size);
|
||||
return -ENOMEM;
|
||||
}
|
||||
per_cpu(insn_buffer, cpu) = ibuffer;
|
||||
|
@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
|
|||
kfree(per_cpu(insn_buffer, cpu));
|
||||
per_cpu(insn_buffer, cpu) = NULL;
|
||||
|
||||
kfree((void *)(unsigned long)ds->pebs_buffer_base);
|
||||
dsfree((void *)(unsigned long)ds->pebs_buffer_base,
|
||||
x86_pmu.pebs_buffer_size);
|
||||
ds->pebs_buffer_base = 0;
|
||||
}
|
||||
|
||||
|
@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
|
|||
if (!x86_pmu.bts)
|
||||
return 0;
|
||||
|
||||
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
|
||||
buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
|
||||
if (unlikely(!buffer)) {
|
||||
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
|
||||
return -ENOMEM;
|
||||
|
@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
|
|||
if (!ds || !x86_pmu.bts)
|
||||
return;
|
||||
|
||||
kfree((void *)(unsigned long)ds->bts_buffer_base);
|
||||
dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
|
||||
ds->bts_buffer_base = 0;
|
||||
}
|
||||
|
||||
static int alloc_ds_buffer(int cpu)
|
||||
{
|
||||
int node = cpu_to_node(cpu);
|
||||
struct debug_store *ds;
|
||||
|
||||
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
|
||||
if (unlikely(!ds))
|
||||
return -ENOMEM;
|
||||
struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
|
||||
|
||||
memset(ds, 0, sizeof(*ds));
|
||||
per_cpu(cpu_hw_events, cpu).ds = ds;
|
||||
|
||||
return 0;
|
||||
|
@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
|
|||
return;
|
||||
|
||||
per_cpu(cpu_hw_events, cpu).ds = NULL;
|
||||
kfree(ds);
|
||||
}
|
||||
|
||||
void release_ds_buffers(void)
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
#include <asm/pgalloc.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/espfix.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
/*
|
||||
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
|
||||
|
@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
|
|||
/* Install the espfix pud into the kernel page directory */
|
||||
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
|
||||
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
|
||||
/*
|
||||
* Just copy the top-level PGD that is mapping the espfix
|
||||
* area to ensure it is mapped into the shadow user page
|
||||
* tables.
|
||||
*/
|
||||
if (kaiser_enabled) {
|
||||
set_pgd(native_get_shadow_pgd(pgd_p),
|
||||
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
|
||||
}
|
||||
|
||||
/* Randomize the locations */
|
||||
init_espfix_random();
|
||||
|
|
|
@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
|
|||
movq $(init_level4_pgt - __START_KERNEL_map), %rax
|
||||
1:
|
||||
|
||||
/* Enable PAE mode and PGE */
|
||||
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
|
||||
/* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
|
||||
movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
|
||||
movq %rcx, %cr4
|
||||
|
||||
/* Setup early boot stage 4 level pagetables. */
|
||||
|
@ -441,6 +441,27 @@ early_idt_ripmsg:
|
|||
.balign PAGE_SIZE; \
|
||||
GLOBAL(name)
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Each PGD needs to be 8k long and 8k aligned. We do not
|
||||
* ever go out to userspace with these, so we do not
|
||||
* strictly *need* the second page, but this allows us to
|
||||
* have a single set_pgd() implementation that does not
|
||||
* need to worry about whether it has 4k or 8k to work
|
||||
* with.
|
||||
*
|
||||
* This ensures PGDs are 8k long:
|
||||
*/
|
||||
#define KAISER_USER_PGD_FILL 512
|
||||
/* This ensures they are 8k-aligned: */
|
||||
#define NEXT_PGD_PAGE(name) \
|
||||
.balign 2 * PAGE_SIZE; \
|
||||
GLOBAL(name)
|
||||
#else
|
||||
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
|
||||
#define KAISER_USER_PGD_FILL 0
|
||||
#endif
|
||||
|
||||
/* Automate the creation of 1 to 1 mapping pmd entries */
|
||||
#define PMDS(START, PERM, COUNT) \
|
||||
i = 0 ; \
|
||||
|
@ -450,9 +471,10 @@ GLOBAL(name)
|
|||
.endr
|
||||
|
||||
__INITDATA
|
||||
NEXT_PAGE(early_level4_pgt)
|
||||
NEXT_PGD_PAGE(early_level4_pgt)
|
||||
.fill 511,8,0
|
||||
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
|
||||
NEXT_PAGE(early_dynamic_pgts)
|
||||
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
|
||||
|
@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
|
|||
.data
|
||||
|
||||
#ifndef CONFIG_XEN
|
||||
NEXT_PAGE(init_level4_pgt)
|
||||
NEXT_PGD_PAGE(init_level4_pgt)
|
||||
.fill 512,8,0
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
#else
|
||||
NEXT_PAGE(init_level4_pgt)
|
||||
NEXT_PGD_PAGE(init_level4_pgt)
|
||||
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
|
||||
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
|
||||
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
|
||||
.org init_level4_pgt + L4_START_KERNEL*8, 0
|
||||
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
|
||||
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
|
||||
NEXT_PAGE(level3_ident_pgt)
|
||||
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
|
||||
|
@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
|
|||
*/
|
||||
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
|
||||
#endif
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
|
||||
NEXT_PAGE(level3_kernel_pgt)
|
||||
.fill L3_START_KERNEL,8,0
|
||||
|
|
|
@ -51,7 +51,7 @@ static struct irqaction irq2 = {
|
|||
.flags = IRQF_NO_THREAD,
|
||||
};
|
||||
|
||||
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
|
||||
DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
|
||||
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
|
||||
};
|
||||
|
||||
|
|
|
@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
|
|||
static struct pvclock_vsyscall_time_info *hv_clock;
|
||||
static struct pvclock_wall_clock wall_clock;
|
||||
|
||||
struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
|
||||
{
|
||||
return hv_clock;
|
||||
}
|
||||
|
||||
/*
|
||||
* The wallclock is the time of day when we booted. Since then, some time may
|
||||
* have elapsed since the hypervisor wrote the data. So we try to account for
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/kaiser.h>
|
||||
|
||||
#include <asm/ldt.h>
|
||||
#include <asm/desc.h>
|
||||
|
@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
|
|||
set_ldt(pc->ldt->entries, pc->ldt->size);
|
||||
}
|
||||
|
||||
static void __free_ldt_struct(struct ldt_struct *ldt)
|
||||
{
|
||||
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
|
||||
vfree(ldt->entries);
|
||||
else
|
||||
free_page((unsigned long)ldt->entries);
|
||||
kfree(ldt);
|
||||
}
|
||||
|
||||
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
|
||||
static struct ldt_struct *alloc_ldt_struct(int size)
|
||||
{
|
||||
struct ldt_struct *new_ldt;
|
||||
int alloc_size;
|
||||
int ret;
|
||||
|
||||
if (size > LDT_ENTRIES)
|
||||
return NULL;
|
||||
|
@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
|
||||
__PAGE_KERNEL);
|
||||
new_ldt->size = size;
|
||||
if (ret) {
|
||||
__free_ldt_struct(new_ldt);
|
||||
return NULL;
|
||||
}
|
||||
return new_ldt;
|
||||
}
|
||||
|
||||
|
@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
|
|||
if (likely(!ldt))
|
||||
return;
|
||||
|
||||
kaiser_remove_mapping((unsigned long)ldt->entries,
|
||||
ldt->size * LDT_ENTRY_SIZE);
|
||||
paravirt_free_ldt(ldt->entries, ldt->size);
|
||||
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
|
||||
vfree(ldt->entries);
|
||||
else
|
||||
free_page((unsigned long)ldt->entries);
|
||||
kfree(ldt);
|
||||
__free_ldt_struct(ldt);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
|
|||
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
|
||||
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
|
||||
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
|
||||
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
|
||||
DEF_NATIVE(pv_cpu_ops, clts, "clts");
|
||||
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
|
||||
|
||||
|
@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
|
|||
PATCH_SITE(pv_mmu_ops, read_cr3);
|
||||
PATCH_SITE(pv_mmu_ops, write_cr3);
|
||||
PATCH_SITE(pv_cpu_ops, clts);
|
||||
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
|
||||
PATCH_SITE(pv_cpu_ops, wbinvd);
|
||||
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
|
||||
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
* section. Since TSS's are completely CPU-local, we want them
|
||||
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
|
||||
*/
|
||||
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
|
||||
__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
|
||||
.x86_tss = {
|
||||
.sp0 = TOP_OF_INIT_STACK,
|
||||
#ifdef CONFIG_X86_32
|
||||
|
|
|
@ -112,6 +112,7 @@
|
|||
#include <asm/alternative.h>
|
||||
#include <asm/prom.h>
|
||||
#include <asm/microcode.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
/*
|
||||
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
|
||||
|
@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
|
|||
*/
|
||||
init_hypervisor_platform();
|
||||
|
||||
/*
|
||||
* This needs to happen right after XENPV is set on xen and
|
||||
* kaiser_enabled is checked below in cleanup_highmap().
|
||||
*/
|
||||
kaiser_check_boottime_disable();
|
||||
|
||||
x86_init.resources.probe_roms();
|
||||
|
||||
/* after parse_early_param, so could debug it */
|
||||
|
|
|
@ -9,10 +9,12 @@
|
|||
#include <linux/atomic.h>
|
||||
|
||||
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
|
||||
__aligned(PAGE_SIZE)
|
||||
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
|
||||
(unsigned long) trace_idt_table };
|
||||
|
||||
/* No need to be aligned, but done to keep all IDTs defined the same way. */
|
||||
__aligned(PAGE_SIZE)
|
||||
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
|
||||
|
||||
static int trace_irq_vector_refcount;
|
||||
|
|
|
@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
|||
return 1;
|
||||
|
||||
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
|
||||
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
|
||||
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
|
||||
!is_long_mode(vcpu))
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
|
|||
|
||||
return 0; /* Buffer overrun */
|
||||
}
|
||||
|
||||
/*
 * Find a non-boolean option (i.e. option=argument). In accordance with
 * standard Linux practice, if this option is repeated, this returns the
 * last instance on the command line.
 *
 * @cmdline: the cmdline string
 * @max_cmdline_size: the maximum size of cmdline
 * @option: option string to look for
 * @buffer: memory buffer to return the option argument
 * @bufsize: size of the supplied memory buffer
 *
 * Whenever @bufsize is non-zero, @buffer is NUL-terminated on return,
 * including when the option is not found (it then holds an empty
 * string), so @buffer must be valid in that case.
 *
 * Returns the length of the argument (regardless of if it was
 * truncated to fit in the buffer), or -1 on not found.
 */
static int
__cmdline_find_option(const char *cmdline, int max_cmdline_size,
		      const char *option, char *buffer, int bufsize)
{
	char c;
	int pos = 0, len = -1;
	const char *opptr = NULL;
	char *bufptr = buffer;
	enum {
		st_wordstart = 0,	/* Start of word/after whitespace */
		st_wordcmp,		/* Comparing this word */
		st_wordskip,		/* Miscompare, skip */
		st_bufcpy,		/* Copying this to buffer */
	} state = st_wordstart;

	if (!cmdline)
		return -1;      /* No command line */

	/*
	 * This 'pos' check ensures we do not overrun
	 * a non-NUL-terminated 'cmdline'
	 */
	while (pos++ < max_cmdline_size) {
		c = *cmdline++;
		if (!c)
			break;

		switch (state) {
		case st_wordstart:
			if (myisspace(c))
				break;

			state = st_wordcmp;
			opptr = option;
			/* fall through */

		case st_wordcmp:
			if ((c == '=') && !*opptr) {
				/*
				 * We matched all the way to the end of the
				 * option we were looking for, prepare to
				 * copy the argument.  Resetting len and
				 * bufptr here is what makes a later
				 * instance override an earlier one.
				 */
				len = 0;
				bufptr = buffer;
				state = st_bufcpy;
				break;
			} else if (c == *opptr++) {
				/*
				 * We are currently matching, so continue
				 * to the next character on the cmdline.
				 */
				break;
			}
			state = st_wordskip;
			/* fall through */

		case st_wordskip:
			if (myisspace(c))
				state = st_wordstart;
			break;

		case st_bufcpy:
			if (myisspace(c)) {
				state = st_wordstart;
			} else {
				/*
				 * Increment len, but don't overrun the
				 * supplied buffer and leave room for the
				 * NUL terminator.
				 */
				if (++len < bufsize)
					*bufptr++ = c;
			}
			break;
		}
	}

	if (bufsize)
		*bufptr = '\0';

	return len;
}
|
||||
|
||||
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
|
||||
int bufsize)
|
||||
{
|
||||
return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
|
||||
buffer, bufsize);
|
||||
}
|
||||
|
|
|
@ -35,3 +35,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
|
|||
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
|
||||
|
||||
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
|
||||
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
|
||||
|
|
|
@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
|
|||
cr4_set_bits_and_update_boot(X86_CR4_PSE);
|
||||
|
||||
/* Enable PGE if available */
|
||||
if (cpu_has_pge) {
|
||||
if (cpu_has_pge && !kaiser_enabled) {
|
||||
cr4_set_bits_and_update_boot(X86_CR4_PGE);
|
||||
__supported_pte_mask |= _PAGE_GLOBAL;
|
||||
} else
|
||||
|
|
|
@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
|
|||
continue;
|
||||
if (vaddr < (unsigned long) _text || vaddr > end)
|
||||
set_pmd(pmd, __pmd(0));
|
||||
else if (kaiser_enabled) {
|
||||
/*
|
||||
* level2_kernel_pgt is initialized with _PAGE_GLOBAL:
|
||||
* clear that now. This is not important, so long as
|
||||
* CR4.PGE remains clear, but it removes an anomaly.
|
||||
* Physical mapping setup below avoids _PAGE_GLOBAL
|
||||
* by use of massage_pgprot() inside pfn_pte() etc.
|
||||
*/
|
||||
set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
456
arch/x86/mm/kaiser.c
Normal file
456
arch/x86/mm/kaiser.c
Normal file
|
@ -0,0 +1,456 @@
|
|||
#include <linux/bug.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/ftrace.h>
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
|
||||
|
||||
#include <asm/kaiser.h>
|
||||
#include <asm/tlbflush.h> /* to verify its kaiser declarations */
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/cmdline.h>
|
||||
|
||||
int kaiser_enabled __read_mostly = 1;
|
||||
EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
|
||||
|
||||
__visible
|
||||
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
|
||||
|
||||
/*
|
||||
* These can have bit 63 set, so we can not just use a plain "or"
|
||||
* instruction to get their value or'd into CR3. It would take
|
||||
* another register. So, we use a memory reference to these instead.
|
||||
*
|
||||
* This is also handy because systems that do not support PCIDs
|
||||
* just end up or'ing a 0 into their CR3, which does no harm.
|
||||
*/
|
||||
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
|
||||
|
||||
/*
|
||||
* At runtime, the only things we map are some things for CPU
|
||||
* hotplug, and stacks for new processes. No two CPUs will ever
|
||||
* be populating the same addresses, so we only need to ensure
|
||||
* that we protect between two CPUs trying to allocate and
|
||||
* populate the same page table page.
|
||||
*
|
||||
* Only take this lock when doing a set_p[4um]d(), but it is not
|
||||
* needed for doing a set_pte(). We assume that only the *owner*
|
||||
* of a given allocation will be doing this for _their_
|
||||
* allocation.
|
||||
*
|
||||
* This ensures that once a system has been running for a while
|
||||
* and there have been stacks all over and these page tables
|
||||
* are fully populated, there will be no further acquisitions of
|
||||
* this lock.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
|
||||
|
||||
/*
|
||||
* Returns -1 on error.
|
||||
*/
|
||||
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
|
||||
pgd = pgd_offset_k(vaddr);
|
||||
/*
|
||||
* We made all the kernel PGDs present in kaiser_init().
|
||||
* We expect them to stay that way.
|
||||
*/
|
||||
BUG_ON(pgd_none(*pgd));
|
||||
/*
|
||||
* PGDs are either 512GB or 128TB on all x86_64
|
||||
* configurations. We don't handle these.
|
||||
*/
|
||||
BUG_ON(pgd_large(*pgd));
|
||||
|
||||
pud = pud_offset(pgd, vaddr);
|
||||
if (pud_none(*pud)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pud_large(*pud))
|
||||
return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
|
||||
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
if (pmd_none(*pmd)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pmd_large(*pmd))
|
||||
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
|
||||
|
||||
pte = pte_offset_kernel(pmd, vaddr);
|
||||
if (pte_none(*pte)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a relatively normal page table walk, except that it
|
||||
* also tries to allocate page tables pages along the way.
|
||||
*
|
||||
* Returns a pointer to a PTE on success, or NULL on failure.
|
||||
*/
|
||||
static pte_t *kaiser_pagetable_walk(unsigned long address)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
pud_t *pud;
|
||||
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
|
||||
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
|
||||
|
||||
if (pgd_none(*pgd)) {
|
||||
WARN_ONCE(1, "All shadow pgds should have been populated");
|
||||
return NULL;
|
||||
}
|
||||
BUILD_BUG_ON(pgd_large(*pgd) != 0);
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
/* The shadow page tables do not use large mappings: */
|
||||
if (pud_large(*pud)) {
|
||||
WARN_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
if (pud_none(*pud)) {
|
||||
unsigned long new_pmd_page = __get_free_page(gfp);
|
||||
if (!new_pmd_page)
|
||||
return NULL;
|
||||
spin_lock(&shadow_table_allocation_lock);
|
||||
if (pud_none(*pud)) {
|
||||
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
|
||||
__inc_zone_page_state(virt_to_page((void *)
|
||||
new_pmd_page), NR_KAISERTABLE);
|
||||
} else
|
||||
free_page(new_pmd_page);
|
||||
spin_unlock(&shadow_table_allocation_lock);
|
||||
}
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
/* The shadow page tables do not use large mappings: */
|
||||
if (pmd_large(*pmd)) {
|
||||
WARN_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
if (pmd_none(*pmd)) {
|
||||
unsigned long new_pte_page = __get_free_page(gfp);
|
||||
if (!new_pte_page)
|
||||
return NULL;
|
||||
spin_lock(&shadow_table_allocation_lock);
|
||||
if (pmd_none(*pmd)) {
|
||||
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
|
||||
__inc_zone_page_state(virt_to_page((void *)
|
||||
new_pte_page), NR_KAISERTABLE);
|
||||
} else
|
||||
free_page(new_pte_page);
|
||||
spin_unlock(&shadow_table_allocation_lock);
|
||||
}
|
||||
|
||||
return pte_offset_kernel(pmd, address);
|
||||
}
|
||||
|
||||
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
|
||||
unsigned long flags)
|
||||
{
|
||||
int ret = 0;
|
||||
pte_t *pte;
|
||||
unsigned long start_addr = (unsigned long )__start_addr;
|
||||
unsigned long address = start_addr & PAGE_MASK;
|
||||
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
|
||||
unsigned long target_address;
|
||||
|
||||
/*
|
||||
* It is convenient for callers to pass in __PAGE_KERNEL etc,
|
||||
* and there is no actual harm from setting _PAGE_GLOBAL, so
|
||||
* long as CR4.PGE is not set. But it is nonetheless troubling
|
||||
* to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
|
||||
* requires that not to be #defined to 0): so mask it off here.
|
||||
*/
|
||||
flags &= ~_PAGE_GLOBAL;
|
||||
|
||||
for (; address < end_addr; address += PAGE_SIZE) {
|
||||
target_address = get_pa_from_mapping(address);
|
||||
if (target_address == -1) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
pte = kaiser_pagetable_walk(address);
|
||||
if (!pte) {
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
if (pte_none(*pte)) {
|
||||
set_pte(pte, __pte(flags | target_address));
|
||||
} else {
|
||||
pte_t tmp;
|
||||
set_pte(&tmp, __pte(flags | target_address));
|
||||
WARN_ON_ONCE(!pte_same(*pte, tmp));
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Same as kaiser_add_user_map(), but the range is given by its start
 * and end pointers instead of a start and a size.
 */
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	return kaiser_add_user_map(start, end - start, flags);
}
|
||||
|
||||
/*
|
||||
* Ensure that the top level of the (shadow) page tables are
|
||||
* entirely populated. This ensures that all processes that get
|
||||
* forked have the same entries. This way, we do not have to
|
||||
* ever go set up new entries in older processes.
|
||||
*
|
||||
* Note: we never free these, so there are no updates to them
|
||||
* after this.
|
||||
*/
|
||||
static void __init kaiser_init_all_pgds(void)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
int i = 0;
|
||||
|
||||
pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
|
||||
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
|
||||
pgd_t new_pgd;
|
||||
pud_t *pud = pud_alloc_one(&init_mm,
|
||||
PAGE_OFFSET + i * PGDIR_SIZE);
|
||||
if (!pud) {
|
||||
WARN_ON(1);
|
||||
break;
|
||||
}
|
||||
inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
|
||||
new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
|
||||
/*
|
||||
* Make sure not to stomp on some other pgd entry.
|
||||
*/
|
||||
if (!pgd_none(pgd[i])) {
|
||||
WARN_ON(1);
|
||||
continue;
|
||||
}
|
||||
set_pgd(pgd + i, new_pgd);
|
||||
}
|
||||
}
|
||||
|
||||
/* Call kaiser_add_user_map() and WARN if it returns non-zero. */
#define kaiser_add_user_map_early(start, size, flags)			\
do {									\
	int __err = kaiser_add_user_map(start, size, flags);		\
									\
	WARN_ON(__err);							\
} while (0)
|
||||
|
||||
/* Call kaiser_add_user_map_ptrs() and WARN if it returns non-zero. */
#define kaiser_add_user_map_ptrs_early(start, end, flags)		\
do {									\
	int __err = kaiser_add_user_map_ptrs(start, end, flags);	\
									\
	WARN_ON(__err);							\
} while (0)
|
||||
|
||||
/*
 * Decide at boot whether KAISER is used, and set or clear
 * X86_FEATURE_KAISER (and kaiser_enabled) accordingly.
 *
 * Order of the checks:
 *  - Xen PV: disabled without a message.
 *  - "pti=on":   enabled, skipping the vendor check below.
 *  - "pti=off":  disabled.
 *  - "pti=auto": skip the "nopti" check and go to the vendor check.
 *  - "nopti":    disabled.
 *  - otherwise (including an unrecognised "pti=" value): disabled on
 *    AMD, enabled everywhere else.
 *
 * The "pti=" value must match exactly: the returned length is compared
 * as well, so that values which merely start with "on"/"off"/"auto"
 * are treated as unrecognised rather than silently accepted.
 * NOTE(review): this relies on cmdline_find_option() returning the
 * full length of the value even when it was truncated to fit 'arg' --
 * confirm against its definition.
 */
void __init kaiser_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret == 2 && !strncmp(arg, "on", 2))
		goto enable;

	if (ret == 3 && !strncmp(arg, "off", 3))
		goto disable;

	if (ret == 4 && !strncmp(arg, "auto", 4))
		goto skip;

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
|
||||
|
||||
/*
|
||||
* If anything in here fails, we will likely die on one of the
|
||||
* first kernel->user transitions and init will die. But, we
|
||||
* will have most of the kernel up by then and should be able to
|
||||
* get a clean warning out of it. If we BUG_ON() here, we run
|
||||
* the risk of being before we have good console output.
|
||||
*/
|
||||
void __init kaiser_init(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
if (!kaiser_enabled)
|
||||
return;
|
||||
|
||||
kaiser_init_all_pgds();
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
void *percpu_vaddr = __per_cpu_user_mapped_start +
|
||||
per_cpu_offset(cpu);
|
||||
unsigned long percpu_sz = __per_cpu_user_mapped_end -
|
||||
__per_cpu_user_mapped_start;
|
||||
kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
|
||||
__PAGE_KERNEL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Map the entry/exit text section, which is needed at
|
||||
* switches from user to and from kernel.
|
||||
*/
|
||||
kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
|
||||
__PAGE_KERNEL_RX);
|
||||
|
||||
#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
|
||||
kaiser_add_user_map_ptrs_early(__irqentry_text_start,
|
||||
__irqentry_text_end,
|
||||
__PAGE_KERNEL_RX);
|
||||
#endif
|
||||
kaiser_add_user_map_early((void *)idt_descr.address,
|
||||
sizeof(gate_desc) * NR_VECTORS,
|
||||
__PAGE_KERNEL_RO);
|
||||
#ifdef CONFIG_TRACING
|
||||
kaiser_add_user_map_early(&trace_idt_descr,
|
||||
sizeof(trace_idt_descr),
|
||||
__PAGE_KERNEL);
|
||||
kaiser_add_user_map_early(&trace_idt_table,
|
||||
sizeof(gate_desc) * NR_VECTORS,
|
||||
__PAGE_KERNEL);
|
||||
#endif
|
||||
kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
|
||||
__PAGE_KERNEL);
|
||||
kaiser_add_user_map_early(&debug_idt_table,
|
||||
sizeof(gate_desc) * NR_VECTORS,
|
||||
__PAGE_KERNEL);
|
||||
|
||||
pr_info("enabled\n");
|
||||
}
|
||||
|
||||
/* Add a mapping to the shadow mapping, and synchronize the mappings */
|
||||
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
|
||||
{
|
||||
if (!kaiser_enabled)
|
||||
return 0;
|
||||
return kaiser_add_user_map((const void *)addr, size, flags);
|
||||
}
|
||||
|
||||
void kaiser_remove_mapping(unsigned long start, unsigned long size)
|
||||
{
|
||||
extern void unmap_pud_range_nofree(pgd_t *pgd,
|
||||
unsigned long start, unsigned long end);
|
||||
unsigned long end = start + size;
|
||||
unsigned long addr, next;
|
||||
pgd_t *pgd;
|
||||
|
||||
if (!kaiser_enabled)
|
||||
return;
|
||||
pgd = native_get_shadow_pgd(pgd_offset_k(start));
|
||||
for (addr = start; addr < end; pgd++, addr = next) {
|
||||
next = pgd_addr_end(addr, end);
|
||||
unmap_pud_range_nofree(pgd, addr, next);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Page table pages are page-aligned. The lower half of the top
|
||||
* level is used for userspace and the top half for the kernel.
|
||||
* This returns true for user pages that need to get copied into
|
||||
* both the user and kernel copies of the page tables, and false
|
||||
* for kernel pages that should only be in the kernel copy.
|
||||
*/
|
||||
static inline bool is_userspace_pgd(pgd_t *pgdp)
|
||||
{
|
||||
return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
|
||||
}
|
||||
|
||||
/*
 * Propagate a pgd update to the shadow pgd where needed, and return the
 * pgd value (possibly with _PAGE_NX added). With kaiser disabled the
 * value is returned untouched.
 */
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;

	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (!is_userspace_pgd(pgdp))
			return pgd;

		/* The shadow copy gets the entry as it was passed in. */
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		/*
		 * Even if the entry is *mapping* userspace, ensure
		 * that userspace can not use it.  This way, if we
		 * get out to userspace running on the kernel CR3,
		 * userspace will crash instead of running.
		 */
		if (__supported_pte_mask & _PAGE_NX)
			pgd.pgd |= _PAGE_NX;
		return pgd;
	}

	/* Anything else that is not a clear leaves the shadow alone. */
	if (pgd.pgd)
		return pgd;

	/*
	 * pgd_clear() cannot check _PAGE_USER, and is even used to
	 * clear corrupted pgd entries: so just rely on cases like
	 * kexec and EFI never to be using pgd_clear().
	 */
	if (WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE))
		return pgd;

	if (is_userspace_pgd(pgdp))
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;

	return pgd;
}
|
||||
|
||||
void kaiser_setup_pcid(void)
|
||||
{
|
||||
unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
|
||||
|
||||
if (this_cpu_has(X86_FEATURE_PCID))
|
||||
user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
|
||||
/*
|
||||
* These variables are used by the entry/exit
|
||||
* code to change PCID and pgd and TLB flushing.
|
||||
*/
|
||||
this_cpu_write(x86_cr3_pcid_user, user_cr3);
|
||||
}
|
||||
|
||||
/*
|
||||
* Make a note that this cpu will need to flush USER tlb on return to user.
|
||||
* If cpu does not have PCID, then the NOFLUSH bit will never have been set.
|
||||
*/
|
||||
void kaiser_flush_tlb_on_return_to_user(void)
|
||||
{
|
||||
if (this_cpu_has(X86_FEATURE_PCID))
|
||||
this_cpu_write(x86_cr3_pcid_user,
|
||||
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
|
||||
}
|
||||
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
|
|
@ -121,11 +121,16 @@ void __init kasan_init(void)
|
|||
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
|
||||
(void *)KASAN_SHADOW_END);
|
||||
|
||||
memset(kasan_zero_page, 0, PAGE_SIZE);
|
||||
|
||||
load_cr3(init_level4_pgt);
|
||||
__flush_tlb_all();
|
||||
init_task.kasan_depth = 0;
|
||||
|
||||
/*
|
||||
* kasan_zero_page has been used as early shadow memory, thus it may
|
||||
* contain some garbage. Now we can clear it, since after the TLB flush
|
||||
* no one should write to it.
|
||||
*/
|
||||
memset(kasan_zero_page, 0, PAGE_SIZE);
|
||||
|
||||
init_task.kasan_depth = 0;
|
||||
pr_info("KernelAddressSanitizer initialized\n");
|
||||
}
|
||||
|
|
|
@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
|
|||
#define CPA_FLUSHTLB 1
|
||||
#define CPA_ARRAY 2
|
||||
#define CPA_PAGES_ARRAY 4
|
||||
#define CPA_FREE_PAGETABLES 8
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
static unsigned long direct_pages_count[PG_LEVEL_NUM];
|
||||
|
@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static bool try_to_free_pte_page(pte_t *pte)
|
||||
static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!(cpa->flags & CPA_FREE_PAGETABLES))
|
||||
return false;
|
||||
|
||||
for (i = 0; i < PTRS_PER_PTE; i++)
|
||||
if (!pte_none(pte[i]))
|
||||
return false;
|
||||
|
@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool try_to_free_pmd_page(pmd_t *pmd)
|
||||
static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!(cpa->flags & CPA_FREE_PAGETABLES))
|
||||
return false;
|
||||
|
||||
for (i = 0; i < PTRS_PER_PMD; i++)
|
||||
if (!pmd_none(pmd[i]))
|
||||
return false;
|
||||
|
@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
|
||||
static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
pte_t *pte = pte_offset_kernel(pmd, start);
|
||||
|
||||
|
@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
|
|||
pte++;
|
||||
}
|
||||
|
||||
if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
|
||||
if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
|
||||
pmd_clear(pmd);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
|
||||
static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
if (unmap_pte_range(pmd, start, end))
|
||||
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
|
||||
if (unmap_pte_range(cpa, pmd, start, end))
|
||||
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
|
||||
pud_clear(pud);
|
||||
}
|
||||
|
||||
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
||||
static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
pmd_t *pmd = pmd_offset(pud, start);
|
||||
|
||||
|
@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
|||
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
|
||||
unsigned long pre_end = min_t(unsigned long, end, next_page);
|
||||
|
||||
__unmap_pmd_range(pud, pmd, start, pre_end);
|
||||
__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
|
||||
|
||||
start = pre_end;
|
||||
pmd++;
|
||||
|
@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
|||
if (pmd_large(*pmd))
|
||||
pmd_clear(pmd);
|
||||
else
|
||||
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
|
||||
__unmap_pmd_range(cpa, pud, pmd,
|
||||
start, start + PMD_SIZE);
|
||||
|
||||
start += PMD_SIZE;
|
||||
pmd++;
|
||||
|
@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
|||
* 4K leftovers?
|
||||
*/
|
||||
if (start < end)
|
||||
return __unmap_pmd_range(pud, pmd, start, end);
|
||||
return __unmap_pmd_range(cpa, pud, pmd, start, end);
|
||||
|
||||
/*
|
||||
* Try again to free the PMD page if haven't succeeded above.
|
||||
*/
|
||||
if (!pud_none(*pud))
|
||||
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
|
||||
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
|
||||
pud_clear(pud);
|
||||
}
|
||||
|
||||
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
||||
static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
pud_t *pud = pud_offset(pgd, start);
|
||||
|
||||
|
@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
|
||||
unsigned long pre_end = min_t(unsigned long, end, next_page);
|
||||
|
||||
unmap_pmd_range(pud, start, pre_end);
|
||||
unmap_pmd_range(cpa, pud, start, pre_end);
|
||||
|
||||
start = pre_end;
|
||||
pud++;
|
||||
|
@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
if (pud_large(*pud))
|
||||
pud_clear(pud);
|
||||
else
|
||||
unmap_pmd_range(pud, start, start + PUD_SIZE);
|
||||
unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
|
||||
|
||||
start += PUD_SIZE;
|
||||
pud++;
|
||||
|
@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
* 2M leftovers?
|
||||
*/
|
||||
if (start < end)
|
||||
unmap_pmd_range(pud, start, end);
|
||||
unmap_pmd_range(cpa, pud, start, end);
|
||||
|
||||
/*
|
||||
* No need to try to free the PUD page because we'll free it in
|
||||
|
@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
*/
|
||||
}
|
||||
|
||||
/*
 * Unmap [start, end) below 'pgd' with CPA_FREE_PAGETABLES set, i.e.
 * allowing the try_to_free_*_page() helpers to release emptied tables.
 */
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct cpa_data freeing = {
		.flags = CPA_FREE_PAGETABLES,
	};

	__unmap_pud_range(&freeing, pgd, start, end);
}
|
||||
|
||||
/*
 * Unmap [start, end) below 'pgd' without CPA_FREE_PAGETABLES, so the
 * try_to_free_*_page() helpers leave the page table pages in place.
 */
void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct cpa_data keeping = {
		.flags = 0,
	};

	__unmap_pud_range(&keeping, pgd, start, end);
}
|
||||
|
||||
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
|
||||
{
|
||||
pgd_t *pgd_entry = root + pgd_index(addr);
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include <asm/fixmap.h>
|
||||
#include <asm/mtrr.h>
|
||||
|
||||
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
|
||||
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
#define PGALLOC_USER_GFP __GFP_HIGHMEM
|
||||
|
@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
|
|||
kmem_cache_free(pgd_cache, pgd);
|
||||
}
|
||||
#else
|
||||
|
||||
/*
|
||||
* Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
|
||||
* both 8k in size and 8k-aligned. That lets us just flip bit 12
|
||||
* in a pointer to swap between the two 4k halves.
|
||||
*/
|
||||
#define PGD_ALLOCATION_ORDER kaiser_enabled
|
||||
|
||||
static inline pgd_t *_pgd_alloc(void)
|
||||
{
|
||||
return (pgd_t *)__get_free_page(PGALLOC_GFP);
|
||||
/* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
|
||||
return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
|
||||
PGD_ALLOCATION_ORDER);
|
||||
}
|
||||
|
||||
static inline void _pgd_free(pgd_t *pgd)
|
||||
{
|
||||
free_page((unsigned long)pgd);
|
||||
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
|
||||
}
|
||||
#endif /* CONFIG_X86_PAE */
|
||||
|
||||
|
|
|
@ -6,13 +6,14 @@
|
|||
#include <linux/interrupt.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/debugfs.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/apic.h>
|
||||
#include <asm/uv/uv.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
/*
|
||||
* TLB flushing, formerly SMP-only
|
||||
|
@ -34,6 +35,36 @@ struct flush_tlb_info {
|
|||
unsigned long flush_end;
|
||||
};
|
||||
|
||||
/*
 * Switch CR3 to the page tables rooted at 'pgdir'. With kaiser enabled,
 * first note that the user TLB has to be flushed on the next return to
 * user mode.
 */
static void load_new_mm_cr3(pgd_t *pgdir)
{
	if (kaiser_enabled) {
		/*
		 * We reuse the same PCID for different tasks, so we must
		 * flush all the entries for the PCID out when we change tasks.
		 * Flush KERN below, flush USER when returning to userspace in
		 * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
		 *
		 * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
		 * do it here, but can only be used if X86_FEATURE_INVPCID is
		 * available - and many machines support pcid without invpcid.
		 *
		 * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
		 * would be needed in the write_cr3() below - if PCIDs enabled.
		 */
		BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
		kaiser_flush_tlb_on_return_to_user();
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(__pa(pgdir));
}
|
||||
|
||||
/*
|
||||
* We cannot call mmdrop() because we are in interrupt context,
|
||||
* instead update mm->cpu_vm_mask.
|
||||
|
@ -45,7 +76,7 @@ void leave_mm(int cpu)
|
|||
BUG();
|
||||
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
|
||||
load_cr3(swapper_pg_dir);
|
||||
load_new_mm_cr3(swapper_pg_dir);
|
||||
/*
|
||||
* This gets called in the idle path where RCU
|
||||
* functions differently. Tracing normally
|
||||
|
@ -105,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
* ordering guarantee we need.
|
||||
*
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
load_new_mm_cr3(next->pgd);
|
||||
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
|
||||
|
@ -152,7 +183,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
* As above, load_cr3() is serializing and orders TLB
|
||||
* fills with respect to the mm_cpumask write.
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
load_new_mm_cr3(next->pgd);
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
load_mm_cr4(next);
|
||||
load_mm_ldt(next);
|
||||
|
|
|
@ -736,7 +736,14 @@
|
|||
*/
|
||||
#define PERCPU_INPUT(cacheline) \
|
||||
VMLINUX_SYMBOL(__per_cpu_start) = .; \
|
||||
VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
|
||||
*(.data..percpu..first) \
|
||||
. = ALIGN(cacheline); \
|
||||
*(.data..percpu..user_mapped) \
|
||||
*(.data..percpu..user_mapped..shared_aligned) \
|
||||
. = ALIGN(PAGE_SIZE); \
|
||||
*(.data..percpu..user_mapped..page_aligned) \
|
||||
VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
|
||||
. = ALIGN(PAGE_SIZE); \
|
||||
*(.data..percpu..page_aligned) \
|
||||
. = ALIGN(cacheline); \
|
||||
|
|
52
include/linux/kaiser.h
Normal file
52
include/linux/kaiser.h
Normal file
|
@ -0,0 +1,52 @@
|
|||
#ifndef _LINUX_KAISER_H
|
||||
#define _LINUX_KAISER_H
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
static inline int kaiser_map_thread_stack(void *stack)
|
||||
{
|
||||
/*
|
||||
* Map that page of kernel stack on which we enter from user context.
|
||||
*/
|
||||
return kaiser_add_mapping((unsigned long)stack +
|
||||
THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
|
||||
}
|
||||
|
||||
static inline void kaiser_unmap_thread_stack(void *stack)
|
||||
{
|
||||
/*
|
||||
* Note: may be called even when kaiser_map_thread_stack() failed.
|
||||
*/
|
||||
kaiser_remove_mapping((unsigned long)stack +
|
||||
THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
|
||||
}
|
||||
#else
|
||||
|
||||
/*
 * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
 * includes architectures that support KAISER, but have it disabled.
 * They do nothing; the ones returning int always report success (0).
 */

/* No shadow page tables to set up. */
static inline void kaiser_init(void)
{
}

/* Nothing to map: always succeeds. */
static inline int kaiser_add_mapping(unsigned long addr,
				     unsigned long size, unsigned long flags)
{
	return 0;
}

/* Nothing to unmap. */
static inline void kaiser_remove_mapping(unsigned long start,
					 unsigned long size)
{
}

/* Nothing to map: always succeeds. */
static inline int kaiser_map_thread_stack(void *stack)
{
	return 0;
}

/* Nothing to unmap. */
static inline void kaiser_unmap_thread_stack(void *stack)
{
}
|
||||
|
||||
#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
|
||||
#endif /* _LINUX_KAISER_H */
|
|
@ -133,8 +133,9 @@ enum zone_stat_item {
|
|||
NR_SLAB_RECLAIMABLE,
|
||||
NR_SLAB_UNRECLAIMABLE,
|
||||
NR_PAGETABLE, /* used for pagetables */
|
||||
NR_KERNEL_STACK,
|
||||
/* Second 128 byte cacheline */
|
||||
NR_KERNEL_STACK,
|
||||
NR_KAISERTABLE,
|
||||
NR_UNSTABLE_NFS, /* NFS unstable pages */
|
||||
NR_BOUNCE,
|
||||
NR_VMSCAN_WRITE,
|
||||
|
|
|
@ -35,6 +35,12 @@
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
#define USER_MAPPED_SECTION "..user_mapped"
|
||||
#else
|
||||
#define USER_MAPPED_SECTION ""
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Base implementations of per-CPU variable declarations and definitions, where
|
||||
* the section in which the variable is to be placed is provided by the
|
||||
|
@ -115,6 +121,12 @@
|
|||
#define DEFINE_PER_CPU(type, name) \
|
||||
DEFINE_PER_CPU_SECTION(type, name, "")
|
||||
|
||||
#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
|
||||
DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
|
||||
|
||||
#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
|
||||
DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
|
||||
|
||||
/*
|
||||
* Declaration/definition used for per-CPU variables that must come first in
|
||||
* the set of variables.
|
||||
|
@ -144,6 +156,14 @@
|
|||
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
|
||||
____cacheline_aligned_in_smp
|
||||
|
||||
#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
|
||||
DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
|
||||
____cacheline_aligned_in_smp
|
||||
|
||||
#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
|
||||
DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
|
||||
____cacheline_aligned_in_smp
|
||||
|
||||
#define DECLARE_PER_CPU_ALIGNED(type, name) \
|
||||
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
|
||||
____cacheline_aligned
|
||||
|
@ -162,11 +182,21 @@
|
|||
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
|
||||
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
|
||||
__aligned(PAGE_SIZE)
|
||||
/*
|
||||
* Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
|
||||
*/
|
||||
#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
|
||||
DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
|
||||
__aligned(PAGE_SIZE)
|
||||
|
||||
#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
|
||||
DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
|
||||
__aligned(PAGE_SIZE)
|
||||
|
||||
/*
|
||||
* Declaration/definition used for per-CPU variables that must be read mostly.
|
||||
*/
|
||||
#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
|
||||
#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
|
||||
DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
|
||||
|
||||
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
|
||||
|
|
|
@ -81,6 +81,7 @@
|
|||
#include <linux/integrity.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/kaiser.h>
|
||||
|
||||
#include <asm/io.h>
|
||||
#include <asm/bugs.h>
|
||||
|
@ -489,6 +490,7 @@ static void __init mm_init(void)
|
|||
pgtable_init();
|
||||
vmalloc_init();
|
||||
ioremap_huge_init();
|
||||
kaiser_init();
|
||||
}
|
||||
|
||||
asmlinkage __visible void __init start_kernel(void)
|
||||
|
|
|
@ -58,6 +58,7 @@
|
|||
#include <linux/tsacct_kern.h>
|
||||
#include <linux/cn_proc.h>
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kaiser.h>
|
||||
#include <linux/delayacct.h>
|
||||
#include <linux/taskstats_kern.h>
|
||||
#include <linux/random.h>
|
||||
|
@ -172,6 +173,7 @@ static inline void free_thread_stack(unsigned long *stack)
|
|||
{
|
||||
struct page *page = virt_to_page(stack);
|
||||
|
||||
kaiser_unmap_thread_stack(stack);
|
||||
__free_kmem_pages(page, THREAD_SIZE_ORDER);
|
||||
}
|
||||
# else
|
||||
|
@ -355,6 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
|
|||
goto free_stack;
|
||||
|
||||
tsk->stack = stack;
|
||||
|
||||
err = kaiser_map_thread_stack(tsk->stack);
|
||||
if (err)
|
||||
goto free_stack;
|
||||
#ifdef CONFIG_SECCOMP
|
||||
/*
|
||||
* We must handle setting up seccomp filters once we're under
|
||||
|
|
|
@ -738,6 +738,7 @@ const char * const vmstat_text[] = {
|
|||
"nr_slab_unreclaimable",
|
||||
"nr_page_table_pages",
|
||||
"nr_kernel_stack",
|
||||
"nr_overhead",
|
||||
"nr_unstable",
|
||||
"nr_bounce",
|
||||
"nr_vmscan_write",
|
||||
|
|
|
@ -40,6 +40,16 @@ config SECURITY
|
|||
|
||||
If you are unsure how to answer this question, answer N.
|
||||
|
||||
config PAGE_TABLE_ISOLATION
|
||||
bool "Remove the kernel mapping in user mode"
|
||||
default y
|
||||
depends on X86_64 && SMP
|
||||
help
|
||||
This enforces a strict kernel and user space isolation, in order
|
||||
to close hardware side channels on kernel address information.
|
||||
|
||||
If you are unsure how to answer this question, answer Y.
|
||||
|
||||
config SECURITYFS
|
||||
bool "Enable the securityfs filesystem"
|
||||
help
|
||||
|
|
Loading…
Add table
Reference in a new issue