This is the 4.4.110 stable release

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAlpPj0wACgkQONu9yGCS
 aT5QOhAAu3PoT3472I7zuWDUG0KQo5r0wdUO+YPW31VIHrxQ2H3sxR44rSHc5jW/
 tTg2TIYNBkNoj4jJDJ9J7f6PSnN1vGFglFW4GzxE3cr2+W7u5M5ex8yCYMcBIY9U
 56hbyqX5lf5KjGWJiQThwYsMBokrBJW2igAFN3cW39nNABhl0W39kiysGA9vbNrV
 +QMA4+ZADA2EeIRcdJmj8uc/cez/7sGAfrSktvATkI+HFamnTs0mrx9cl0eQKvjm
 y5PCxYUCbi4kqD4WM+UCYO3zpUD+r4iMDXwXBwLWkFvbumY4mVTItP+gq5M4Fb1g
 MSauGUGH7BDsT9gspricCMcAmjcTn6hth7/7/ZhlNq3NZv89pOquhpE0JOSAmYbA
 P4WaIRRWwpVrRt+THU7vZpAQWpFSwGmtE7tBfPMt2J7zqY3lMYmO3DoA+gejw3CV
 igbvmV0UY2uYSFnjawUUJ+k+ggYfGyRkUl2DfcllPhZFqE1XEi3NyjI0wi8vtXTd
 UlrU55TqsldCw1bjXH3lWrpoNybWvqUD2a249ZVs/h06Q5NKwNL8mTye+2BBQtCP
 QzAqHYbkBKv/f8M6Kg+HtTzgqUbWxVCeQTWFXHMAPVo4bCwGvVGrXbGJIj15lBuQ
 GWqc3dt69zxpn1tlcRHKH0P3KnkC67dARtY+8F8+D+HAHVY71Bg=
 =Kpwd
 -----END PGP SIGNATURE-----

Merge 4.4.110 into android-4.4

Changes in 4.4.110
	x86/boot: Add early cmdline parsing for options with arguments
	KAISER: Kernel Address Isolation
	kaiser: merged update
	kaiser: do not set _PAGE_NX on pgd_none
	kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
	kaiser: fix build and FIXME in alloc_ldt_struct()
	kaiser: KAISER depends on SMP
	kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
	kaiser: fix perf crashes
	kaiser: ENOMEM if kaiser_pagetable_walk() NULL
	kaiser: tidied up asm/kaiser.h somewhat
	kaiser: tidied up kaiser_add/remove_mapping slightly
	kaiser: kaiser_remove_mapping() move along the pgd
	kaiser: cleanups while trying for gold link
	kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
	kaiser: delete KAISER_REAL_SWITCH option
	kaiser: vmstat show NR_KAISERTABLE as nr_overhead
	kaiser: enhanced by kernel and user PCIDs
	kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
	kaiser: PCID 0 for kernel and 128 for user
	kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
	kaiser: paranoid_entry pass cr3 need to paranoid_exit
	kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
	kaiser: fix unlikely error in alloc_ldt_struct()
	kaiser: add "nokaiser" boot option, using ALTERNATIVE
	x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling
	x86/kaiser: Check boottime cmdline params
	kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush
	kaiser: drop is_atomic arg to kaiser_pagetable_walk()
	kaiser: asm/tlbflush.h handle noPGE at lower level
	kaiser: kaiser_flush_tlb_on_return_to_user() check PCID
	x86/paravirt: Dont patch flush_tlb_single
	x86/kaiser: Reenable PARAVIRT
	kaiser: disabled on Xen PV
	x86/kaiser: Move feature detection up
	KPTI: Rename to PAGE_TABLE_ISOLATION
	KPTI: Report when enabled
	x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader
	x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap
	x86/kasan: Clear kasan_zero_page after TLB flush
	kaiser: Set _PAGE_NX only if supported
	Linux 4.4.110

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Committed by Greg Kroah-Hartman on 2018-01-06 10:53:18 +01:00 as commit 5cc8c2ec61.
51 changed files with 1470 additions and 148 deletions.


@ -2529,6 +2529,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nojitter [IA-64] Disables jitter checking for ITC timers.
nopti [X86-64] Disable KAISER isolation of kernel from user.
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
@ -3060,6 +3062,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
pti= [X86_64]
Control KAISER user/kernel address space isolation:
on - enable
off - disable
auto - default setting
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.


@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 4
SUBLEVEL = 109
SUBLEVEL = 110
EXTRAVERSION =
NAME = Blurry Fish Butt


@ -9,6 +9,7 @@
*/
#undef CONFIG_PARAVIRT
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_PAGE_TABLE_ISOLATION
#undef CONFIG_KASAN
#include <linux/linkage.h>


@ -35,6 +35,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/kaiser.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
/*
* 64-bit SYSRET restores rip from rcx,
@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@ -509,6 +535,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
SWITCH_KERNEL_CR3
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@ -568,6 +595,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@ -625,6 +653,7 @@ native_irq_return_ldt:
pushq %rax
pushq %rdi
SWAPGS
SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* RAX */
movq (2*8)(%rsp), %rax /* RIP */
@ -640,6 +669,7 @@ native_irq_return_ldt:
andl $0xffff0000, %eax
popq %rdi
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
popq %rax
@ -1001,7 +1031,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*
* Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
* ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
* ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
* ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
*/
ENTRY(paranoid_entry)
cld
@ -1014,7 +1048,26 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1: ret
1:
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* We might have come in between a swapgs and a SWITCH_KERNEL_CR3
* on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
* Do a conditional SWITCH_KERNEL_CR3: this could safely be done
* unconditionally, but we need to find out whether the reverse
* should be done on return (conveyed to paranoid_exit in %ebx).
*/
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
testl $KAISER_SHADOW_PGD_OFFSET, %eax
jz 2f
orl $2, %ebx
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
movq %rax, %cr3
2:
#endif
ret
END(paranoid_entry)
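
The %ebx protocol documented above packs two independent exit decisions into one register. A minimal C illustration of the encoding, matching the four cases listed in the comment (the macro names are mine, not from the patch):

/* Illustration only -- not part of the patch.
 * Meaning of %ebx as handed from paranoid_entry to paranoid_exit:
 *   bit 0 set -> do NOT swapgs on exit (gs base was already the kernel's)
 *   bit 1 set -> DO run SWITCH_USER_CR3 on exit (entered on the user CR3)
 */
#define PARANOID_NO_SWAPGS      0x1
#define PARANOID_NEED_USER_CR3  0x2

/* ebx = 0                                         : swapgs only
 * ebx = PARANOID_NO_SWAPGS                        : neither
 * ebx = PARANOID_NEED_USER_CR3                    : swapgs + SWITCH_USER_CR3
 * ebx = PARANOID_NEED_USER_CR3 | PARANOID_NO_SWAPGS : SWITCH_USER_CR3 only
 */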
/*
@ -1027,19 +1080,26 @@ END(paranoid_entry)
* be complicated. Fortunately, there's no good reason
* to try to handle preemption here.
*
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
* On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
* ebx=1: needs neither swapgs nor SWITCH_USER_CR3
* ebx=2: needs both swapgs and SWITCH_USER_CR3
* ebx=3: needs SWITCH_USER_CR3 but not swapgs
*/
ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
testl $2, %ebx /* SWITCH_USER_CR3 needed? */
jz paranoid_exit_no_switch
SWITCH_USER_CR3
paranoid_exit_no_switch:
#endif
testl $1, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
SWAPGS_UNSAFE_STACK
paranoid_exit_no_swapgs:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@ -1054,6 +1114,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
/*
* error_entry() always returns with a kernel gsbase and
* CR3. We must also have a kernel CR3/gsbase before
* calling TRACE_IRQS_*. Just unconditionally switch to
* the kernel CR3 here.
*/
SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@ -1216,6 +1283,10 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
/*
* percpu variables are mapped with user CR3, so no need
* to switch CR3 here.
*/
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@ -1249,12 +1320,34 @@ ENTRY(nmi)
movq %rsp, %rdi
movq $-1, %rsi
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
2:
#endif
call do_nmi
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Unconditionally restore CR3. I know we return to
* kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3?
*/
ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
#endif
/*
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
* work, because we don't want to enable interrupts. Do not
* switch to user CR3: we might be going back to kernel code
* that had a user CR3 set.
*/
SWAPGS
jmp restore_c_regs_and_iret
@ -1451,22 +1544,55 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK
/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
* Use the same approach as paranoid_entry to handle SWAPGS, but
* without CR3 handling since we do that differently in NMIs. No
* need to use paranoid_exit as we should not be calling schedule
* in NMI context. Even with normal interrupts enabled. An NMI
* should not be setting NEED_RESCHED or anything that normal
* interrupts and exceptions might do.
*/
call paranoid_entry
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
cld
SAVE_C_REGS
SAVE_EXTRA_REGS
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1:
movq %rsp, %rdi
movq $-1, %rsi
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
2:
#endif
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Unconditionally restore CR3. We might be returning to
* kernel code that needs user CR3, like just before
* a sysret.
*/
ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
#endif
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
/* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS


@ -13,6 +13,8 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
/* Opportunistic SYSRET */
sysret32_from_system_call:
TRACE_IRQS_ON /* User mode traces as IRQs on. */
SWITCH_USER_CR3
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)


@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
}
#endif
#ifdef CONFIG_PARAVIRT_CLOCK
extern u8 pvclock_page
__attribute__((visibility("hidden")));
#endif
#ifndef BUILD_VDSO32
#include <linux/kernel.h>
@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
#ifdef CONFIG_PARAVIRT_CLOCK
static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
{
const struct pvclock_vsyscall_time_info *pvti_base;
int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
pvti_base = (struct pvclock_vsyscall_time_info *)
__fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
return &pvti_base[offset];
return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
}
static notrace cycle_t vread_pvclock(int *mode)
{
const struct pvclock_vsyscall_time_info *pvti;
const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
cycle_t ret;
u64 last;
u32 version;
u8 flags;
unsigned cpu, cpu1;
u64 tsc, pvti_tsc;
u64 last, delta, pvti_system_time;
u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
/*
* Note: hypervisor must guarantee that:
* 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
* 2. that per-CPU pvclock time info is updated if the
* underlying CPU changes.
* 3. that version is increased whenever underlying CPU
* changes.
* Note: The kernel and hypervisor must guarantee that cpu ID
* number maps 1:1 to per-CPU pvclock time info.
*
* Because the hypervisor is entirely unaware of guest userspace
* preemption, it cannot guarantee that per-CPU pvclock time
* info is updated if the underlying CPU changes or that that
* version is increased whenever underlying CPU changes.
*
* On KVM, we are guaranteed that pvti updates for any vCPU are
* atomic as seen by *all* vCPUs. This is an even stronger
* guarantee than we get with a normal seqlock.
*
* On Xen, we don't appear to have that guarantee, but Xen still
* supplies a valid seqlock using the version field.
* We only do pvclock vdso timing at all if
* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
* mean that all vCPUs have matching pvti and that the TSC is
* synced, so we can just look at vCPU 0's pvti.
*/
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
/* TODO: We can put vcpu id into higher bits of pvti.version.
* This will save a couple of cycles by getting rid of
* __getcpu() calls (Gleb).
*/
pvti = get_pvti(cpu);
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
/*
* Test we're still on the cpu as well as the version.
* We could have been migrated just after the first
* vgetcpu but before fetching the version, so we
* wouldn't notice a version change.
*/
cpu1 = __getcpu() & VGETCPU_CPU_MASK;
} while (unlikely(cpu != cpu1 ||
(pvti->pvti.version & 1) ||
pvti->pvti.version != version));
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
*mode = VCLOCK_NONE;
return 0;
}
do {
version = pvti->version;
/* This is also a read barrier, so we'll read version first. */
tsc = rdtsc_ordered();
pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
pvti_tsc_shift = pvti->tsc_shift;
pvti_system_time = pvti->system_time;
pvti_tsc = pvti->tsc_timestamp;
/* Make sure that the version double-check is last. */
smp_rmb();
} while (unlikely((version & 1) || version != pvti->version));
delta = tsc - pvti_tsc;
ret = pvti_system_time +
pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
pvti_tsc_shift);
/* refer to tsc.c read_tsc() comment for rationale */
last = gtod->cycle_last;
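
The loop above snapshots tsc_timestamp, system_time, tsc_to_system_mul and tsc_shift under the version check and then converts the TSC delta to nanoseconds with pvclock_scale_delta(). A standalone sketch of that conversion, assuming the conventional pvclock fixed-point form (pre-shift, then a 32.32 fixed-point multiply); all numbers below are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the pvclock scaling step: shift the TSC delta by tsc_shift,
 * then multiply by the 32.32 fixed-point factor and keep the high part.
 */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
        /* Hypothetical snapshot: 3 GHz TSC, so one cycle is 1/3 ns
         * (mul_frac ~= (2/3) << 32 combined with tsc_shift = -1).
         */
        uint64_t pvti_system_time = 1000000000ULL;  /* ns at tsc_timestamp */
        uint64_t pvti_tsc         = 5000000ULL;
        uint64_t tsc_now          = 5003000ULL;     /* 3000 cycles later   */
        uint32_t mul_frac         = 2863311531U;
        int8_t   tsc_shift        = -1;

        uint64_t ns = pvti_system_time +
                      scale_delta(tsc_now - pvti_tsc, mul_frac, tsc_shift);
        printf("%llu ns\n", (unsigned long long)ns);  /* 1000001000 ns */
        return 0;
}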


@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
vvar_start = . - 2 * PAGE_SIZE;
vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
pvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;


@ -73,6 +73,7 @@ enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
sym_pvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@ -80,6 +81,7 @@ enum {
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
sym_pvclock_page,
};
struct vdso_sym {
@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},


@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
.name = "[vvar]",
.pages = no_pages,
};
struct pvclock_vsyscall_time_info *pvti;
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
#endif
pvti = pvclock_pvti_cpu0_va();
if (pvti && image->sym_pvclock_page) {
ret = remap_pfn_range(vma,
text_start + image->sym_pvclock_page,
__pa(pvti) >> PAGE_SHIFT,
PAGE_SIZE,
PAGE_READONLY);
if (ret)
goto up_fail;
}
up_fail:
if (ret)
current->mm->context.vdso = NULL;


@ -2,5 +2,7 @@
#define _ASM_X86_CMDLINE_H
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
int cmdline_find_option(const char *cmdline_ptr, const char *option,
char *buffer, int bufsize);
#endif /* _ASM_X86_CMDLINE_H */


@ -187,6 +187,7 @@
#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
@ -199,6 +200,9 @@
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */


@ -43,7 +43,7 @@ struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{


@ -187,7 +187,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */


@ -0,0 +1,141 @@
#ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H
#include <uapi/asm/processor-flags.h> /* For PCID constants */
/*
* This file includes the definitions for the KAISER feature.
* KAISER is a counter measure against x86_64 side channel attacks on
* the kernel virtual memory. It has a shadow pgd for every process: the
* shadow pgd has a minimalistic kernel-set mapped, but includes the whole
* user memory. Within a kernel context switch, or when an interrupt is handled,
* the pgd is switched to the normal one. When the system switches to user mode,
* the shadow pgd is enabled. By this, the virtual memory caches are freed,
* and the user may not attack the whole kernel memory.
*
* A minimalistic kernel mapping holds the parts needed to be mapped in user
* mode, such as the entry/exit functions of the user space, or the stacks.
*/
#define KAISER_SHADOW_PGD_OFFSET 0x1000
#ifdef __ASSEMBLY__
#ifdef CONFIG_PAGE_TABLE_ISOLATION
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
movq \reg, %cr3
.endm
.macro _SWITCH_TO_USER_CR3 reg regb
/*
* regb must be the low byte portion of reg: because we have arranged
* for the low byte of the user PCID to serve as the high byte of NOFLUSH
* (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
* not enabled): so that the one register can update both memory and cr3.
*/
movq %cr3, \reg
orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
js 9f
/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
9:
movq \reg, %cr3
.endm
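
The comment above is dense, so here is a C sketch of the same trick; it is not part of the patch. It assumes the values this series keeps in x86_cr3_pcid_user (user ASID 0x80 plus KAISER_SHADOW_PGD_OFFSET, NOFLUSH at bit 63), that kaiser_flush_tlb_on_return_to_user() resets the word to its "flush" state, and that reads of %cr3 return bit 63 as zero:

#include <stdint.h>

#define KAISER_SHADOW_PGD_OFFSET  0x1000ULL
#define X86_CR3_PCID_ASID_USER    0x80ULL      /* 0x00 when PCID is off */
#define X86_CR3_PCID_NOFLUSH      (1ULL << 63)

/* Per-cpu word OR'd into CR3 on every return to user; assumed to be reset
 * to this "flush next time" value whenever a user TLB flush is wanted.
 */
static uint64_t x86_cr3_pcid_user =
        KAISER_SHADOW_PGD_OFFSET | X86_CR3_PCID_ASID_USER;

static uint64_t switch_to_user_cr3(uint64_t kernel_cr3)
{
        uint64_t cr3 = kernel_cr3 | x86_cr3_pcid_user;

        if (!(cr3 & X86_CR3_PCID_NOFLUSH)) {
                /*
                 * This return flushes the user PCID; arrange for the next
                 * return not to.  The asm does it with one byte store: the
                 * low byte of cr3 (0x80, the user ASID) written into byte 7
                 * of the little-endian word becomes bit 63, i.e. NOFLUSH.
                 * With PCID off the low byte is 0x00, so this is a no-op.
                 */
                ((uint8_t *)&x86_cr3_pcid_user)[7] = (uint8_t)cr3;
        }
        return cr3;     /* the value the macro writes to %cr3 */
}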
.macro SWITCH_KERNEL_CR3
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
popq %rax
8:
.endm
.macro SWITCH_USER_CR3
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_USER_CR3 %rax %al
popq %rax
8:
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
ALTERNATIVE "jmp 8f", \
__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
8:
.endm
#else /* CONFIG_PAGE_TABLE_ISOLATION */
.macro SWITCH_KERNEL_CR3
.endm
.macro SWITCH_USER_CR3
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
#else /* __ASSEMBLY__ */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Upon kernel/user mode switch, it may happen that the address
* space has to be switched before the registers have been
* stored. To change the address space, another register is
* needed. A register therefore has to be stored/restored.
*/
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
extern int kaiser_enabled;
extern void __init kaiser_check_boottime_disable(void);
#else
#define kaiser_enabled 0
static inline void __init kaiser_check_boottime_disable(void) {}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
/*
* Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
* so as to build with tests on kaiser_enabled instead of #ifdefs.
*/
/**
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
* @size: the size of the range
* @flags: The mapping flags of the pages
*
* The mapping is done on a global scope, so no bigger
* synchronization has to be done. The pages have to be
* manually unmapped again when they are not needed any longer.
*/
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
/**
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range
* @size: the size of the range
*/
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
/**
* kaiser_init - Initialize the shadow mapping
*
* Most parts of the shadow mapping can be mapped upon boot
* time. Only per-process things like the thread stacks
* or a new LDT have to be mapped at runtime. These boot-
* time mappings are permanent and never unmapped.
*/
extern void kaiser_init(void);
#endif /* __ASSEMBLY */
#endif /* _ASM_X86_KAISER_H */


@ -18,6 +18,12 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern int kaiser_enabled;
#else
#define kaiser_enabled 0
#endif
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);
@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
pgdval_t ignore_flags = _PAGE_USER;
/*
* We set NX on KAISER pgds that map userspace memory so
* that userspace can not meaningfully use the kernel
* page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd().
*/
if (kaiser_enabled)
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (kaiser_enabled) {
/* Clone the shadow pgd part as well */
memcpy(native_get_shadow_pgd(dst),
native_get_shadow_pgd(src),
count * sizeof(pgd_t));
}
#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)


@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
#ifdef CONFIG_DEBUG_VM
/* linux/mmdebug.h may not have been included at this point */
BUG_ON(!kaiser_enabled);
#endif
return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
*pgdp = pgd;
*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
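
Since native_get_shadow_pgd() above just sets the PAGE_SIZE bit, a tiny standalone illustration of the address math may help: the pgd becomes an 8kB, 8kB-aligned allocation (see KAISER_USER_PGD_FILL in head_64.S further down), with the kernel copy in the first 4kB page and the shadow (user) copy in the second, so OR-ing in PAGE_SIZE lands on the same slot in the shadow half. The address below is made up:

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
        /* Hypothetical 8kB-aligned pgd allocation (illustration only). */
        unsigned long long pgd_base     = 0xffff880012340000ULL; /* kernel half  */
        unsigned long long kernel_entry = pgd_base + 273 * 8;    /* pgd slot 273 */
        unsigned long long shadow_entry = kernel_entry | PAGE_SIZE; /* same slot, user half */

        printf("kernel entry: %#llx\nshadow entry: %#llx\n",
               kernel_entry, shadow_entry);
        return 0;
}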
static inline void native_pgd_clear(pgd_t *pgd)


@ -89,7 +89,7 @@
#define _PAGE_NX (_AT(pteval_t, 0))
#endif
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@ -102,6 +102,33 @@
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/* The ASID is the lower 12 bits of CR3 */
#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
/* Mask for all the PCID-related bits in CR3: */
#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
#else
#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
/*
* PCIDs are unsupported on 32-bit and none of these bits can be
* set in CR3:
*/
#define X86_CR3_PCID_KERN_FLUSH (0)
#define X86_CR3_PCID_USER_FLUSH (0)
#define X86_CR3_PCID_KERN_NOFLUSH (0)
#define X86_CR3_PCID_USER_NOFLUSH (0)
#endif
/*
* The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries.


@ -305,7 +305,7 @@ struct tss_struct {
} ____cacheline_aligned;
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);


@ -4,6 +4,15 @@
#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
#ifdef CONFIG_PARAVIRT_CLOCK
extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
#else
static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
return NULL;
}
#endif
/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);


@ -131,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
cr4_set_bits(mask);
}
/*
* Declare a couple of kaiser interfaces here for convenience,
* to avoid the need for asm/kaiser.h in unexpected places.
*/
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern int kaiser_enabled;
extern void kaiser_setup_pcid(void);
extern void kaiser_flush_tlb_on_return_to_user(void);
#else
#define kaiser_enabled 0
static inline void kaiser_setup_pcid(void)
{
}
static inline void kaiser_flush_tlb_on_return_to_user(void)
{
}
#endif
static inline void __native_flush_tlb(void)
{
/*
@ -139,6 +157,8 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
if (kaiser_enabled)
kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
}
@ -148,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
if (cr4 & X86_CR4_PGE) {
/* clear PGE and flush TLB of all entries */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* restore PGE as it was before */
native_write_cr4(cr4);
} else {
/* do it with cr3, letting kaiser flush user PCID */
__native_flush_tlb();
}
}
static inline void __native_flush_tlb_global(void)
{
unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@ -173,24 +200,45 @@ static inline void __native_flush_tlb_global(void)
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
__native_flush_tlb_global_irq_disabled();
raw_local_irq_restore(flags);
}
static inline void __native_flush_tlb_single(unsigned long addr)
{
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
/*
* SIMICS #GP's if you run INVPCID with type 2/3
* and X86_CR4_PCIDE clear. Shame!
*
* The ASIDs used below are hard-coded. But, we must not
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call
* invlpg in the case we are called early.
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
if (kaiser_enabled)
kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return;
}
/* Flush the address out of both PCIDs. */
/*
* An optimization here might be to determine addresses
* that are only kernel-mapped and only flush the kernel
* ASID. But, userspace flushes are probably much more
* important performance-wise.
*
* Make sure to do only a single invpcid when KAISER is
* disabled and we have only a single ASID.
*/
if (kaiser_enabled)
invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
}
static inline void __flush_tlb_all(void)
{
if (cpu_has_pge)
__flush_tlb_global();
else
__flush_tlb();
__flush_tlb_global();
/*
* Note: if we somehow had PCID but not PGE, then this wouldn't work --
* we'd end up flushing kernel translations for the current ASID but


@ -22,6 +22,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_hpet_page;
long sym_pvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;


@ -77,7 +77,8 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4


@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
cr4_set_bits(X86_CR4_PCIDE);
/*
* INVPCID has two "groups" of types:
* 1/2: Invalidate an individual address
* 3/4: Invalidate all contexts
*
* 1/2 take a PCID, but 3/4 do not. So, 3/4
* ignore the PCID argument in the descriptor.
* But, we have to be careful not to call 1/2
* with an actual non-zero PCID in them before
* we do the above cr4_set_bits().
*/
if (cpu_has(c, X86_FEATURE_INVPCID))
set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_PCID);
}
}
kaiser_setup_pcid();
}
/*
@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
@ -1392,6 +1406,14 @@ void cpu_init(void)
* try to read it.
*/
cr4_init_shadow();
if (!kaiser_enabled) {
/*
* secondary_startup_64() deferred setting PGE in cr4:
* probe_page_size_mask() sets it on the boot cpu,
* but it needs to be set on each secondary cpu.
*/
cr4_set_bits(X86_CR4_PGE);
}
/*
* Load microcode on this cpu if a valid microcode is available.


@ -2,11 +2,15 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <asm/kaiser.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "perf_event.h"
static
DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
static void *dsalloc(size_t size, gfp_t flags, int node)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
unsigned int order = get_order(size);
struct page *page;
unsigned long addr;
page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
if (!page)
return NULL;
addr = (unsigned long)page_address(page);
if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
__free_pages(page, order);
addr = 0;
}
return (void *)addr;
#else
return kmalloc_node(size, flags | __GFP_ZERO, node);
#endif
}
static void dsfree(const void *buffer, size_t size)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (!buffer)
return;
kaiser_remove_mapping((unsigned long)buffer, size);
free_pages((unsigned long)buffer, get_order(size));
#else
kfree(buffer);
#endif
}
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
kfree(buffer);
dsfree(buffer, x86_pmu.pebs_buffer_size);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
kfree((void *)(unsigned long)ds->pebs_buffer_base);
dsfree((void *)(unsigned long)ds->pebs_buffer_base,
x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
}
@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
if (!ds || !x86_pmu.bts)
return;
kfree((void *)(unsigned long)ds->bts_buffer_base);
dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
}
static int alloc_ds_buffer(int cpu)
{
int node = cpu_to_node(cpu);
struct debug_store *ds;
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
if (unlikely(!ds))
return -ENOMEM;
struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
kfree(ds);
}
void release_ds_buffers(void)


@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
/*
* Just copy the top-level PGD that is mapping the espfix
* area to ensure it is mapped into the shadow user page
* tables.
*/
if (kaiser_enabled) {
set_pgd(native_get_shadow_pgd(pgd_p),
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
}
/* Randomize the locations */
init_espfix_random();


@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
movq $(init_level4_pgt - __START_KERNEL_map), %rax
1:
/* Enable PAE mode and PGE */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
/* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
@ -441,6 +441,27 @@ early_idt_ripmsg:
.balign PAGE_SIZE; \
GLOBAL(name)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define KAISER_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define KAISER_USER_PGD_FILL 0
#endif
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@ -450,9 +471,10 @@ GLOBAL(name)
.endr
__INITDATA
NEXT_PAGE(early_level4_pgt)
NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
NEXT_PAGE(init_level4_pgt)
NEXT_PGD_PAGE(init_level4_pgt)
.fill 512,8,0
.fill KAISER_USER_PGD_FILL,8,0
#else
NEXT_PAGE(init_level4_pgt)
NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0


@ -51,7 +51,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};


@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;
struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
return hv_clock;
}
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for


@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/kaiser.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size);
}
static void __free_ldt_struct(struct ldt_struct *ldt)
{
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
int ret;
if (size > LDT_ENTRIES)
return NULL;
@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL;
}
ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
__PAGE_KERNEL);
new_ldt->size = size;
if (ret) {
__free_ldt_struct(new_ldt);
return NULL;
}
return new_ldt;
}
@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
kaiser_remove_mapping((unsigned long)ldt->entries,
ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
__free_ldt_struct(ldt);
}
/*


@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):


@ -39,7 +39,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32


@ -112,6 +112,7 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
#include <asm/kaiser.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
/*
* This needs to happen right after XENPV is set on xen and
* kaiser_enabled is checked below in cleanup_highmap().
*/
kaiser_check_boottime_disable();
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */


@ -9,10 +9,12 @@
#include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) trace_idt_table };
/* No need to be aligned, but done to keep all IDTs defined the same way. */
__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
static int trace_irq_vector_refcount;


@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
!is_long_mode(vcpu))
return 1;
}


@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
return 0; /* Buffer overrun */
}
/*
* Find a non-boolean option (i.e. option=argument). In accordance with
* standard Linux practice, if this option is repeated, this returns the
* last instance on the command line.
*
* @cmdline: the cmdline string
* @max_cmdline_size: the maximum size of cmdline
* @option: option string to look for
* @buffer: memory buffer to return the option argument
* @bufsize: size of the supplied memory buffer
*
* Returns the length of the argument (regardless of if it was
* truncated to fit in the buffer), or -1 on not found.
*/
static int
__cmdline_find_option(const char *cmdline, int max_cmdline_size,
const char *option, char *buffer, int bufsize)
{
char c;
int pos = 0, len = -1;
const char *opptr = NULL;
char *bufptr = buffer;
enum {
st_wordstart = 0, /* Start of word/after whitespace */
st_wordcmp, /* Comparing this word */
st_wordskip, /* Miscompare, skip */
st_bufcpy, /* Copying this to buffer */
} state = st_wordstart;
if (!cmdline)
return -1; /* No command line */
/*
* This 'pos' check ensures we do not overrun
* a non-NULL-terminated 'cmdline'
*/
while (pos++ < max_cmdline_size) {
c = *(char *)cmdline++;
if (!c)
break;
switch (state) {
case st_wordstart:
if (myisspace(c))
break;
state = st_wordcmp;
opptr = option;
/* fall through */
case st_wordcmp:
if ((c == '=') && !*opptr) {
/*
* We matched all the way to the end of the
* option we were looking for, prepare to
* copy the argument.
*/
len = 0;
bufptr = buffer;
state = st_bufcpy;
break;
} else if (c == *opptr++) {
/*
* We are currently matching, so continue
* to the next character on the cmdline.
*/
break;
}
state = st_wordskip;
/* fall through */
case st_wordskip:
if (myisspace(c))
state = st_wordstart;
break;
case st_bufcpy:
if (myisspace(c)) {
state = st_wordstart;
} else {
/*
* Increment len, but don't overrun the
* supplied buffer and leave room for the
* NULL terminator.
*/
if (++len < bufsize)
*bufptr++ = c;
}
break;
}
}
if (bufsize)
*bufptr = '\0';
return len;
}
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
int bufsize)
{
return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
buffer, bufsize);
}
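
For context, this is how the new helper gets used by this same series (kaiser_check_boottime_disable() in arch/x86/mm/kaiser.c, further down in this diff); note that the return value is the full argument length even when the copy was truncated to fit the buffer. A usage sketch, assuming the kernel definitions above:

/* Usage sketch only -- mirrors the pti= parsing added later in this diff. */
char arg[5];    /* room for the longest expected value, "auto", plus NUL */
int ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));

if (ret > 0) {                  /* ret = argument length, or -1 if absent */
        if (!strncmp(arg, "on", 2))
                ;       /* force enable */
        else if (!strncmp(arg, "off", 3))
                ;       /* disable */
}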


@ -35,3 +35,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o


@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
if (cpu_has_pge && !kaiser_enabled) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else


@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
else if (kaiser_enabled) {
/*
* level2_kernel_pgt is initialized with _PAGE_GLOBAL:
* clear that now. This is not important, so long as
* CR4.PGE remains clear, but it removes an anomaly.
* Physical mapping setup below avoids _PAGE_GLOBAL
* by use of massage_pgprot() inside pfn_pte() etc.
*/
set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
}
}
}

arch/x86/mm/kaiser.c (new file, 456 lines)

@ -0,0 +1,456 @@
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
#include <asm/kaiser.h>
#include <asm/tlbflush.h> /* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* These can have bit 63 set, so we can not just use a plain "or"
* instruction to get their value or'd into CR3. It would take
* another register. So, we use a memory reference to these instead.
*
* This is also handy because systems that do not support PCIDs
* just end up or'ing a 0 into their CR3, which does no harm.
*/
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
* be populating the same addresses, so we only need to ensure
* that we protect between two CPUs trying to allocate and
* populate the same page table page.
*
* Only take this lock when doing a set_p[4um]d(), but it is not
* needed for doing a set_pte(). We assume that only the *owner*
* of a given allocation will be doing this for _their_
* allocation.
*
* This ensures that once a system has been running for a while
* and there have been stacks all over and these page tables
* are fully populated, there will be no further acquisitions of
* this lock.
*/
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/*
* Returns -1 on error.
*/
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = pgd_offset_k(vaddr);
/*
* We made all the kernel PGDs present in kaiser_init().
* We expect them to stay that way.
*/
BUG_ON(pgd_none(*pgd));
/*
* PGDs are either 512GB or 128TB on all x86_64
* configurations. We don't handle these.
*/
BUG_ON(pgd_large(*pgd));
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
WARN_ON_ONCE(1);
return -1;
}
if (pud_large(*pud))
return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
WARN_ON_ONCE(1);
return -1;
}
if (pmd_large(*pmd))
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
pte = pte_offset_kernel(pmd, vaddr);
if (pte_none(*pte)) {
WARN_ON_ONCE(1);
return -1;
}
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
* This is a relatively normal page table walk, except that it
* also tries to allocate page tables pages along the way.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
static pte_t *kaiser_pagetable_walk(unsigned long address)
{
pmd_t *pmd;
pud_t *pud;
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
if (pgd_none(*pgd)) {
WARN_ONCE(1, "All shadow pgds should have been populated");
return NULL;
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
pud = pud_offset(pgd, address);
/* The shadow page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pud_none(*pud)) {
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
__inc_zone_page_state(virt_to_page((void *)
new_pmd_page), NR_KAISERTABLE);
} else
free_page(new_pmd_page);
spin_unlock(&shadow_table_allocation_lock);
}
pmd = pmd_offset(pud, address);
/* The shadow page tables do not use large mappings: */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pmd_none(*pmd)) {
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
__inc_zone_page_state(virt_to_page((void *)
new_pte_page), NR_KAISERTABLE);
} else
free_page(new_pte_page);
spin_unlock(&shadow_table_allocation_lock);
}
return pte_offset_kernel(pmd, address);
}
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags)
{
int ret = 0;
pte_t *pte;
unsigned long start_addr = (unsigned long )__start_addr;
unsigned long address = start_addr & PAGE_MASK;
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
unsigned long target_address;
/*
* It is convenient for callers to pass in __PAGE_KERNEL etc,
* and there is no actual harm from setting _PAGE_GLOBAL, so
* long as CR4.PGE is not set. But it is nonetheless troubling
* to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
* requires that not to be #defined to 0): so mask it off here.
*/
flags &= ~_PAGE_GLOBAL;
for (; address < end_addr; address += PAGE_SIZE) {
target_address = get_pa_from_mapping(address);
if (target_address == -1) {
ret = -EIO;
break;
}
pte = kaiser_pagetable_walk(address);
if (!pte) {
ret = -ENOMEM;
break;
}
if (pte_none(*pte)) {
set_pte(pte, __pte(flags | target_address));
} else {
pte_t tmp;
set_pte(&tmp, __pte(flags | target_address));
WARN_ON_ONCE(!pte_same(*pte, tmp));
}
}
return ret;
}
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
unsigned long size = end - start;
return kaiser_add_user_map(start, size, flags);
}
/*
* Ensure that the top level of the (shadow) page tables are
* entirely populated. This ensures that all processes that get
* forked have the same entries. This way, we do not have to
* ever go set up new entries in older processes.
*
* Note: we never free these, so there are no updates to them
* after this.
*/
static void __init kaiser_init_all_pgds(void)
{
pgd_t *pgd;
int i = 0;
pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
pgd_t new_pgd;
pud_t *pud = pud_alloc_one(&init_mm,
PAGE_OFFSET + i * PGDIR_SIZE);
if (!pud) {
WARN_ON(1);
break;
}
inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
/*
* Make sure not to stomp on some other pgd entry.
*/
if (!pgd_none(pgd[i])) {
WARN_ON(1);
continue;
}
set_pgd(pgd + i, new_pgd);
}
}
#define kaiser_add_user_map_early(start, size, flags) do { \
int __ret = kaiser_add_user_map(start, size, flags); \
WARN_ON(__ret); \
} while (0)
#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
WARN_ON(__ret); \
} while (0)
void __init kaiser_check_boottime_disable(void)
{
bool enable = true;
char arg[5];
int ret;
if (boot_cpu_has(X86_FEATURE_XENPV))
goto silent_disable;
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
if (ret > 0) {
if (!strncmp(arg, "on", 2))
goto enable;
if (!strncmp(arg, "off", 3))
goto disable;
if (!strncmp(arg, "auto", 4))
goto skip;
}
if (cmdline_find_option_bool(boot_command_line, "nopti"))
goto disable;
skip:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
goto disable;
enable:
if (enable)
setup_force_cpu_cap(X86_FEATURE_KAISER);
return;
disable:
pr_info("disabled\n");
silent_disable:
kaiser_enabled = 0;
setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
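/*
 * Minimal sketch (hypothetical helper example_pti_forced_on, same parsing
 * interface as used above): cmdline_find_option() copies the value of "pti="
 * into arg[] and returns its length, so the handler above matches short
 * prefixes with strncmp().  Resulting behaviour of the code above: pti=on
 * forces KAISER on; pti=off and nopti disable it (printing "disabled");
 * pti=auto -- and the default with no option -- falls through to the vendor
 * check, which disables it on AMD CPUs; Xen PV guests are silently disabled
 * before any parsing.
 */
static bool __init example_pti_forced_on(void)
{
	char arg[5];
	int len = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));

	return len > 0 && !strncmp(arg, "on", 2);
}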
/*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
* will have most of the kernel up by then and should be able to
* get a clean warning out of it. If we BUG_ON() here, we run
 * the risk of dying before we have good console output.
*/
void __init kaiser_init(void)
{
int cpu;
if (!kaiser_enabled)
return;
kaiser_init_all_pgds();
for_each_possible_cpu(cpu) {
void *percpu_vaddr = __per_cpu_user_mapped_start +
per_cpu_offset(cpu);
unsigned long percpu_sz = __per_cpu_user_mapped_end -
__per_cpu_user_mapped_start;
kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
__PAGE_KERNEL);
}
/*
* Map the entry/exit text section, which is needed at
* switches from user to and from kernel.
*/
kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
__PAGE_KERNEL_RX);
#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
kaiser_add_user_map_ptrs_early(__irqentry_text_start,
__irqentry_text_end,
__PAGE_KERNEL_RX);
#endif
kaiser_add_user_map_early((void *)idt_descr.address,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
kaiser_add_user_map_early(&trace_idt_descr,
sizeof(trace_idt_descr),
__PAGE_KERNEL);
kaiser_add_user_map_early(&trace_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
#endif
kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
__PAGE_KERNEL);
kaiser_add_user_map_early(&debug_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
pr_info("enabled\n");
}
/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
if (!kaiser_enabled)
return 0;
return kaiser_add_user_map((const void *)addr, size, flags);
}
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
extern void unmap_pud_range_nofree(pgd_t *pgd,
unsigned long start, unsigned long end);
unsigned long end = start + size;
unsigned long addr, next;
pgd_t *pgd;
if (!kaiser_enabled)
return;
pgd = native_get_shadow_pgd(pgd_offset_k(start));
for (addr = start; addr < end; pgd++, addr = next) {
next = pgd_addr_end(addr, end);
unmap_pud_range_nofree(pgd, addr, next);
}
}
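/*
 * Illustrative pairing (hypothetical helpers, not from the patch):
 * dynamically allocated objects that the entry code must be able to reach
 * are added to the shadow tables on allocation and removed again before
 * being freed:
 */
static void *example_alloc_user_visible(void)
{
	void *buf = (void *)__get_free_page(GFP_KERNEL);

	if (buf && kaiser_add_mapping((unsigned long)buf, PAGE_SIZE,
				      __PAGE_KERNEL)) {
		free_page((unsigned long)buf);
		buf = NULL;
	}
	return buf;
}

static void example_free_user_visible(void *buf)
{
	kaiser_remove_mapping((unsigned long)buf, PAGE_SIZE);
	free_page((unsigned long)buf);
}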
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
* This returns true for user pages that need to get copied into
* both the user and kernel copies of the page tables, and false
* for kernel pages that should only be in the kernel copy.
*/
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
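/*
 * Worked example: pgd entries are 8 bytes, so within the 4096-byte pgd page
 * offsets 0..2047 are indices 0..255 (the user half) and offsets 2048..4095
 * are indices 256..511 (the kernel half).  Hence is_userspace_pgd(pgd + 255)
 * is true and is_userspace_pgd(pgd + 256) is false.
 */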
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
if (!kaiser_enabled)
return pgd;
/*
* Do we need to also populate the shadow pgd? Check _PAGE_USER to
* skip cases like kexec and EFI which make temporary low mappings.
*/
if (pgd.pgd & _PAGE_USER) {
if (is_userspace_pgd(pgdp)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
/*
* Even if the entry is *mapping* userspace, ensure
* that userspace can not use it. This way, if we
* get out to userspace running on the kernel CR3,
* userspace will crash instead of running.
*/
if (__supported_pte_mask & _PAGE_NX)
pgd.pgd |= _PAGE_NX;
}
} else if (!pgd.pgd) {
/*
* pgd_clear() cannot check _PAGE_USER, and is even used to
* clear corrupted pgd entries: so just rely on cases like
* kexec and EFI never to be using pgd_clear().
*/
if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
is_userspace_pgd(pgdp))
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
}
return pgd;
}
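/*
 * How this hook is reached (approximate form of the pgtable_64.h change made
 * elsewhere in this series): every write of a pgd entry is routed through
 * kaiser_set_shadow_pgd(), so the kernel and shadow copies cannot drift:
 *
 *	static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 *	{
 *		*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 *	}
 */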
void kaiser_setup_pcid(void)
{
unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
if (this_cpu_has(X86_FEATURE_PCID))
user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
/*
* These variables are used by the entry/exit
* code to change PCID and pgd and TLB flushing.
*/
this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
/*
 * Make a note that this CPU will need to flush the USER TLB on return to user.
 * If the CPU does not have PCID, the NOFLUSH bit will never have been set.
*/
void kaiser_flush_tlb_on_return_to_user(void)
{
if (this_cpu_has(X86_FEATURE_PCID))
this_cpu_write(x86_cr3_pcid_user,
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
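/*
 * Worked example (using values established earlier in this series:
 * KAISER_SHADOW_PGD_OFFSET is 0x1000 and the user PCID/ASID is 128): on a
 * PCID-capable CPU the CR3 value written when returning to userspace is
 *
 *	(physical address of the kernel pgd) + 0x1000	// second 4k half: the shadow pgd
 *	| 128						// user PCID
 *	| (1ull << 63)					// NOFLUSH, unless a flush is pending
 *
 * kaiser_flush_tlb_on_return_to_user() simply rewrites x86_cr3_pcid_user
 * without the NOFLUSH bit, so the next exit to userspace discards the stale
 * user-PCID TLB entries.
 */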

View file

@ -121,11 +121,16 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
memset(kasan_zero_page, 0, PAGE_SIZE);
load_cr3(init_level4_pgt);
__flush_tlb_all();
init_task.kasan_depth = 0;
/*
* kasan_zero_page has been used as early shadow memory, thus it may
* contain some garbage. Now we can clear it, since after the TLB flush
* no one should write to it.
*/
memset(kasan_zero_page, 0, PAGE_SIZE);
init_task.kasan_depth = 0;
pr_info("KernelAddressSanitizer initialized\n");
}

View file

@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
static bool try_to_free_pte_page(pte_t *pte)
static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
int i;
if (!(cpa->flags & CPA_FREE_PAGETABLES))
return false;
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
return true;
}
static bool try_to_free_pmd_page(pmd_t *pmd)
static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
int i;
if (!(cpa->flags & CPA_FREE_PAGETABLES))
return false;
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
return true;
}
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
unsigned long start,
unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
pte++;
}
if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
if (unmap_pte_range(pmd, start, end))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
if (unmap_pte_range(cpa, pmd, start, end))
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
__unmap_pmd_range(pud, pmd, start, pre_end);
__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end;
pmd++;
@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
if (pmd_large(*pmd))
pmd_clear(pmd);
else
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
__unmap_pmd_range(cpa, pud, pmd,
start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* 4K leftovers?
*/
if (start < end)
return __unmap_pmd_range(pud, pmd, start, end);
return __unmap_pmd_range(cpa, pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
unsigned long start,
unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
unmap_pmd_range(pud, start, pre_end);
unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end;
pud++;
@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
if (pud_large(*pud))
pud_clear(pud);
else
unmap_pmd_range(pud, start, start + PUD_SIZE);
unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
* 2M leftovers?
*/
if (start < end)
unmap_pmd_range(pud, start, end);
unmap_pmd_range(cpa, pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
struct cpa_data cpa = {
.flags = CPA_FREE_PAGETABLES,
};
__unmap_pud_range(&cpa, pgd, start, end);
}
void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
{
struct cpa_data cpa = {
.flags = 0,
};
__unmap_pud_range(&cpa, pgd, start, end);
}
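/*
 * Informational note: the only difference between unmap_pud_range() and
 * unmap_pud_range_nofree() is the CPA_FREE_PAGETABLES flag.  The _nofree
 * variant exists for kaiser_remove_mapping() (earlier in this diff), which
 * must clear shadow PTEs without returning the intermediate page-table pages
 * to the allocator: those pages are shared by every process's shadow pgd and
 * are never freed.
 */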
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
pgd_t *pgd_entry = root + pgd_index(addr);

View file

@ -6,7 +6,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
/*
* Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves.
*/
#define PGD_ALLOCATION_ORDER kaiser_enabled
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_page(PGALLOC_GFP);
/* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
free_page((unsigned long)pgd);
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
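/*
 * Minimal sketch (approximate form of the asm/kaiser.h helper defined
 * elsewhere in this series): because the order-1 allocation above is both 8k
 * in size and 8k-aligned, the shadow pgd is reached simply by setting bit 12
 * of the kernel pgd pointer:
 *
 *	static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 *	{
 *		return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
 *	}
 */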

View file

@ -6,13 +6,14 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>
#include <asm/kaiser.h>
/*
* TLB flushing, formerly SMP-only
@ -34,6 +35,36 @@ struct flush_tlb_info {
unsigned long flush_end;
};
static void load_new_mm_cr3(pgd_t *pgdir)
{
unsigned long new_mm_cr3 = __pa(pgdir);
if (kaiser_enabled) {
/*
* We reuse the same PCID for different tasks, so we must
* flush all the entries for the PCID out when we change tasks.
* Flush KERN below, flush USER when returning to userspace in
* kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
*
* invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
* do it here, but can only be used if X86_FEATURE_INVPCID is
* available - and many machines support pcid without invpcid.
*
* If X86_CR3_PCID_KERN_FLUSH actually added something, then it
* would be needed in the write_cr3() below - if PCIDs enabled.
*/
BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
kaiser_flush_tlb_on_return_to_user();
}
/*
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask writes.
*/
write_cr3(new_mm_cr3);
}
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
@ -45,7 +76,7 @@ void leave_mm(int cpu)
BUG();
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir);
load_new_mm_cr3(swapper_pg_dir);
/*
* This gets called in the idle path where RCU
* functions differently. Tracing normally
@ -105,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* ordering guarantee we need.
*
*/
load_cr3(next->pgd);
load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
@ -152,7 +183,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* As above, load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask write.
*/
load_cr3(next->pgd);
load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next);
load_mm_ldt(next);

View file

@ -736,7 +736,14 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
*(.data..percpu..first) \
. = ALIGN(cacheline); \
*(.data..percpu..user_mapped) \
*(.data..percpu..user_mapped..shared_aligned) \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..user_mapped..page_aligned) \
VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \

include/linux/kaiser.h (new file, 52 lines)
View file

@ -0,0 +1,52 @@
#ifndef _LINUX_KAISER_H
#define _LINUX_KAISER_H
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#include <asm/kaiser.h>
static inline int kaiser_map_thread_stack(void *stack)
{
/*
* Map that page of kernel stack on which we enter from user context.
*/
return kaiser_add_mapping((unsigned long)stack +
THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
}
static inline void kaiser_unmap_thread_stack(void *stack)
{
/*
* Note: may be called even when kaiser_map_thread_stack() failed.
*/
kaiser_remove_mapping((unsigned long)stack +
THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
}
#else
/*
* These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
* includes architectures that support KAISER, but have it disabled.
*/
static inline void kaiser_init(void)
{
}
static inline int kaiser_add_mapping(unsigned long addr,
unsigned long size, unsigned long flags)
{
return 0;
}
static inline void kaiser_remove_mapping(unsigned long start,
unsigned long size)
{
}
static inline int kaiser_map_thread_stack(void *stack)
{
return 0;
}
static inline void kaiser_unmap_thread_stack(void *stack)
{
}
#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
#endif /* _LINUX_KAISER_H */
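/*
 * Worked example (assuming THREAD_SIZE is 16K with 4K pages, as on x86_64):
 * for a stack allocated at `stack`, only the top page [stack + 12K,
 * stack + 16K) -- the page that holds the pt_regs frame built on entry from
 * user mode -- is added to the shadow tables; the lower pages stay
 * kernel-only.
 */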

View file

@ -133,8 +133,9 @@ enum zone_stat_item {
NR_SLAB_RECLAIMABLE,
NR_SLAB_UNRECLAIMABLE,
NR_PAGETABLE, /* used for pagetables */
NR_KERNEL_STACK,
/* Second 128 byte cacheline */
NR_KERNEL_STACK,
NR_KAISERTABLE,
NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_BOUNCE,
NR_VMSCAN_WRITE,

View file

@ -35,6 +35,12 @@
#endif
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define USER_MAPPED_SECTION "..user_mapped"
#else
#define USER_MAPPED_SECTION ""
#endif
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@ -162,11 +182,21 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
*/
#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
__aligned(PAGE_SIZE)
#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
__aligned(PAGE_SIZE)
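/*
 * Illustrative usage (hypothetical variable, not from the patch): per-cpu
 * data that the entry code touches before switching CR3 is declared with the
 * _USER_MAPPED variants, so it lands in the .data..percpu..user_mapped*
 * sections gathered between __per_cpu_user_mapped_start/_end (see the
 * PERCPU_INPUT change earlier in this diff) and is then mapped into the
 * shadow tables by kaiser_init():
 *
 *	struct example_entry_scratch {
 *		unsigned long words[64];
 *	};
 *	DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct example_entry_scratch,
 *						 example_entry_scratch);
 *	DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct example_entry_scratch,
 *						example_entry_scratch);
 */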
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
*/
#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \

View file

@ -81,6 +81,7 @@
#include <linux/integrity.h>
#include <linux/proc_ns.h>
#include <linux/io.h>
#include <linux/kaiser.h>
#include <asm/io.h>
#include <asm/bugs.h>
@ -489,6 +490,7 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
kaiser_init();
}
asmlinkage __visible void __init start_kernel(void)

View file

@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@ -172,6 +173,7 @@ static inline void free_thread_stack(unsigned long *stack)
{
struct page *page = virt_to_page(stack);
kaiser_unmap_thread_stack(stack);
__free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
@ -355,6 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
goto free_stack;
tsk->stack = stack;
err = kaiser_map_thread_stack(tsk->stack);
if (err)
goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under

View file

@ -738,6 +738,7 @@ const char * const vmstat_text[] = {
"nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_kernel_stack",
"nr_overhead",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",

View file

@ -40,6 +40,16 @@ config SECURITY
If you are unsure how to answer this question, answer N.
config PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
default y
depends on X86_64 && SMP
help
	  This enforces strict kernel and user space isolation, in order
to close hardware side channels on kernel address information.
If you are unsure how to answer this question, answer Y.
config SECURITYFS
bool "Enable the securityfs filesystem"
help