kaiser: merged update
Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 8a43ddfb93
commit bed9bb7f3e
15 changed files with 553 additions and 190 deletions
arch/x86/entry/entry_64.S

@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath:
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	/*

@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call)
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret

@@ -1059,6 +1080,13 @@ ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	/*
	 * error_entry() always returns with a kernel gsbase and
	 * CR3. We must also have a kernel CR3/gsbase before
	 * calling TRACE_IRQS_*. Just unconditionally switch to
	 * the kernel CR3 here.
	 */
	SWITCH_KERNEL_CR3
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

@@ -1069,7 +1097,6 @@ ENTRY(error_entry)
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

.Lerror_entry_from_usermode_after_swapgs:
	/*

@@ -1122,7 +1149,7 @@ ENTRY(error_entry)
	 * Switch to kernel gsbase:
	 */
	SWAPGS
	SWITCH_KERNEL_CR3

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET and clear EBX so that

@@ -1222,7 +1249,10 @@ ENTRY(nmi)
	 */

	SWAPGS_UNSAFE_STACK
	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * percpu variables are mapped with user CR3, so no need
	 * to switch CR3 here.
	 */
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

@@ -1256,14 +1286,33 @@ ENTRY(nmi)

	movq	%rsp, %rdi
	movq	$-1, %rsi
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	pushq	%rax
#ifdef CONFIG_KAISER_REAL_SWITCH
	andq	$(~0x1000), %rax
#endif
	movq	%rax, %cr3
#endif
	call	do_nmi
	/*
	 * Unconditionally restore CR3. I know we return to
	 * kernel code that needs user CR3, but do we ever return
	 * to "user mode" where we need the kernel CR3?
	 */
#ifdef CONFIG_KAISER
	popq	%rax
	mov	%rax, %cr3
#endif

	/*
	 * Return back to user mode. We must *not* do the normal exit
	 * work, because we don't want to enable interrupts. Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 * work, because we don't want to enable interrupts. Do not
	 * switch to user CR3: we might be going back to kernel code
	 * that had a user CR3 set.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret

@@ -1459,23 +1508,54 @@ end_repeat_nmi:
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 * Use the same approach as paranoid_entry to handle SWAPGS, but
	 * without CR3 handling since we do that differently in NMIs. No
	 * need to use paranoid_exit as we should not be calling schedule
	 * in NMI context. Even with normal interrupts enabled. An NMI
	 * should not be setting NEED_RESCHED or anything that normal
	 * interrupts and exceptions might do.
	 */
	call	paranoid_entry
	cld
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f	/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:
#ifdef CONFIG_KAISER
	/* Unconditionally use kernel CR3 for do_nmi() */
	/* %rax is saved above, so OK to clobber here */
	movq	%cr3, %rax
	pushq	%rax
#ifdef CONFIG_KAISER_REAL_SWITCH
	andq	$(~0x1000), %rax
#endif
	movq	%rax, %cr3
#endif

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	addq	$8, %rdi	/* point %rdi at ptregs, fixed up for CR3 */
	movq	$-1, %rsi
	call	do_nmi
	/*
	 * Unconditionally restore CR3. We might be returning to
	 * kernel code that needs user CR3, like just before
	 * a sysret.
	 */
#ifdef CONFIG_KAISER
	popq	%rax
	mov	%rax, %cr3
#endif

	testl	%ebx, %ebx	/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWITCH_USER_CR3_NO_STACK
	/* We fixed up CR3 above, so no need to switch it here */
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
arch/x86/include/asm/kaiser.h

@@ -16,13 +16,17 @@

.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
#endif
movq \reg, %cr3
.endm

.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
#endif
movq \reg, %cr3
.endm

@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm

#endif /* CONFIG_KAISER */

#else /* __ASSEMBLY__ */

#ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that
// the address space has to be switched before the registers have been stored.
// To change the address space, another register is needed.
// A register therefore has to be stored/restored.
//
/*
 * Upon kernel/user mode switch, it may happen that the address
 * space has to be switched before the registers have been
 * stored. To change the address space, another register is
 * needed. A register therefore has to be stored/restored.
 */

DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

#endif /* CONFIG_KAISER */

/**
 * shadowmem_add_mapping - map a virtual memory part to the shadow mapping
 * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
 * @addr: the start address of the range
 * @size: the size of the range
 * @flags: The mapping flags of the pages
 *
 * the mapping is done on a global scope, so no bigger synchronization has to be done.
 * the pages have to be manually unmapped again when they are not needed any longer.
 * The mapping is done on a global scope, so no bigger
 * synchronization has to be done. The pages have to be
 * manually unmapped again when they are not needed any longer.
 */
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);

/**
 * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
 * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
 * @addr: the start address of the range
 * @size: the size of the range
 */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);

/**
 * shadowmem_initialize_mapping - Initialize the shadow mapping
 * kaiser_initialize_mapping - Initialize the shadow mapping
 *
 * most parts of the shadow mapping can be mapped upon boot time.
 * only the thread stacks have to be mapped on runtime.
 * the mapped regions are not unmapped at all.
 * Most parts of the shadow mapping can be mapped upon boot
 * time. Only per-process things like the thread stacks
 * or a new LDT have to be mapped at runtime. These boot-
 * time mappings are permanent and never unmapped.
 */
extern void kaiser_init(void);

#endif
#endif /* CONFIG_KAISER */

#endif /* __ASSEMBLY */
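The pair of macros above is the whole CONFIG_KAISER_REAL_SWITCH page-table switch: the kernel and shadow (user) copies of the top-level page table live in one 8k-aligned pair, so moving between them only means clearing or setting bit 12 (0x1000) of CR3. A standalone C sketch of that address arithmetic, for illustration only (not part of the commit; the kaiser_cr3_* names are invented):

#include <stdint.h>
#include <stdio.h>

#define KAISER_PGD_BIT 0x1000UL	/* bit 12, i.e. PAGE_SIZE */

/* kernel half: bit 12 clear; shadow (user) half: bit 12 set */
static uint64_t kaiser_cr3_to_kernel(uint64_t cr3) { return cr3 & ~KAISER_PGD_BIT; }
static uint64_t kaiser_cr3_to_user(uint64_t cr3)   { return cr3 |  KAISER_PGD_BIT; }

int main(void)
{
	uint64_t cr3 = 0x1234000; /* pretend 8k-aligned kernel PGD, bit 12 clear */

	printf("user CR3:   %#llx\n", (unsigned long long)kaiser_cr3_to_user(cr3));
	printf("kernel CR3: %#llx\n", (unsigned long long)kaiser_cr3_to_kernel(cr3));
	return 0;
}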
arch/x86/include/asm/pgtable.h

@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)

static inline int pgd_bad(pgd_t pgd)
{
	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
	pgdval_t ignore_flags = _PAGE_USER;
	/*
	 * We set NX on KAISER pgds that map userspace memory so
	 * that userspace can not meaningfully use the kernel
	 * page table by accident; it will fault on the first
	 * instruction it tries to run. See native_set_pgd().
	 */
	if (IS_ENABLED(CONFIG_KAISER))
		ignore_flags |= _PAGE_NX;

	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}

static inline int pgd_none(pgd_t pgd)

@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
	// clone the shadow pgd part as well
	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
	/* Clone the shadow pgd part as well */
	memcpy(native_get_shadow_pgd(dst),
	       native_get_shadow_pgd(src),
	       count * sizeof(pgd_t));
#endif
}
arch/x86/include/asm/pgtable_64.h

@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
}

#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}

static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
	return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
	BUILD_BUG_ON(1);
	return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
	return pgdp;
}
#endif /* CONFIG_KAISER */

/*
 * Page table pages are page-aligned. The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(void *__ptr)
{
	unsigned long ptr = (unsigned long)__ptr;

	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
	// We know that a pgd is page aligned.
	// Therefore the lower indices have to be mapped to user space.
	// These pages are mapped to the shadow mapping.
	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
	pteval_t extra_kern_pgd_flags = 0;
	/* Do we need to also populate the shadow pgd? */
	if (is_userspace_pgd(pgdp)) {
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		/*
		 * Even if the entry is *mapping* userspace, ensure
		 * that userspace can not use it. This way, if we
		 * get out to userspace running on the kernel CR3,
		 * userspace will crash instead of running.
		 */
		extra_kern_pgd_flags = _PAGE_NX;
	}

	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
	pgdp->pgd = pgd.pgd;
	pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
	*pgdp = pgd;
#endif
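native_get_shadow_pgd(), native_get_normal_pgd() and is_userspace_pgd() above are plain pointer arithmetic on the 8k PGD pair: bit 12 selects the kernel or shadow half, and the offset within a 4k top-level page distinguishes user entries (lower half) from kernel entries (upper half). A standalone sketch of the same math, for illustration only (PAGE_SIZE and pgd_t here are local stand-ins, not the kernel's):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

typedef struct { uint64_t pgd; } pgd_t;

static pgd_t *shadow_pgd(pgd_t *pgdp)
{
	/* the shadow (user) copy lives in the second 4k page of the pair */
	return (pgd_t *)((uintptr_t)pgdp | PAGE_SIZE);
}

static int is_userspace_pgd(void *ptr)
{
	/* userspace entries occupy the lower half of the 4k top level */
	return ((uintptr_t)ptr % PAGE_SIZE) < (PAGE_SIZE / 2);
}

int main(void)
{
	pgd_t *base = (pgd_t *)0x10000;	/* pretend 8k-aligned kernel PGD */

	assert(shadow_pgd(base) == (pgd_t *)0x11000);
	assert(is_userspace_pgd(base));				/* offset 0: user half */
	assert(!is_userspace_pgd((char *)base + PAGE_SIZE / 2));	/* upper half: kernel */
	return 0;
}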
arch/x86/include/asm/pgtable_types.h

@@ -42,7 +42,7 @@
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
#else
#define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)

@@ -93,11 +93,7 @@
#define _PAGE_NX	(_AT(pteval_t, 0))
#endif

#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif

#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
			 _PAGE_ACCESSED | _PAGE_DIRTY)
arch/x86/kernel/espfix_64.c

@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
	/* Install the espfix pud into the kernel page directory */
	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
#ifdef CONFIG_KAISER
	// add the esp stack pud to the shadow mapping here.
	// This can be done directly, because the fixup stack has its own pud
	set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
#endif
	/*
	 * Just copy the top-level PGD that is mapping the espfix
	 * area to ensure it is mapped into the shadow user page
	 * tables.
	 */
	if (IS_ENABLED(CONFIG_KAISER))
		set_pgd(native_get_shadow_pgd(pgd_p),
			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));

	/* Randomize the locations */
	init_espfix_random();
arch/x86/kernel/head_64.S

@@ -442,11 +442,24 @@ early_idt_ripmsg:
	GLOBAL(name)

#ifdef CONFIG_KAISER
/*
 * Each PGD needs to be 8k long and 8k aligned. We do not
 * ever go out to userspace with these, so we do not
 * strictly *need* the second page, but this allows us to
 * have a single set_pgd() implementation that does not
 * need to worry about whether it has 4k or 8k to work
 * with.
 *
 * This ensures PGDs are 8k long:
 */
#define KAISER_USER_PGD_FILL	512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
	.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define KAISER_USER_PGD_FILL	0
#endif

/* Automate the creation of 1 to 1 mapping pmd entries */

@@ -461,6 +474,7 @@ GLOBAL(name)
NEXT_PGD_PAGE(early_level4_pgt)
	.fill	511,8,0
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
	.fill	KAISER_USER_PGD_FILL,8,0

NEXT_PAGE(early_dynamic_pgts)
	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0

@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts)

#ifndef CONFIG_XEN
NEXT_PGD_PAGE(init_level4_pgt)
	.fill	2*512,8,0
	.fill	512,8,0
	.fill	KAISER_USER_PGD_FILL,8,0
#else
NEXT_PGD_PAGE(init_level4_pgt)
	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE

@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
	.org	init_level4_pgt + L4_START_KERNEL*8, 0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
	.fill	KAISER_USER_PGD_FILL,8,0

NEXT_PAGE(level3_ident_pgt)
	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE

@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
	 */
	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
	.fill	KAISER_USER_PGD_FILL,8,0

NEXT_PAGE(level3_kernel_pgt)
	.fill	L3_START_KERNEL,8,0
arch/x86/kernel/ldt.c

@@ -18,6 +18,7 @@
#include <linux/uaccess.h>

#include <asm/ldt.h>
#include <asm/kaiser.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>

@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
	set_ldt(pc->ldt->entries, pc->ldt->size);
}

static void __free_ldt_struct(struct ldt_struct *ldt)
{
	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
		vfree(ldt->entries);
	else
		free_page((unsigned long)ldt->entries);
	kfree(ldt);
}

/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
	struct ldt_struct *new_ldt;
	int alloc_size;
	int ret = 0;

	if (size > LDT_ENTRIES)
		return NULL;

@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
		return NULL;
	}

	// FIXME: make kaiser_add_mapping() return an error code
	// when it fails
	ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
				 __PAGE_KERNEL);
	if (ret) {
		__free_ldt_struct(new_ldt);
		return NULL;
	}
	new_ldt->size = size;
	return new_ldt;
}

@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
	if (likely(!ldt))
		return;

	kaiser_remove_mapping((unsigned long)ldt->entries,
			      ldt->size * LDT_ENTRY_SIZE);
	paravirt_free_ldt(ldt->entries, ldt->size);
	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
		vfree(ldt->entries);
	else
		free_page((unsigned long)ldt->entries);
	kfree(ldt);
	__free_ldt_struct(ldt);
}

/*
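The ldt.c hunks consolidate the teardown into one __free_ldt_struct() helper so the new mapping-failure path in alloc_ldt_struct() and the normal free path cannot diverge. A standalone sketch of that shape, for illustration only (all names and the stand-in map_entries() are invented):

#include <stdlib.h>

struct ldt { void *entries; int size; };

static void free_ldt(struct ldt *ldt)	/* single teardown helper */
{
	free(ldt->entries);
	free(ldt);
}

static int map_entries(void *p, int size) { return (p && size > 0) ? 0 : -1; }

static struct ldt *alloc_ldt(int size)
{
	struct ldt *ldt = malloc(sizeof(*ldt));

	if (!ldt)
		return NULL;
	ldt->entries = calloc(size, 8);
	if (!ldt->entries) {
		free(ldt);
		return NULL;
	}
	/* any failure past this point reuses the same helper as the
	 * normal free path, mirroring __free_ldt_struct() */
	if (map_entries(ldt->entries, size)) {
		free_ldt(ldt);
		return NULL;
	}
	ldt->size = size;
	return ldt;
}

int main(void)
{
	struct ldt *ldt = alloc_ldt(16);

	if (ldt)
		free_ldt(ldt);
	return 0;
}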
arch/x86/kernel/tracepoint.c

@@ -9,10 +9,12 @@
#include <linux/atomic.h>

atomic_t trace_idt_ctr = ATOMIC_INIT(0);
__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
				(unsigned long) trace_idt_table };

/* No need to be aligned, but done to keep all IDTs defined the same way. */
__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;

static int trace_irq_vector_refcount;
arch/x86/mm/kaiser.c

@@ -1,160 +1,306 @@

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>

#include <linux/uaccess.h>
#include <linux/ftrace.h>

#include <asm/kaiser.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#ifdef CONFIG_KAISER

__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/**
 * Get the real ppn from a address in kernel mapping.
 * @param address The virtual adrress
 * @return the physical address
 */
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes. No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte(). We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static inline unsigned long get_pa_from_mapping (unsigned long address)
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(address);
	BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations. We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	might_sleep();
	if (is_atomic) {
		gfp &= ~GFP_KERNEL;
		gfp |= __GFP_HIGH | __GFP_ATOMIC;
	}

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	pud = pud_offset(pgd, address);
	BUG_ON(pud_none(*pud));

	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud))
			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
		else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	BUG_ON(pmd_none(*pmd));

	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd))
			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
		else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pte = pte_offset_kernel(pmd, address);
	BUG_ON(pte_none(*pte));

	return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
	return pte_offset_kernel(pmd, address);
}

void _kaiser_copy (unsigned long start_addr, unsigned long size,
			unsigned long flags)
int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			unsigned long flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int ret = 0;
	pte_t *pte;
	unsigned long address;
	unsigned long end_addr = start_addr + size;
	unsigned long start_addr = (unsigned long )__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
			address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);

		pgd = native_get_shadow_pgd(pgd_offset_k(address));

		BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
		BUG_ON(pgd_large(*pgd));

		pud = pud_offset(pgd, address);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		BUG_ON(pud_large(*pud));

		pmd = pmd_offset(pud, address);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
		}
		BUG_ON(pmd_large(*pmd));

		pte = pte_offset_kernel(pmd, address);
		pte = kaiser_pagetable_walk(address, false);
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			BUG_ON(__pa(pte_page(*pte)) != target_address);
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
static inline void __init _kaiser_init(void)
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables are
 * entirely populated. This ensures that all processes that get
 * forked have the same entries. This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
spinlock_t shadow_table_lock;
/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die. But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it. If we BUG_ON() here, we run
 * the risk of doing so before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;
	spin_lock_init(&shadow_table_lock);

	spin_lock(&shadow_table_lock);

	_kaiser_init();
	kaiser_init_all_pgds();

	for_each_possible_cpu(cpu) {
		// map the per cpu user variables
		_kaiser_copy(
			(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
			(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
			__PAGE_KERNEL);
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	// map the entry/exit text section, which is responsible to switch between user- and kernel mode
	_kaiser_copy(
		(unsigned long) __entry_text_start,
		(unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
		__PAGE_KERNEL_RX);
	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

	// the fixed map address of the idt_table
	_kaiser_copy(
		(unsigned long) idt_descr.address,
		sizeof(gate_desc) * NR_VECTORS,
		__PAGE_KERNEL_RO);

	spin_unlock(&shadow_table_lock);
#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
}

extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
// add a mapping to the shadow-mapping, and synchronize the mappings
void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	spin_lock(&shadow_table_lock);
	_kaiser_copy(addr, size, flags);
	spin_unlock(&shadow_table_lock);
	return kaiser_add_user_map((const void *)addr, size, flags);
}

extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
	spin_lock(&shadow_table_lock);
	do {
		unmap_pud_range(pgd, start, start + size);
	} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
	spin_unlock(&shadow_table_lock);
	unsigned long end = start + size;
	unsigned long addr;

	for (addr = start; addr < end; addr += PGDIR_SIZE) {
		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
		/*
		 * unmap_p4d_range() handles > P4D_SIZE unmaps,
		 * so no need to trim 'end'.
		 */
		unmap_pud_range_nofree(pgd, addr, end);
	}
}
#endif /* CONFIG_KAISER */
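kaiser_pagetable_walk() above populates missing shadow page-table levels with a check / allocate-unlocked / recheck-under-lock pattern: the possibly-sleeping page allocation happens outside shadow_table_allocation_lock, and the loser of a population race simply frees its page. A standalone pthread sketch of the same pattern, for illustration only (all names are invented):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t shadow_alloc_lock = PTHREAD_MUTEX_INITIALIZER;
static void *shadow_pmd_page;	/* stands in for a shadow pud/pmd entry */

static void *populate_shadow_level(void)
{
	if (!shadow_pmd_page) {
		/* allocate outside the lock; this may block */
		void *new_page = calloc(1, 4096);

		if (!new_page)
			return NULL;
		pthread_mutex_lock(&shadow_alloc_lock);
		if (!shadow_pmd_page)
			shadow_pmd_page = new_page;	/* we won the race */
		else
			free(new_page);			/* someone beat us to it */
		pthread_mutex_unlock(&shadow_alloc_lock);
	}
	return shadow_pmd_page;
}

int main(void)
{
	return populate_shadow_level() ? 0 : 1;
}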
arch/x86/mm/pageattr.c

@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_FREE_PAGETABLES 8

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
	return 0;
}

static bool try_to_free_pte_page(pte_t *pte)
static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
	int i;

	if (!(cpa->flags & CPA_FREE_PAGETABLES))
		return false;

	for (i = 0; i < PTRS_PER_PTE; i++)
		if (!pte_none(pte[i]))
			return false;

@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
	return true;
}

static bool try_to_free_pmd_page(pmd_t *pmd)
static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
	int i;

	if (!(cpa->flags & CPA_FREE_PAGETABLES))
		return false;

	for (i = 0; i < PTRS_PER_PMD; i++)
		if (!pmd_none(pmd[i]))
			return false;

@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
	return true;
}

static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
			    unsigned long start,
			    unsigned long end)
{
	pte_t *pte = pte_offset_kernel(pmd, start);

@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
		pte++;
	}

	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
	if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
		pmd_clear(pmd);
		return true;
	}
	return false;
}

static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
			      unsigned long start, unsigned long end)
{
	if (unmap_pte_range(pmd, start, end))
		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
	if (unmap_pte_range(cpa, pmd, start, end))
		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
			pud_clear(pud);
}

static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
			    unsigned long start, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, start);

@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		__unmap_pmd_range(pud, pmd, start, pre_end);
		__unmap_pmd_range(cpa, pud, pmd, start, pre_end);

		start = pre_end;
		pmd++;

@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
		if (pmd_large(*pmd))
			pmd_clear(pmd);
		else
			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
			__unmap_pmd_range(cpa, pud, pmd,
					  start, start + PMD_SIZE);

		start += PMD_SIZE;
		pmd++;

@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
	 * 4K leftovers?
	 */
	if (start < end)
		return __unmap_pmd_range(pud, pmd, start, end);
		return __unmap_pmd_range(cpa, pud, pmd, start, end);

	/*
	 * Try again to free the PMD page if haven't succeeded above.
	 */
	if (!pud_none(*pud))
		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
			pud_clear(pud);
}

void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
			      unsigned long start,
			      unsigned long end)
{
	pud_t *pud = pud_offset(pgd, start);

@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		unmap_pmd_range(pud, start, pre_end);
		unmap_pmd_range(cpa, pud, start, pre_end);

		start = pre_end;
		pud++;

@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
		if (pud_large(*pud))
			pud_clear(pud);
		else
			unmap_pmd_range(pud, start, start + PUD_SIZE);
			unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);

		start += PUD_SIZE;
		pud++;

@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
	 * 2M leftovers?
	 */
	if (start < end)
		unmap_pmd_range(pud, start, end);
		unmap_pmd_range(cpa, pud, start, end);

	/*
	 * No need to try to free the PUD page because we'll free it in

@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
	 */
}

static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct cpa_data cpa = {
		.flags = CPA_FREE_PAGETABLES,
	};

	__unmap_pud_range(&cpa, pgd, start, end);
}

void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct cpa_data cpa = {
		.flags = 0,
	};

	__unmap_pud_range(&cpa, pgd, start, end);
}

static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
	pgd_t *pgd_entry = root + pgd_index(addr);
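The pageattr.c changes thread a struct cpa_data through the unmap helpers so a single walker can either free emptied page-table pages (unmap_pud_range()) or leave them in place (unmap_pud_range_nofree(), used for the shadow tables, whose pages live for the lifetime of the PGD). A standalone sketch of that flag-controlled split, for illustration only (names invented):

#include <stdbool.h>
#include <stdio.h>

#define FREE_PAGETABLES 0x8

struct walk_ctl { unsigned flags; };

static bool try_free_level(const struct walk_ctl *ctl, const char *what)
{
	if (!(ctl->flags & FREE_PAGETABLES))
		return false;	/* nofree variant: unmap but keep the page */
	printf("freeing empty %s page\n", what);
	return true;
}

int main(void)
{
	struct walk_ctl free_ctl   = { .flags = FREE_PAGETABLES };
	struct walk_ctl nofree_ctl = { .flags = 0 };

	try_free_level(&free_ctl, "pte");	/* unmap_pud_range() behavior */
	try_free_level(&nofree_ctl, "pte");	/* unmap_pud_range_nofree() behavior */
	return 0;
}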
arch/x86/mm/pgtable.c

@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
	kmem_cache_free(pgd_cache, pgd);
}
#else

#ifdef CONFIG_KAISER
/*
 * Instead of one pmd, we acquire two pmds. Being order-1, it is
 * both 8k in size and 8k-aligned. That lets us just flip bit 12
 * in a pointer to swap between the two 4k halves.
 */
#define PGD_ALLOCATION_ORDER 1
#else
#define PGD_ALLOCATION_ORDER 0
#endif

static inline pgd_t *_pgd_alloc(void)
{
#ifdef CONFIG_KAISER
	// Instead of one PML4, we acquire two PML4s and, thus, an 8kb-aligned memory
	// block. Therefore, we have to allocate at least 3 pages. However, the
	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
	// the beginning of the page of our 8kb-aligned memory block in order to
	// correctly free it afterwards.

	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));

	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
	{
		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
		return (pgd_t *) pages;
	}
	else
	{
		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
		return (pgd_t *) (pages + PAGE_SIZE);
	}
#else
	return (pgd_t *)__get_free_page(PGALLOC_GFP);
#endif
	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}

static inline void _pgd_free(pgd_t *pgd)
{
#ifdef CONFIG_KAISER
	unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
	free_pages(pages, get_order(4*PAGE_SIZE));
#else
	free_page((unsigned long)pgd);
#endif
	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
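PGD_ALLOCATION_ORDER works because an order-1 allocation is both 8k in size and naturally 8k-aligned, which is exactly what the bit-12 flip in native_get_shadow_pgd() needs; the removed code had to over-allocate four pages and stash the base pointer to get the same guarantee. A standalone sketch of the alignment property, for illustration only (C11 aligned_alloc stands in for __get_free_pages):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	/* an order-1 ("two page") block, 8k-aligned by construction */
	void *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	assert(pgd);
	/* bit 12 is guaranteed clear, so OR/AND with 0x1000 cleanly
	 * selects the shadow (user) or normal (kernel) 4k half */
	assert(((uintptr_t)pgd & PAGE_SIZE) == 0);
	assert((((uintptr_t)pgd | PAGE_SIZE) - (uintptr_t)pgd) == PAGE_SIZE);

	free(pgd);
	return 0;
}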
include/linux/kaiser.h (new file, 26 lines)

@@ -0,0 +1,26 @@
#ifndef _INCLUDE_KAISER_H
#define _INCLUDE_KAISER_H

#ifdef CONFIG_KAISER
#include <asm/kaiser.h>
#else

/*
 * These stubs are used whenever CONFIG_KAISER is off, which
 * includes architectures that support KAISER, but have it
 * disabled.
 */

static inline void kaiser_init(void)
{
}
static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
}
static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	return 0;
}

#endif /* !CONFIG_KAISER */
#endif /* _INCLUDE_KAISER_H */
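With this stub header, callers can invoke kaiser_add_mapping() and friends unconditionally and just check the return value; when CONFIG_KAISER is off the stubs compile away and report success. A minimal standalone sketch of the same ifdef-stub pattern, for illustration only:

#include <stdio.h>

/* #define CONFIG_KAISER 1 -- flip to select the real implementation */
#ifdef CONFIG_KAISER
int kaiser_add_mapping(unsigned long addr, unsigned long size,
		       unsigned long flags);	/* real version lives elsewhere */
#else
static inline int kaiser_add_mapping(unsigned long addr, unsigned long size,
				     unsigned long flags)
{
	return 0;	/* always succeeds when the feature is compiled out */
}
#endif

int main(void)
{
	/* the call site needs no #ifdef of its own */
	if (kaiser_add_mapping(0x1000, 0x2000, 0))
		fprintf(stderr, "mapping failed\n");
	return 0;
}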
kernel/fork.c

@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>

@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;

@@ -357,9 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
		goto free_ti;

	tsk->stack = ti;
#ifdef CONFIG_KAISER
	kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
#endif

	err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
	if (err)
		goto free_ti;
#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
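dup_task_struct() now propagates a kaiser_add_mapping() failure through the existing free_ti unwind label instead of ignoring it. A standalone sketch of that goto-unwind idiom, for illustration only (names invented):

#include <stdlib.h>

static int add_mapping(void *stack) { return stack ? 0 : -1; }

static void *dup_stack(void)
{
	void *stack = malloc(16384);

	if (!stack)
		return NULL;
	if (add_mapping(stack))
		goto free_stack;	/* one unwind path for every failure */
	return stack;

free_stack:
	free(stack);
	return NULL;
}

int main(void)
{
	void *stack = dup_stack();

	free(stack);	/* free(NULL) is safe */
	return 0;
}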
security/Kconfig

@@ -32,12 +32,17 @@ config SECURITY
	  If you are unsure how to answer this question, answer N.

config KAISER
	bool "Remove the kernel mapping in user mode"
	default y
	depends on X86_64
	depends on !PARAVIRT
	help
	  This enforces a strict kernel and user space isolation in order to close
	  hardware side channels on kernel address information.

config KAISER_REAL_SWITCH
	bool "KAISER: actually switch page tables"
	default y

config SECURITYFS
	bool "Enable the securityfs filesystem"
	help