From 77b02dccafafc467e2f299d49f652b1875c013c7 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Wed, 20 Dec 2017 16:59:11 -0800
Subject: [PATCH 01/44] ANDROID: sdcardfs: notify lower file of opens

fsnotify_open is not called within dentry_open,
so we need to call it ourselves.

Change-Id: Ia7f323b3d615e6ca5574e114e8a5d7973fb4c119
Signed-off-by: Daniel Rosenberg <drosen@google.com>
Bug: 70706497
---
 fs/sdcardfs/file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
index 5ac0b0bbb0ec..dd76ecf33cf3 100644
--- a/fs/sdcardfs/file.c
+++ b/fs/sdcardfs/file.c
@@ -18,6 +18,7 @@
  * General Public License.
  */
 
+#include <linux/fsnotify.h>
 #include "sdcardfs.h"
 #ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
 #include <linux/backing-dev.h>
@@ -259,6 +260,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file)
 			fput(lower_file); /* fput calls dput for lower_dentry */
 		}
 	} else {
+		fsnotify_open(lower_file);
 		sdcardfs_set_lower_file(file, lower_file);
 	}
 

From a51b84097d2103264f8b0759942e313317bccd78 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Tue, 2 Jan 2018 14:44:49 -0800
Subject: [PATCH 02/44] ANDROID: sdcardfs: Add default_normal option

The default_normal option causes mounts with the gid set to
AID_SDCARD_RW to have user specific gids, as in the normal case.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Change-Id: I9619b8ac55f41415df943484dc8db1ea986cef6f
Bug: 64672411
---
 fs/sdcardfs/main.c     | 6 ++++++
 fs/sdcardfs/sdcardfs.h | 3 ++-
 fs/sdcardfs/super.c    | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index 0a2b5167e9a2..3d1023a7ff7c 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -33,6 +33,7 @@ enum {
 	Opt_userid,
 	Opt_reserved_mb,
 	Opt_gid_derivation,
+	Opt_default_normal,
 	Opt_err,
 };
 
@@ -45,6 +46,7 @@ static const match_table_t sdcardfs_tokens = {
 	{Opt_userid, "userid=%d"},
 	{Opt_multiuser, "multiuser"},
 	{Opt_gid_derivation, "derive_gid"},
+	{Opt_default_normal, "default_normal"},
 	{Opt_reserved_mb, "reserved_mb=%u"},
 	{Opt_err, NULL}
 };
@@ -68,6 +70,7 @@ static int parse_options(struct super_block *sb, char *options, int silent,
 	opts->reserved_mb = 0;
 	/* by default, gid derivation is off */
 	opts->gid_derivation = false;
+	vfsopts->default_normal = false;
 
 	*debug = 0;
 
@@ -122,6 +125,8 @@ static int parse_options(struct super_block *sb, char *options, int silent,
 		case Opt_gid_derivation:
 			opts->gid_derivation = true;
 			break;
+		case Opt_default_normal:
+			vfsopts->default_normal = true;
 		/* unknown option */
 		default:
 			if (!silent)
@@ -175,6 +180,7 @@ int parse_options_remount(struct super_block *sb, char *options, int silent,
 				return 0;
 			vfsopts->mask = option;
 			break;
+		case Opt_default_normal:
 		case Opt_multiuser:
 		case Opt_userid:
 		case Opt_fsuid:
diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h
index 88b92b2f1872..f5054a2650f1 100644
--- a/fs/sdcardfs/sdcardfs.h
+++ b/fs/sdcardfs/sdcardfs.h
@@ -226,6 +226,7 @@ struct sdcardfs_mount_options {
 struct sdcardfs_vfsmount_options {
 	gid_t gid;
 	mode_t mask;
+	bool default_normal;
 };
 
 extern int parse_options_remount(struct super_block *sb, char *options, int silent,
@@ -417,7 +418,7 @@ static inline int get_gid(struct vfsmount *mnt,
 {
 	struct sdcardfs_vfsmount_options *opts = mnt->data;
 
-	if (opts->gid == AID_SDCARD_RW)
+	if (opts->gid == AID_SDCARD_RW && !opts->default_normal)
 		/* As an optimization, certain trusted system components only run
 		 * as owner but operate across all users. Since we're now handing
 		 * out the sdcard_rw GID only to trusted apps, we're okay relaxing
diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c
index b89947d878e3..a28b40f5adc8 100644
--- a/fs/sdcardfs/super.c
+++ b/fs/sdcardfs/super.c
@@ -304,6 +304,8 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m,
 		seq_printf(m, ",userid=%u", opts->fs_user_id);
 	if (opts->gid_derivation)
 		seq_puts(m, ",derive_gid");
+	if (vfsopts->default_normal)
+		seq_puts(m, ",default_normal");
 	if (opts->reserved_mb != 0)
 		seq_printf(m, ",reserved=%uMB", opts->reserved_mb);
 

From 0fa147b407478e73fe7a478677ff2b12bb824014 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:33 -0500
Subject: [PATCH 03/44] x86/boot: Add early cmdline parsing for options with
 arguments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit e505371dd83963caae1a37ead9524e8d997341be upstream.

Add a cmdline_find_option() function to look for cmdline options that
take arguments. The argument is returned in a supplied buffer and the
argument length (regardless of whether it fits in the supplied buffer)
is returned, with -1 indicating not found.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/cmdline.h |   2 +
 arch/x86/lib/cmdline.c         | 105 +++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+			char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 422db000d727..a744506856b1 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
 
 	return 0;	/* Buffer overrun */
 }
+
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+		      const char *option, char *buffer, int bufsize)
+{
+	char c;
+	int pos = 0, len = -1;
+	const char *opptr = NULL;
+	char *bufptr = buffer;
+	enum {
+		st_wordstart = 0,	/* Start of word/after whitespace */
+		st_wordcmp,	/* Comparing this word */
+		st_wordskip,	/* Miscompare, skip */
+		st_bufcpy,	/* Copying this to buffer */
+	} state = st_wordstart;
+
+	if (!cmdline)
+		return -1;      /* No command line */
+
+	/*
+	 * This 'pos' check ensures we do not overrun
+	 * a non-NULL-terminated 'cmdline'
+	 */
+	while (pos++ < max_cmdline_size) {
+		c = *(char *)cmdline++;
+		if (!c)
+			break;
+
+		switch (state) {
+		case st_wordstart:
+			if (myisspace(c))
+				break;
+
+			state = st_wordcmp;
+			opptr = option;
+			/* fall through */
+
+		case st_wordcmp:
+			if ((c == '=') && !*opptr) {
+				/*
+				 * We matched all the way to the end of the
+				 * option we were looking for, prepare to
+				 * copy the argument.
+				 */
+				len = 0;
+				bufptr = buffer;
+				state = st_bufcpy;
+				break;
+			} else if (c == *opptr++) {
+				/*
+				 * We are currently matching, so continue
+				 * to the next character on the cmdline.
+				 */
+				break;
+			}
+			state = st_wordskip;
+			/* fall through */
+
+		case st_wordskip:
+			if (myisspace(c))
+				state = st_wordstart;
+			break;
+
+		case st_bufcpy:
+			if (myisspace(c)) {
+				state = st_wordstart;
+			} else {
+				/*
+				 * Increment len, but don't overrun the
+				 * supplied buffer and leave room for the
+				 * NULL terminator.
+				 */
+				if (++len < bufsize)
+					*bufptr++ = c;
+			}
+			break;
+		}
+	}
+
+	if (bufsize)
+		*bufptr = '\0';
+
+	return len;
+}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+			int bufsize)
+{
+	return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+				     buffer, bufsize);
+}

From 8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40 Mon Sep 17 00:00:00 2001
From: Richard Fellner <richard.fellner@student.tugraz.at>
Date: Thu, 4 May 2017 14:26:50 +0200
Subject: [PATCH 04/44] KAISER: Kernel Address Isolation

This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.

More information about the patch can be found on:

        https://github.com/IAIK/KAISER

From: Richard Fellner <richard.fellner@student.tugraz.at>
From: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
X-Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
Date: Thu, 4 May 2017 14:26:50 +0200
Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5

To: <linux-kernel@vger.kernel.org>
To: <kernel-hardening@lists.openwall.com>
Cc: <clementine.maurice@iaik.tugraz.at>
Cc: <moritz.lipp@iaik.tugraz.at>
Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Cc: Richard Fellner <richard.fellner@student.tugraz.at>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: <kirill.shutemov@linux.intel.com>
Cc: <anders.fogh@gdata-adan.de>

After several recent works [1,2,3] KASLR on x86_64 was basically
considered dead by many researchers. We have been working on an
efficient but effective fix for this problem and found that not mapping
the kernel space when running in user mode is the solution to this
problem [4] (the corresponding paper [5] will be presented at ESSoS17).

With this RFC patch we allow anybody to configure their kernel with the
flag CONFIG_KAISER to add our defense mechanism.

If there are any questions we would love to answer them.
We also appreciate any comments!

Cheers,
Daniel (+ the KAISER team from Graz University of Technology)

[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
[2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
[3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
[4] https://github.com/IAIK/KAISER
[5] https://gruss.cc/files/kaiser.pdf

[patch based also on
https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch]

Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S            |  19 +++-
 arch/x86/entry/entry_64_compat.S     |   6 +
 arch/x86/include/asm/hw_irq.h        |   2 +-
 arch/x86/include/asm/kaiser.h        | 113 +++++++++++++++++++
 arch/x86/include/asm/pgtable.h       |   4 +
 arch/x86/include/asm/pgtable_64.h    |  21 ++++
 arch/x86/include/asm/pgtable_types.h |  12 +-
 arch/x86/include/asm/processor.h     |   7 +-
 arch/x86/kernel/cpu/common.c         |   4 +-
 arch/x86/kernel/espfix_64.c          |   6 +
 arch/x86/kernel/head_64.S            |  16 ++-
 arch/x86/kernel/irqinit.c            |   2 +-
 arch/x86/kernel/process.c            |   2 +-
 arch/x86/mm/Makefile                 |   1 +
 arch/x86/mm/kaiser.c                 | 160 +++++++++++++++++++++++++++
 arch/x86/mm/pageattr.c               |   2 +-
 arch/x86/mm/pgtable.c                |  26 +++++
 include/asm-generic/vmlinux.lds.h    |  11 +-
 include/linux/percpu-defs.h          |  30 +++++
 init/main.c                          |   6 +
 kernel/fork.c                        |   8 ++
 security/Kconfig                     |   7 ++
 22 files changed, 449 insertions(+), 16 deletions(-)
 create mode 100644 arch/x86/include/asm/kaiser.h
 create mode 100644 arch/x86/mm/kaiser.c

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cc0f2f5da19b..273bc5b8d491 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -35,6 +35,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
 	 * it is too small to ever cause noticeable irq latency.
 	 */
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 	/*
 	 * A hypervisor implementation might want to use a label
 	 * after the swapgs, so that it can do the swapgs
@@ -207,9 +209,10 @@ entry_SYSCALL_64_fastpath:
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
-	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
+	RESTORE_C_REGS_EXCEPT_RCX_R11
+	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	/*
 	 * 64-bit SYSRET restores rip from rcx,
@@ -347,10 +350,12 @@ GLOBAL(int_ret_from_sys_call)
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
+	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
@@ -509,6 +514,7 @@ END(irq_entries_start)
 	 * tracking that we're in kernel mode.
 	 */
 	SWAPGS
+	SWITCH_KERNEL_CR3
 
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -568,6 +574,7 @@ GLOBAL(retint_user)
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_regs_and_iret
 
@@ -625,6 +632,7 @@ native_irq_return_ldt:
 	pushq	%rax
 	pushq	%rdi
 	SWAPGS
+	SWITCH_KERNEL_CR3
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* RAX */
 	movq	(2*8)(%rsp), %rax		/* RIP */
@@ -640,6 +648,7 @@ native_irq_return_ldt:
 	andl	$0xffff0000, %eax
 	popq	%rdi
 	orq	PER_CPU_VAR(espfix_stack), %rax
+	SWITCH_USER_CR3
 	SWAPGS
 	movq	%rax, %rsp
 	popq	%rax
@@ -1007,6 +1016,7 @@ ENTRY(paranoid_entry)
 	testl	%edx, %edx
 	js	1f				/* negative -> in kernel */
 	SWAPGS
+	SWITCH_KERNEL_CR3
 	xorl	%ebx, %ebx
 1:	ret
 END(paranoid_entry)
@@ -1029,6 +1039,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	paranoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
+	SWITCH_USER_CR3_NO_STACK
 	SWAPGS_UNSAFE_STACK
 	jmp	paranoid_exit_restore
 paranoid_exit_no_swapgs:
@@ -1058,6 +1069,7 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
+	SWITCH_KERNEL_CR3
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/*
@@ -1110,7 +1122,7 @@ ENTRY(error_entry)
 	 * Switch to kernel gsbase:
 	 */
 	SWAPGS
-
+	SWITCH_KERNEL_CR3
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * as if we faulted immediately after IRET and clear EBX so that
@@ -1210,6 +1222,7 @@ ENTRY(nmi)
 	 */
 
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1250,6 +1263,7 @@ ENTRY(nmi)
 	 * work, because we don't want to enable interrupts.  Fortunately,
 	 * do_nmi doesn't modify pt_regs.
 	 */
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 
@@ -1461,6 +1475,7 @@ end_repeat_nmi:
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
+	SWITCH_USER_CR3_NO_STACK
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 15cfebaa7688..fe1911930b52 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
 
@@ -50,6 +51,7 @@ ENDPROC(native_usergs_sysret32)
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -161,6 +163,7 @@ ENDPROC(entry_SYSENTER_compat)
 ENTRY(entry_SYSCALL_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 
 	/* Stash user ESP and switch to the kernel stack. */
 	movl	%esp, %r8d
@@ -208,6 +211,7 @@ ENTRY(entry_SYSCALL_compat)
 	/* Opportunistic SYSRET */
 sysret32_from_system_call:
 	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
+	SWITCH_USER_CR3
 	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
 	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
 	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
@@ -269,6 +273,7 @@ ENTRY(entry_INT80_compat)
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	ASM_CLAC			/* Do this early to minimize exposure */
 	SWAPGS
+	SWITCH_KERNEL_CR3_NO_STACK
 
 	/*
 	 * User tracing code (ptrace or signal handlers) might assume that
@@ -311,6 +316,7 @@ ENTRY(entry_INT80_compat)
 
 	/* Go back to user mode. */
 	TRACE_IRQS_ON
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_regs_and_iret
 END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 59caa55fb9b5..ee52ff858699 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -187,7 +187,7 @@ extern char irq_entries_start[];
 #define VECTOR_RETRIGGERED	((void *)~0UL)
 
 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..63ee8309b35b
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory.
+ * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped,
+ * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
+ * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions
+ * of the user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// Upon kernel/user mode switch, it may happen that
+// the address space has to be switched before the registers have been stored.
+// To change the address space, another register is needed.
+// A register therefore has to be stored/restored.
+//
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: The mapping flags of the pages
+ *
+ *  the mapping is done on a global scope, so no bigger synchronization has to be done.
+ *  the pages have to be manually unmapped again when they are not needed any longer.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  shadowmem_initialize_mapping - Initalize the shadow mapping
+ *
+ *  most parts of the shadow mapping can be mapped upon boot time.
+ *  only the thread stacks have to be mapped on runtime.
+ *  the mapped regions are not unmapped at all.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b2e9df..6a843d254114 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -856,6 +856,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+	// clone the shadow pgd part as well
+	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2ee781114d34..2131edda6620 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
 	native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_KAISER
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
+}
+#endif /* CONFIG_KAISER */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_KAISER
+	// We know that a pgd is page aligned.
+	// Therefore the lower indices have to be mapped to user space.
+	// These pages are mapped to the shadow mapping.
+	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+	}
+
+	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
 	*pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 79c91853e50e..984670491901 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -39,7 +39,11 @@
 #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -89,7 +93,11 @@
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
 			 _PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2d5a50cb61a2..6a2e0a0b4a96 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -305,7 +305,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -332,6 +332,11 @@ union irq_stack_union {
 		char gs_base[40];
 		unsigned long stack_canary;
 	};
+
+	struct {
+		char irq_stack_pointer[64];
+		char unused[IRQ_STACK_SIZE - 64];
+	};
 };
 
 DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa1e7246b06b..e5ba9701ee6c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
 	/*
 	 * We need valid kernel segments for data and code in long mode too
@@ -1229,7 +1229,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
 	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4d38416e2a7f..bd1358d55146 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
 	/* Install the espfix pud into the kernel page directory */
 	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+	// add the esp stack pud to the shadow mapping here.
+	// This can be done directly, because the fixup stack has its own pud
+	set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
 
 	/* Randomize the locations */
 	init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e860390..0a8d18ac63eb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -441,6 +441,14 @@ early_idt_ripmsg:
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -450,7 +458,7 @@ GLOBAL(name)
 	.endr
 
 	__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
 	.fill	511,8,0
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
@@ -460,10 +468,10 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
-	.fill	512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+	.fill	2*512,8,0
 #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1b0312..f480b38a03c3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
 	.flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
 	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f7c21c22477..7c5c5dc90ffa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,7 +39,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
 		.sp0 = TOP_OF_INIT_STACK,
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1ae7c141f778..978156081bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
+obj-$(CONFIG_KAISER)		+= kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..cf1bb922d467
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,160 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/**
+ * Get the real ppn from a address in kernel mapping.
+ * @param address The virtual adrress
+ * @return the physical address
+ */
+static inline unsigned long get_pa_from_mapping (unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset_k(address);
+	BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+	pud = pud_offset(pgd, address);
+	BUG_ON(pud_none(*pud));
+
+	if (pud_large(*pud)) {
+		return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+	}
+
+	pmd = pmd_offset(pud, address);
+	BUG_ON(pmd_none(*pmd));
+
+	if (pmd_large(*pmd)) {
+		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	BUG_ON(pte_none(*pte));
+
+	return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+void _kaiser_copy (unsigned long start_addr, unsigned long size,
+					unsigned long flags)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long address;
+	unsigned long end_addr = start_addr + size;
+	unsigned long target_address;
+
+	for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
+			address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
+		target_address = get_pa_from_mapping(address);
+
+		pgd = native_get_shadow_pgd(pgd_offset_k(address));
+
+		BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+		BUG_ON(pgd_large(*pgd));
+
+		pud = pud_offset(pgd, address);
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+		}
+		BUG_ON(pud_large(*pud));
+
+		pmd = pmd_offset(pud, address);
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
+		}
+		BUG_ON(pmd_large(*pmd));
+
+		pte = pte_offset_kernel(pmd, address);
+		if (pte_none(*pte)) {
+			set_pte(pte, __pte(flags | target_address));
+		} else {
+			BUG_ON(__pa(pte_page(*pte)) != target_address);
+		}
+	}
+}
+
+// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
+static inline void __init _kaiser_init(void)
+{
+	pgd_t *pgd;
+	int i = 0;
+
+	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+		set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+	}
+}
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+spinlock_t shadow_table_lock;
+void __init kaiser_init(void)
+{
+	int cpu;
+	spin_lock_init(&shadow_table_lock);
+
+	spin_lock(&shadow_table_lock);
+
+	_kaiser_init();
+
+	for_each_possible_cpu(cpu) {
+		// map the per cpu user variables
+		_kaiser_copy(
+				(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+				(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+				__PAGE_KERNEL);
+	}
+
+	// map the entry/exit text section, which is responsible to switch between user- and kernel mode
+	_kaiser_copy(
+			(unsigned long) __entry_text_start,
+			(unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+			__PAGE_KERNEL_RX);
+
+	// the fixed map address of the idt_table
+	_kaiser_copy(
+			(unsigned long) idt_descr.address,
+			sizeof(gate_desc) * NR_VECTORS,
+			__PAGE_KERNEL_RO);
+
+	spin_unlock(&shadow_table_lock);
+}
+
+// add a mapping to the shadow-mapping, and synchronize the mappings
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+	spin_lock(&shadow_table_lock);
+	_kaiser_copy(addr, size, flags);
+	spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+	spin_lock(&shadow_table_lock);
+	do {
+		unmap_pud_range(pgd, start, start + size);
+	} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
+	spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b599a780a5a9..4fe616bd2586 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -829,7 +829,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 			pud_clear(pud);
 }
 
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 {
 	pud_t *pud = pud_offset(pgd, start);
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..087d3e1e7125 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -342,12 +342,38 @@ static inline void _pgd_free(pgd_t *pgd)
 #else
 static inline pgd_t *_pgd_alloc(void)
 {
+#ifdef CONFIG_KAISER
+	// Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+	// block. Therefore, we have to allocate at least 3 pages. However, the
+	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+	// the beginning of the page of our 8kb-aligned memory block in order to
+	// correctly free it afterwars.
+
+	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+	{
+		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+		return (pgd_t *) pages;
+	}
+	else
+	{
+		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+		return (pgd_t *) (pages + PAGE_SIZE);
+	}
+#else
 	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
+#ifdef CONFIG_KAISER
+  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+	free_pages(pages, get_order(4*PAGE_SIZE));
+#else
 	free_page((unsigned long)pgd);
+#endif
 }
 #endif /* CONFIG_X86_PAE */
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ef2e8c97e183..cc1c6628e0ae 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -725,7 +725,16 @@
  */
 #define PERCPU_INPUT(cacheline)						\
 	VMLINUX_SYMBOL(__per_cpu_start) = .;				\
-	*(.data..percpu..first)						\
+	\
+	VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;        \
+	*(.data..percpu..first)           \
+	. = ALIGN(cacheline);           \
+	*(.data..percpu..user_mapped)            \
+	*(.data..percpu..user_mapped..shared_aligned)        \
+	. = ALIGN(PAGE_SIZE);           \
+	*(.data..percpu..user_mapped..page_aligned)          \
+	VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;        \
+	\
 	. = ALIGN(PAGE_SIZE);						\
 	*(.data..percpu..page_aligned)					\
 	. = ALIGN(cacheline);						\
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299ca068..8ea945f63a05 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
 
 #endif
 
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * Base implementations of per-CPU variable declarations and definitions, where
  * the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
 #define DEFINE_PER_CPU(type, name)					\
 	DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)         \
+	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)          \
+	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
@@ -144,6 +156,14 @@
 	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)			\
+	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+	____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)			\
+	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+	____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)				\
 	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
 	____cacheline_aligned
@@ -162,6 +182,16 @@
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)				\
 	DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")		\
 	__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)      \
+  DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")   \
+  __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)       \
+  DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")    \
+  __aligned(PAGE_SIZE)
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c
index 9e64d7097f1a..0cc5c4b6841f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -87,6 +87,9 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#endif
 
 static int kernel_init(void *);
 
@@ -492,6 +495,9 @@ static void __init mm_init(void)
 	pgtable_init();
 	vmalloc_init();
 	ioremap_huge_init();
+#ifdef CONFIG_KAISER
+	kaiser_init();
+#endif
 }
 
 asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 68cfda1c1800..8f1931f5b0a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -167,8 +167,12 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 	return page ? page_address(page) : NULL;
 }
 
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
 static inline void free_thread_info(struct thread_info *ti)
 {
+#ifdef CONFIG_KAISER
+	kaiser_remove_mapping((unsigned long)ti, THREAD_SIZE);
+#endif
 	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
@@ -331,6 +335,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 }
 
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
@@ -352,6 +357,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 		goto free_ti;
 
 	tsk->stack = ti;
+#ifdef CONFIG_KAISER
+	kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
diff --git a/security/Kconfig b/security/Kconfig
index e45237897b43..cb2a9bcff063 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,6 +30,13 @@ config SECURITY
 	  model will be used.
 
 	  If you are unsure how to answer this question, answer N.
+config KAISER
+	bool "Remove the kernel mapping in user mode"
+	depends on X86_64
+	depends on !PARAVIRT
+	help
+	  This enforces a strict kernel and user space isolation in order to close
+	  hardware side channels on kernel address information.
 
 config SECURITYFS
 	bool "Enable the securityfs filesystem"

From bed9bb7f3e6d4045013d2bb9e4004896de57f02b Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 30 Aug 2017 16:23:00 -0700
Subject: [PATCH 05/44] kaiser: merged update

Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S            | 106 +++++++--
 arch/x86/include/asm/kaiser.h        |  43 ++--
 arch/x86/include/asm/pgtable.h       |  18 +-
 arch/x86/include/asm/pgtable_64.h    |  48 +++-
 arch/x86/include/asm/pgtable_types.h |   6 +-
 arch/x86/kernel/espfix_64.c          |  13 +-
 arch/x86/kernel/head_64.S            |  19 +-
 arch/x86/kernel/ldt.c                |  27 ++-
 arch/x86/kernel/tracepoint.c         |   2 +
 arch/x86/mm/kaiser.c                 | 316 ++++++++++++++++++++-------
 arch/x86/mm/pageattr.c               |  63 ++++--
 arch/x86/mm/pgtable.c                |  42 ++--
 include/linux/kaiser.h               |  26 +++
 kernel/fork.c                        |   9 +-
 security/Kconfig                     |   5 +
 15 files changed, 553 insertions(+), 190 deletions(-)
 create mode 100644 include/linux/kaiser.h

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 273bc5b8d491..99bce319df0c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath:
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
 	RESTORE_C_REGS_EXCEPT_RCX_R11
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel.  This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs.  Normal interrupts are OK because
+	 * they are off here.
+	 */
 	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	/*
@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call)
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel.  This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs.  Normal interrupts are OK because
+	 * they are off here.
+	 */
 	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel.  This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs.  Normal interrupts are OK because
+	 * they are off here.
+	 */
 	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
@@ -1059,6 +1080,13 @@ ENTRY(error_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	/*
+	 * error_entry() always returns with a kernel gsbase and
+	 * CR3.  We must also have a kernel CR3/gsbase before
+	 * calling TRACE_IRQS_*.  Just unconditionally switch to
+	 * the kernel CR3 here.
+	 */
+	SWITCH_KERNEL_CR3
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
@@ -1069,7 +1097,6 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
-	SWITCH_KERNEL_CR3
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/*
@@ -1122,7 +1149,7 @@ ENTRY(error_entry)
 	 * Switch to kernel gsbase:
 	 */
 	SWAPGS
-	SWITCH_KERNEL_CR3
+
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * as if we faulted immediately after IRET and clear EBX so that
@@ -1222,7 +1249,10 @@ ENTRY(nmi)
 	 */
 
 	SWAPGS_UNSAFE_STACK
-	SWITCH_KERNEL_CR3_NO_STACK
+	/*
+	 * percpu variables are mapped with user CR3, so no need
+	 * to switch CR3 here.
+	 */
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1256,14 +1286,33 @@ ENTRY(nmi)
 
 	movq	%rsp, %rdi
 	movq	$-1, %rsi
+#ifdef CONFIG_KAISER
+	/* Unconditionally use kernel CR3 for do_nmi() */
+	/* %rax is saved above, so OK to clobber here */
+	movq	%cr3, %rax
+	pushq	%rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+	andq	$(~0x1000), %rax
+#endif
+	movq	%rax, %cr3
+#endif
 	call	do_nmi
+	/*
+	 * Unconditionally restore CR3.  I know we return to
+	 * kernel code that needs user CR3, but do we ever return
+	 * to "user mode" where we need the kernel CR3?
+	 */
+#ifdef CONFIG_KAISER
+	popq	%rax
+	mov	%rax, %cr3
+#endif
 
 	/*
 	 * Return back to user mode.  We must *not* do the normal exit
-	 * work, because we don't want to enable interrupts.  Fortunately,
-	 * do_nmi doesn't modify pt_regs.
+	 * work, because we don't want to enable interrupts.  Do not
+	 * switch to user CR3: we might be going back to kernel code
+	 * that had a user CR3 set.
 	 */
-	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 
@@ -1459,23 +1508,54 @@ end_repeat_nmi:
 	ALLOC_PT_GPREGS_ON_STACK
 
 	/*
-	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-	 * as we should not be calling schedule in NMI context.
-	 * Even with normal interrupts enabled. An NMI should not be
-	 * setting NEED_RESCHED or anything that normal interrupts and
-	 * exceptions might do.
+	 * Use the same approach as paranoid_entry to handle SWAPGS, but
+	 * without CR3 handling since we do that differently in NMIs.  No
+	 * need to use paranoid_exit as we should not be calling schedule
+	 * in NMI context.  Even with normal interrupts enabled. An NMI
+	 * should not be setting NEED_RESCHED or anything that normal
+	 * interrupts and exceptions might do.
 	 */
-	call	paranoid_entry
+	cld
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
+	SWAPGS
+	xorl	%ebx, %ebx
+1:
+#ifdef CONFIG_KAISER
+	/* Unconditionally use kernel CR3 for do_nmi() */
+	/* %rax is saved above, so OK to clobber here */
+	movq	%cr3, %rax
+	pushq	%rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+	andq	$(~0x1000), %rax
+#endif
+	movq	%rax, %cr3
+#endif
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq	%rsp, %rdi
+	addq	$8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
 	movq	$-1, %rsi
 	call	do_nmi
+	/*
+	 * Unconditionally restore CR3.  We might be returning to
+	 * kernel code that needs user CR3, like just just before
+	 * a sysret.
+	 */
+#ifdef CONFIG_KAISER
+	popq	%rax
+	mov	%rax, %cr3
+#endif
 
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
-	SWITCH_USER_CR3_NO_STACK
+	/* We fixed up CR3 above, so no need to switch it here */
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_EXTRA_REGS
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 63ee8309b35b..0703f48777f3 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -16,13 +16,17 @@
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
 andq $(~0x1000), \reg
+#endif
 movq \reg, %cr3
 .endm
 
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
 orq $(0x1000), \reg
+#endif
 movq \reg, %cr3
 .endm
 
@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 .endm
 
 #endif /* CONFIG_KAISER */
+
 #else /* __ASSEMBLY__ */
 
 
 #ifdef CONFIG_KAISER
-// Upon kernel/user mode switch, it may happen that
-// the address space has to be switched before the registers have been stored.
-// To change the address space, another register is needed.
-// A register therefore has to be stored/restored.
-//
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored.  To change the address space, another register is
+ * needed.  A register therefore has to be stored/restored.
+*/
+
 DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
-#endif /* CONFIG_KAISER */
-
 /**
- *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
  *  @addr: the start address of the range
  *  @size: the size of the range
  *  @flags: The mapping flags of the pages
  *
- *  the mapping is done on a global scope, so no bigger synchronization has to be done.
- *  the pages have to be manually unmapped again when they are not needed any longer.
+ *  The mapping is done on a global scope, so no bigger
+ *  synchronization has to be done.  the pages have to be
+ *  manually unmapped again when they are not needed any longer.
  */
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 
 
 /**
- *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
  *  @addr: the start address of the range
  *  @size: the size of the range
  */
 extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
 
 /**
- *  shadowmem_initialize_mapping - Initalize the shadow mapping
+ *  kaiser_initialize_mapping - Initalize the shadow mapping
  *
- *  most parts of the shadow mapping can be mapped upon boot time.
- *  only the thread stacks have to be mapped on runtime.
- *  the mapped regions are not unmapped at all.
+ *  Most parts of the shadow mapping can be mapped upon boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and nevertunmapped.
  */
 extern void kaiser_init(void);
 
-#endif
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY */
 
 
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6a843d254114..176035fa057e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	pgdval_t ignore_flags = _PAGE_USER;
+	/*
+	 * We set NX on KAISER pgds that map userspace memory so
+	 * that userspace can not meaningfully use the kernel
+	 * page table by accident; it will fault on the first
+	 * instruction it tries to run.  See native_set_pgd().
+	 */
+	if (IS_ENABLED(CONFIG_KAISER))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
 #ifdef CONFIG_KAISER
-	// clone the shadow pgd part as well
-	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+	/* Clone the shadow pgd part as well */
+	memcpy(native_get_shadow_pgd(dst),
+	       native_get_shadow_pgd(src),
+	       count * sizeof(pgd_t));
 #endif
 }
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2131edda6620..b0adf0d11c9a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
 }
 
 #ifdef CONFIG_KAISER
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
 	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
 }
 
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
 	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
 }
+#else
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
+	BUILD_BUG_ON(1);
+	return NULL;
+}
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
+	return pgdp;
+}
 #endif /* CONFIG_KAISER */
 
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 #ifdef CONFIG_KAISER
-	// We know that a pgd is page aligned.
-	// Therefore the lower indices have to be mapped to user space.
-	// These pages are mapped to the shadow mapping.
-	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+	pteval_t extra_kern_pgd_flags = 0;
+	/* Do we need to also populate the shadow pgd? */
+	if (is_userspace_pgd(pgdp)) {
 		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+		/*
+		 * Even if the entry is *mapping* userspace, ensure
+		 * that userspace can not use it.  This way, if we
+		 * get out to userspace running on the kernel CR3,
+		 * userspace will crash instead of running.
+		 */
+		extra_kern_pgd_flags = _PAGE_NX;
 	}
-
-	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+	pgdp->pgd = pgd.pgd;
+	pgdp->pgd |= extra_kern_pgd_flags;
 #else /* CONFIG_KAISER */
 	*pgdp = pgd;
 #endif
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 984670491901..a70d6100b3df 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -42,7 +42,7 @@
 #ifdef CONFIG_KAISER
 #define _PAGE_GLOBAL	(_AT(pteval_t, 0))
 #else
-#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
@@ -93,11 +93,7 @@
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #endif
 
-#ifdef CONFIG_KAISER
-#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
-#else
 #define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#endif
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
 			 _PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index bd1358d55146..54de4c70cbd6 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
 	/* Install the espfix pud into the kernel page directory */
 	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-#ifdef CONFIG_KAISER
-	// add the esp stack pud to the shadow mapping here.
-	// This can be done directly, because the fixup stack has its own pud
-	set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
-#endif
+	/*
+	 * Just copy the top-level PGD that is mapping the espfix
+	 * area to ensure it is mapped into the shadow user page
+	 * tables.
+	 */
+	if (IS_ENABLED(CONFIG_KAISER))
+		set_pgd(native_get_shadow_pgd(pgd_p),
+			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
 
 	/* Randomize the locations */
 	init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0a8d18ac63eb..abbbcfbb5e26 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -442,11 +442,24 @@ early_idt_ripmsg:
 GLOBAL(name)
 
 #ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
 #define NEXT_PGD_PAGE(name) \
 	.balign 2 * PAGE_SIZE; \
 GLOBAL(name)
 #else
 #define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL	0
 #endif
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
@@ -461,6 +474,7 @@ GLOBAL(name)
 NEXT_PGD_PAGE(early_level4_pgt)
 	.fill	511,8,0
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts)
 
 #ifndef CONFIG_XEN
 NEXT_PGD_PAGE(init_level4_pgt)
-	.fill	2*512,8,0
+	.fill	512,8,0
+	.fill	KAISER_USER_PGD_FILL,8,0
 #else
 NEXT_PGD_PAGE(init_level4_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
 	.org    init_level4_pgt + L4_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #endif
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d6279593bcdd..1062e88ff040 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/kaiser.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
 	set_ldt(pc->ldt->entries, pc->ldt->size);
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree(ldt->entries);
+	else
+		free_page((unsigned long)ldt->entries);
+	kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(int size)
 {
 	struct ldt_struct *new_ldt;
 	int alloc_size;
+	int ret = 0;
 
 	if (size > LDT_ENTRIES)
 		return NULL;
@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
 		return NULL;
 	}
 
+	// FIXME: make kaiser_add_mapping() return an error code
+	// when it fails
+	kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+			   __PAGE_KERNEL);
+	if (ret) {
+		__free_ldt_struct(new_ldt);
+		return NULL;
+	}
 	new_ldt->size = size;
 	return new_ldt;
 }
@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 	if (likely(!ldt))
 		return;
 
+	kaiser_remove_mapping((unsigned long)ldt->entries,
+			      ldt->size * LDT_ENTRY_SIZE);
 	paravirt_free_ldt(ldt->entries, ldt->size);
-	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-		vfree(ldt->entries);
-	else
-		free_page((unsigned long)ldt->entries);
-	kfree(ldt);
+	__free_ldt_struct(ldt);
 }
 
 /*
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..2bb5ee464df3 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
 #include <linux/atomic.h>
 
 atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
 struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
 				(unsigned long) trace_idt_table };
 
 /* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
 gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
 
 static int trace_irq_vector_refcount;
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index cf1bb922d467..960071d258c3 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -1,160 +1,306 @@
-
-
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
-
 #include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include <asm/kaiser.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
 #ifdef CONFIG_KAISER
 
 __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
-
-/**
- * Get the real ppn from a address in kernel mapping.
- * @param address The virtual adrress
- * @return the physical address
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
  */
-static inline unsigned long get_pa_from_mapping (unsigned long address)
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
-	pgd = pgd_offset_k(address);
-	BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+	pgd = pgd_offset_k(vaddr);
+	/*
+	 * We made all the kernel PGDs present in kaiser_init().
+	 * We expect them to stay that way.
+	 */
+	BUG_ON(pgd_none(*pgd));
+	/*
+	 * PGDs are either 512GB or 128TB on all x86_64
+	 * configurations.  We don't handle these.
+	 */
+	BUG_ON(pgd_large(*pgd));
+
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	if (pud_large(*pud))
+		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	if (pmd_large(*pmd))
+		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (pte_none(*pte)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
+{
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	might_sleep();
+	if (is_atomic) {
+		gfp &= ~GFP_KERNEL;
+		gfp |= __GFP_HIGH | __GFP_ATOMIC;
+	}
+
+	if (pgd_none(*pgd)) {
+		WARN_ONCE(1, "All shadow pgds should have been populated");
+		return NULL;
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
 
 	pud = pud_offset(pgd, address);
-	BUG_ON(pud_none(*pud));
-
+	/* The shadow page tables do not use large mappings: */
 	if (pud_large(*pud)) {
-		return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+		spin_lock(&shadow_table_allocation_lock);
+		if (pud_none(*pud))
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+		else
+			free_page(new_pmd_page);
+		spin_unlock(&shadow_table_allocation_lock);
 	}
 
 	pmd = pmd_offset(pud, address);
-	BUG_ON(pmd_none(*pmd));
-
+	/* The shadow page tables do not use large mappings: */
 	if (pmd_large(*pmd)) {
-		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+		spin_lock(&shadow_table_allocation_lock);
+		if (pmd_none(*pmd))
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+		else
+			free_page(new_pte_page);
+		spin_unlock(&shadow_table_allocation_lock);
 	}
 
-	pte = pte_offset_kernel(pmd, address);
-	BUG_ON(pte_none(*pte));
-
-	return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+	return pte_offset_kernel(pmd, address);
 }
 
-void _kaiser_copy (unsigned long start_addr, unsigned long size,
-					unsigned long flags)
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+			unsigned long flags)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
+	int ret = 0;
 	pte_t *pte;
-	unsigned long address;
-	unsigned long end_addr = start_addr + size;
+	unsigned long start_addr = (unsigned long )__start_addr;
+	unsigned long address = start_addr & PAGE_MASK;
+	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
 	unsigned long target_address;
 
-	for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
-			address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
+	for (;address < end_addr; address += PAGE_SIZE) {
 		target_address = get_pa_from_mapping(address);
-
-		pgd = native_get_shadow_pgd(pgd_offset_k(address));
-
-		BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
-		BUG_ON(pgd_large(*pgd));
-
-		pud = pud_offset(pgd, address);
-		if (pud_none(*pud)) {
-			set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+		if (target_address == -1) {
+			ret = -EIO;
+			break;
 		}
-		BUG_ON(pud_large(*pud));
-
-		pmd = pmd_offset(pud, address);
-		if (pmd_none(*pmd)) {
-			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
-		}
-		BUG_ON(pmd_large(*pmd));
-
-		pte = pte_offset_kernel(pmd, address);
+		pte = kaiser_pagetable_walk(address, false);
 		if (pte_none(*pte)) {
 			set_pte(pte, __pte(flags | target_address));
 		} else {
-			BUG_ON(__pa(pte_page(*pte)) != target_address);
+			pte_t tmp;
+			set_pte(&tmp, __pte(flags | target_address));
+			WARN_ON_ONCE(!pte_same(*pte, tmp));
 		}
 	}
+	return ret;
 }
 
-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
-static inline void __init _kaiser_init(void)
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+	unsigned long size = end - start;
+
+	return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
 {
 	pgd_t *pgd;
 	int i = 0;
 
 	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
-		set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+		pgd_t new_pgd;
+		pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+		if (!pud) {
+			WARN_ON(1);
+			break;
+		}
+		new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+		/*
+		 * Make sure not to stomp on some other pgd entry.
+		 */
+		if (!pgd_none(pgd[i])) {
+			WARN_ON(1);
+			continue;
+		}
+		set_pgd(pgd + i, new_pgd);
 	}
 }
 
+#define kaiser_add_user_map_early(start, size, flags) do {	\
+	int __ret = kaiser_add_user_map(start, size, flags);	\
+	WARN_ON(__ret);						\
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
+	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
+	WARN_ON(__ret);							\
+} while (0)
+
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-spinlock_t shadow_table_lock;
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
 void __init kaiser_init(void)
 {
 	int cpu;
-	spin_lock_init(&shadow_table_lock);
 
-	spin_lock(&shadow_table_lock);
-
-	_kaiser_init();
+	kaiser_init_all_pgds();
 
 	for_each_possible_cpu(cpu) {
-		// map the per cpu user variables
-		_kaiser_copy(
-				(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
-				(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
-				__PAGE_KERNEL);
+		void *percpu_vaddr = __per_cpu_user_mapped_start +
+				     per_cpu_offset(cpu);
+		unsigned long percpu_sz = __per_cpu_user_mapped_end -
+					  __per_cpu_user_mapped_start;
+		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+					  __PAGE_KERNEL);
 	}
 
-	// map the entry/exit text section, which is responsible to switch between user- and kernel mode
-	_kaiser_copy(
-			(unsigned long) __entry_text_start,
-			(unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
-			__PAGE_KERNEL_RX);
+	/*
+	 * Map the entry/exit text section, which is needed at
+	 * switches from user to and from kernel.
+	 */
+	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+				       __PAGE_KERNEL_RX);
 
-	// the fixed map address of the idt_table
-	_kaiser_copy(
-			(unsigned long) idt_descr.address,
-			sizeof(gate_desc) * NR_VECTORS,
-			__PAGE_KERNEL_RO);
-
-	spin_unlock(&shadow_table_lock);
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+				       __irqentry_text_end,
+				       __PAGE_KERNEL_RX);
+#endif
+	kaiser_add_user_map_early((void *)idt_descr.address,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+	kaiser_add_user_map_early(&trace_idt_descr,
+				  sizeof(trace_idt_descr),
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&trace_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
+#endif
+	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&debug_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
 }
 
+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
 // add a mapping to the shadow-mapping, and synchronize the mappings
-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 {
-	spin_lock(&shadow_table_lock);
-	_kaiser_copy(addr, size, flags);
-	spin_unlock(&shadow_table_lock);
+	return kaiser_add_user_map((const void *)addr, size, flags);
 }
 
-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
 void kaiser_remove_mapping(unsigned long start, unsigned long size)
 {
-	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
-	spin_lock(&shadow_table_lock);
-	do {
-		unmap_pud_range(pgd, start, start + size);
-	} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
-	spin_unlock(&shadow_table_lock);
+	unsigned long end = start + size;
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PGDIR_SIZE) {
+		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+		/*
+		 * unmap_p4d_range() handles > P4D_SIZE unmaps,
+		 * so no need to trim 'end'.
+		 */
+		unmap_pud_range_nofree(pgd, addr, end);
+	}
 }
 #endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4fe616bd2586..79377e2a7bcd 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
 #define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 	return 0;
 }
 
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
 {
 	int i;
 
+	if (!(cpa->flags & CPA_FREE_PAGETABLES))
+		return false;
+
 	for (i = 0; i < PTRS_PER_PTE; i++)
 		if (!pte_none(pte[i]))
 			return false;
@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
 	return true;
 }
 
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
 {
 	int i;
 
+	if (!(cpa->flags & CPA_FREE_PAGETABLES))
+		return false;
+
 	for (i = 0; i < PTRS_PER_PMD; i++)
 		if (!pmd_none(pmd[i]))
 			return false;
@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
 	return true;
 }
 
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+			    unsigned long start,
+			    unsigned long end)
 {
 	pte_t *pte = pte_offset_kernel(pmd, start);
 
@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
 		pte++;
 	}
 
-	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+	if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
 		pmd_clear(pmd);
 		return true;
 	}
 	return false;
 }
 
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
 			      unsigned long start, unsigned long end)
 {
-	if (unmap_pte_range(pmd, start, end))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+	if (unmap_pte_range(cpa, pmd, start, end))
+		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
 			pud_clear(pud);
 }
 
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+			    unsigned long start, unsigned long end)
 {
 	pmd_t *pmd = pmd_offset(pud, start);
 
@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 		unsigned long pre_end = min_t(unsigned long, end, next_page);
 
-		__unmap_pmd_range(pud, pmd, start, pre_end);
+		__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
 
 		start = pre_end;
 		pmd++;
@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 		if (pmd_large(*pmd))
 			pmd_clear(pmd);
 		else
-			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+			__unmap_pmd_range(cpa, pud, pmd,
+					  start, start + PMD_SIZE);
 
 		start += PMD_SIZE;
 		pmd++;
@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 	 * 4K leftovers?
 	 */
 	if (start < end)
-		return __unmap_pmd_range(pud, pmd, start, end);
+		return __unmap_pmd_range(cpa, pud, pmd, start, end);
 
 	/*
 	 * Try again to free the PMD page if haven't succeeded above.
 	 */
 	if (!pud_none(*pud))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
 			pud_clear(pud);
 }
 
-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+			      unsigned long start,
+			      unsigned long end)
 {
 	pud_t *pud = pud_offset(pgd, start);
 
@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 		unsigned long pre_end	= min_t(unsigned long, end, next_page);
 
-		unmap_pmd_range(pud, start, pre_end);
+		unmap_pmd_range(cpa, pud, start, pre_end);
 
 		start = pre_end;
 		pud++;
@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 		if (pud_large(*pud))
 			pud_clear(pud);
 		else
-			unmap_pmd_range(pud, start, start + PUD_SIZE);
+			unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
 
 		start += PUD_SIZE;
 		pud++;
@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 	 * 2M leftovers?
 	 */
 	if (start < end)
-		unmap_pmd_range(pud, start, end);
+		unmap_pmd_range(cpa, pud, start, end);
 
 	/*
 	 * No need to try to free the PUD page because we'll free it in
@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 	 */
 }
 
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+	struct cpa_data cpa = {
+		.flags = CPA_FREE_PAGETABLES,
+	};
+
+	__unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+	struct cpa_data cpa = {
+		.flags = 0,
+	};
+
+	__unmap_pud_range(&cpa, pgd, start, end);
+}
+
 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd_entry = root + pgd_index(addr);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 087d3e1e7125..e2bd5c81279e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pmd, we aquire two pmds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 static inline pgd_t *_pgd_alloc(void)
 {
-#ifdef CONFIG_KAISER
-	// Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
-	// block. Therefore, we have to allocate at least 3 pages. However, the
-	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
-	// the beginning of the page of our 8kb-aligned memory block in order to
-	// correctly free it afterwars.
-
-	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
-
-	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
-	{
-		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
-		return (pgd_t *) pages;
-	}
-	else
-	{
-		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
-		return (pgd_t *) (pages + PAGE_SIZE);
-	}
-#else
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
-#endif
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-#ifdef CONFIG_KAISER
-  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
-	free_pages(pages, get_order(4*PAGE_SIZE));
-#else
-	free_page((unsigned long)pgd);
-#endif
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
 
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..9db5433c2284
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,26 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 8f1931f5b0a5..3842ceddb7f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
+#include <linux/kaiser.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 }
 
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
@@ -357,9 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 		goto free_ti;
 
 	tsk->stack = ti;
-#ifdef CONFIG_KAISER
-	kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
-#endif
+
+	err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+	if (err)
+		goto free_ti;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
diff --git a/security/Kconfig b/security/Kconfig
index cb2a9bcff063..45cdb0098f38 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -32,12 +32,17 @@ config SECURITY
 	  If you are unsure how to answer this question, answer N.
 config KAISER
 	bool "Remove the kernel mapping in user mode"
+	default y
 	depends on X86_64
 	depends on !PARAVIRT
 	help
 	  This enforces a strict kernel and user space isolation in order to close
 	  hardware side channels on kernel address information.
 
+config KAISER_REAL_SWITCH
+	bool "KAISER: actually switch page tables"
+	default y
+
 config SECURITYFS
 	bool "Enable the securityfs filesystem"
 	help

From edde73205b3fdde8c8a3adfce78cc6d0de72386b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 5 Sep 2017 12:05:01 -0700
Subject: [PATCH 06/44] kaiser: do not set _PAGE_NX on pgd_none

native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must
avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry:
usually that just generated a warning on exit, but sometimes
more mysterious and damaging failures (our production machines
could not complete booting).

The original fix to this just avoided adding _PAGE_NX to
an empty entry; but eventually more problems surfaced with kexec,
and EFI mapping expected to be a problem too.  So now instead
change native_set_pgd() to update shadow only if _PAGE_USER:

A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure)
use set_pgd() to set up a temporary internal virtual address space, with
physical pages remapped at what Kaiser regards as userspace addresses:
Kaiser then assumes a shadow pgd follows, which it will try to corrupt.

This appears to be responsible for the recent kexec and kdump failures;
though it's unclear how those did not manifest as a problem before.
Ah, the shadow pgd will only be assumed to "follow" if the requested
pgd is on an even-numbered page: so I suppose it was going wrong 50%
of the time all along.

What we need is a flag to set_pgd(), to tell it we're dealing with
userspace.  Er, isn't that what the pgd's _PAGE_USER bit is saying?
Add a test for that.  But we cannot do the same for pgd_clear()
(which may be called to clear corrupted entries - set aside the
question of "corrupt in which pgd?" until later), so there just
rely on pgd_clear() not being called in the problematic cases -
with a WARN_ON_ONCE() which should fire half the time if it is.

But this is getting too big for an inline function: move it into
arch/x86/mm/kaiser.c (which then demands a boot/compressed mod);
and de-void and de-space native_get_shadow/normal_pgd() while here.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/boot/compressed/misc.h   |  1 +
 arch/x86/include/asm/pgtable_64.h | 51 ++++++++-----------------------
 arch/x86/mm/kaiser.c              | 42 +++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3783dc3e10b3..4bf52d351022 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
  */
 #undef CONFIG_PARAVIRT
 #undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_KAISER
 #undef CONFIG_KASAN
 
 #include <linux/linkage.h>
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index b0adf0d11c9a..8629be9e6649 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_t *pud)
 }
 
 #ifdef CONFIG_KAISER
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 {
-	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+	return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
 }
 
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
 {
-	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
+	return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
 }
 #else
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 {
 	BUILD_BUG_ON(1);
 	return NULL;
 }
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
 {
 	return pgdp;
 }
 #endif /* CONFIG_KAISER */
 
-/*
- * Page table pages are page-aligned.  The lower half of the top
- * level is used for userspace and the top half for the kernel.
- * This returns true for user pages that need to get copied into
- * both the user and kernel copies of the page tables, and false
- * for kernel pages that should only be in the kernel copy.
- */
-static inline bool is_userspace_pgd(void *__ptr)
-{
-	unsigned long ptr = (unsigned long)__ptr;
-
-	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
-}
-
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-#ifdef CONFIG_KAISER
-	pteval_t extra_kern_pgd_flags = 0;
-	/* Do we need to also populate the shadow pgd? */
-	if (is_userspace_pgd(pgdp)) {
-		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
-		/*
-		 * Even if the entry is *mapping* userspace, ensure
-		 * that userspace can not use it.  This way, if we
-		 * get out to userspace running on the kernel CR3,
-		 * userspace will crash instead of running.
-		 */
-		extra_kern_pgd_flags = _PAGE_NX;
-	}
-	pgdp->pgd = pgd.pgd;
-	pgdp->pgd |= extra_kern_pgd_flags;
-#else /* CONFIG_KAISER */
-	*pgdp = pgd;
-#endif
+	*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 960071d258c3..da2fdd3c7010 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -303,4 +303,46 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size)
 		unmap_pud_range_nofree(pgd, addr, end);
 	}
 }
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	/*
+	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
+	 * skip cases like kexec and EFI which make temporary low mappings.
+	 */
+	if (pgd.pgd & _PAGE_USER) {
+		if (is_userspace_pgd(pgdp)) {
+			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+			/*
+			 * Even if the entry is *mapping* userspace, ensure
+			 * that userspace can not use it.  This way, if we
+			 * get out to userspace running on the kernel CR3,
+			 * userspace will crash instead of running.
+			 */
+			pgd.pgd |= _PAGE_NX;
+		}
+	} else if (!pgd.pgd) {
+		/*
+		 * pgd_clear() cannot check _PAGE_USER, and is even used to
+		 * clear corrupted pgd entries: so just rely on cases like
+		 * kexec and EFI never to be using pgd_clear().
+		 */
+		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+		    is_userspace_pgd(pgdp))
+			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+	}
+	return pgd;
+}
 #endif /* CONFIG_KAISER */

From 003e476716906afa135faf605ae0a5c3598c0293 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 18:57:03 -0700
Subject: [PATCH 07/44] kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE

Kaiser only needs to map one page of the stack; and
kernel/fork.c did not build on powerpc (no __PAGE_KERNEL).
It's all cleaner if linux/kaiser.h provides kaiser_map_thread_stack()
and kaiser_unmap_thread_stack() wrappers around asm/kaiser.h's
kaiser_add_mapping() and kaiser_remove_mapping().  And use
linux/kaiser.h in init/main.c to avoid the #ifdefs there.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kaiser.h | 44 +++++++++++++++++++++++++++++++++---------
 init/main.c            |  6 +-----
 kernel/fork.c          |  7 ++-----
 3 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
index 9db5433c2284..4a4d6d911a14 100644
--- a/include/linux/kaiser.h
+++ b/include/linux/kaiser.h
@@ -1,26 +1,52 @@
-#ifndef _INCLUDE_KAISER_H
-#define _INCLUDE_KAISER_H
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
 
 #ifdef CONFIG_KAISER
 #include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+	/*
+	 * Map that page of kernel stack on which we enter from user context.
+	 */
+	return kaiser_add_mapping((unsigned long)stack +
+			THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+	/*
+	 * Note: may be called even when kaiser_map_thread_stack() failed.
+	 */
+	kaiser_remove_mapping((unsigned long)stack +
+			THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
 #else
 
 /*
  * These stubs are used whenever CONFIG_KAISER is off, which
- * includes architectures that support KAISER, but have it
- * disabled.
+ * includes architectures that support KAISER, but have it disabled.
  */
 
 static inline void kaiser_init(void)
 {
 }
-static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
-{
-}
-static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+static inline int kaiser_add_mapping(unsigned long addr,
+				     unsigned long size, unsigned long flags)
 {
 	return 0;
 }
+static inline void kaiser_remove_mapping(unsigned long start,
+					 unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+	return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
 
 #endif /* !CONFIG_KAISER */
-#endif /* _INCLUDE_KAISER_H */
+#endif /* _LINUX_KAISER_H */
diff --git a/init/main.c b/init/main.c
index 0cc5c4b6841f..49926d95442f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -81,15 +81,13 @@
 #include <linux/integrity.h>
 #include <linux/proc_ns.h>
 #include <linux/io.h>
+#include <linux/kaiser.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
-#ifdef CONFIG_KAISER
-#include <asm/kaiser.h>
-#endif
 
 static int kernel_init(void *);
 
@@ -495,9 +493,7 @@ static void __init mm_init(void)
 	pgtable_init();
 	vmalloc_init();
 	ioremap_huge_init();
-#ifdef CONFIG_KAISER
 	kaiser_init();
-#endif
 }
 
 asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 3842ceddb7f9..ac00f14208b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -168,12 +168,9 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 	return page ? page_address(page) : NULL;
 }
 
-extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
 static inline void free_thread_info(struct thread_info *ti)
 {
-#ifdef CONFIG_KAISER
-	kaiser_remove_mapping((unsigned long)ti, THREAD_SIZE);
-#endif
+	kaiser_unmap_thread_stack(ti);
 	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
@@ -358,7 +355,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 	tsk->stack = ti;
 
-	err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+	err = kaiser_map_thread_stack(tsk->stack);
 	if (err)
 		goto free_ti;
 #ifdef CONFIG_SECCOMP

From 9b94cf97f42ca30fe9b5010900fa6e1d6855a9f6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 17:09:44 -0700
Subject: [PATCH 08/44] kaiser: fix build and FIXME in alloc_ldt_struct()

Include linux/kaiser.h instead of asm/kaiser.h to build ldt.c without
CONFIG_KAISER.  kaiser_add_mapping() does already return an error code,
so fix the FIXME.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/ldt.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 1062e88ff040..89ee37469d41 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -16,9 +16,9 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/kaiser.h>
 
 #include <asm/ldt.h>
-#include <asm/kaiser.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -49,7 +49,7 @@ static struct ldt_struct *alloc_ldt_struct(int size)
 {
 	struct ldt_struct *new_ldt;
 	int alloc_size;
-	int ret = 0;
+	int ret;
 
 	if (size > LDT_ENTRIES)
 		return NULL;
@@ -77,10 +77,8 @@ static struct ldt_struct *alloc_ldt_struct(int size)
 		return NULL;
 	}
 
-	// FIXME: make kaiser_add_mapping() return an error code
-	// when it fails
-	kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
-			   __PAGE_KERNEL);
+	ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+				 __PAGE_KERNEL);
 	if (ret) {
 		__free_ldt_struct(new_ldt);
 		return NULL;

From d94df20135ccfdfb77b1479c501564e9b4ab5bc9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 13 Sep 2017 14:03:10 -0700
Subject: [PATCH 09/44] kaiser: KAISER depends on SMP

It is absurd that KAISER should depend on SMP, but apparently nobody
has tried a UP build before: which breaks on implicit declaration of
function 'per_cpu_offset' in arch/x86/mm/kaiser.c.

Now, you would expect that to be trivially fixed up; but looking at
the System.map when that block is #ifdef'ed out of kaiser_init(),
I see that in a UP build __per_cpu_user_mapped_end is precisely at
__per_cpu_user_mapped_start, and the items carefully gathered into
that section for user-mapping on SMP, dispersed elsewhere on UP.

So, some other kind of section assignment will be needed on UP,
but implementing that is not a priority: just make KAISER depend
on SMP for now.

Also inserted a blank line before the option, tidied up the
brief Kconfig help message, and added an "If unsure, Y".

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 security/Kconfig | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/security/Kconfig b/security/Kconfig
index 45cdb0098f38..0d54e3cc4586 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,14 +30,16 @@ config SECURITY
 	  model will be used.
 
 	  If you are unsure how to answer this question, answer N.
+
 config KAISER
 	bool "Remove the kernel mapping in user mode"
 	default y
-	depends on X86_64
-	depends on !PARAVIRT
+	depends on X86_64 && SMP && !PARAVIRT
 	help
-	  This enforces a strict kernel and user space isolation in order to close
-	  hardware side channels on kernel address information.
+	  This enforces a strict kernel and user space isolation, in order
+	  to close hardware side channels on kernel address information.
+
+	  If you are unsure how to answer this question, answer Y.
 
 config KAISER_REAL_SWITCH
 	bool "KAISER: actually switch page tables"

From 487f0b73d82611a2dc48d7d78409e2e9d994006a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 21 Sep 2017 20:39:56 -0700
Subject: [PATCH 10/44] kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER

pjt has observed that nmi's second (nmi_from_kernel) call to do_nmi()
adjusted the %rdi regs arg, rightly when CONFIG_KAISER, but wrongly
when not CONFIG_KAISER.

Although the minimal change is to add an #ifdef CONFIG_KAISER around
the addq line, that looks cluttered, and I prefer how the first call
to do_nmi() handled it: prepare args in %rdi and %rsi before getting
into the CONFIG_KAISER block, since it does not touch them at all.

And while we're here, place the "#ifdef CONFIG_KAISER" that follows
each, to enclose the "Unconditionally restore CR3" comment: matching
how the "Unconditionally use kernel CR3" comment above is enclosed.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 99bce319df0c..0942aa6d16a8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1297,12 +1297,13 @@ ENTRY(nmi)
 	movq	%rax, %cr3
 #endif
 	call	do_nmi
+
+#ifdef CONFIG_KAISER
 	/*
 	 * Unconditionally restore CR3.  I know we return to
 	 * kernel code that needs user CR3, but do we ever return
 	 * to "user mode" where we need the kernel CR3?
 	 */
-#ifdef CONFIG_KAISER
 	popq	%rax
 	mov	%rax, %cr3
 #endif
@@ -1526,6 +1527,8 @@ end_repeat_nmi:
 	SWAPGS
 	xorl	%ebx, %ebx
 1:
+	movq	%rsp, %rdi
+	movq	$-1, %rsi
 #ifdef CONFIG_KAISER
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
@@ -1538,16 +1541,14 @@ end_repeat_nmi:
 #endif
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-	movq	%rsp, %rdi
-	addq	$8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
-	movq	$-1, %rsi
 	call	do_nmi
+
+#ifdef CONFIG_KAISER
 	/*
 	 * Unconditionally restore CR3.  We might be returning to
 	 * kernel code that needs user CR3, like just just before
 	 * a sysret.
 	 */
-#ifdef CONFIG_KAISER
 	popq	%rax
 	mov	%rax, %cr3
 #endif

From 20cbe9a3aa2e341824da57ce0ac6d52cbffaa570 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 23 Aug 2017 14:21:14 -0700
Subject: [PATCH 11/44] kaiser: fix perf crashes

Avoid perf crashes: place debug_store in the user-mapped per-cpu area
instead of allocating, and use page allocator plus kaiser_add_mapping()
to keep the BTS and PEBS buffers user-mapped (that is, present in the
user mapping, though visible only to kernel and hardware).  The PEBS
fixup buffer does not need this treatment.

The need for a user-mapped struct debug_store showed up before doing
any conscious perf testing: in a couple of kernel paging oopses on
Westmere, implicating the debug_store offset of the per-cpu area.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 57 ++++++++++++++++++-----
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1e7de3cefc9c..4e334e3ac16f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -2,11 +2,15 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/kaiser.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
 #include "perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 
@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+	unsigned int order = get_order(size);
+	struct page *page;
+	unsigned long addr;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	if (!page)
+		return NULL;
+	addr = (unsigned long)page_address(page);
+	if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+		__free_pages(page, order);
+		addr = 0;
+	}
+	return (void *)addr;
+#else
+	return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+	if (!buffer)
+		return;
+	kaiser_remove_mapping((unsigned long)buffer, size);
+	free_pages((unsigned long)buffer, get_order(size));
+#else
+	kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree(buffer, x86_pmu.pebs_buffer_size);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+			x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
 }
 
@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
 	if (!ds || !x86_pmu.bts)
 		return;
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
 
 	return 0;
@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
 		return;
 
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 
 void release_ds_buffers(void)

From 407c3ff6a24c7cb418b77a124d17e282f9622037 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 18:48:02 -0700
Subject: [PATCH 12/44] kaiser: ENOMEM if kaiser_pagetable_walk() NULL

kaiser_add_user_map() took no notice when kaiser_pagetable_walk() failed.
And avoid its might_sleep() when atomic (though atomic at present unused).

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index da2fdd3c7010..57e3a7dd4708 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -99,11 +99,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 
-	might_sleep();
 	if (is_atomic) {
 		gfp &= ~GFP_KERNEL;
 		gfp |= __GFP_HIGH | __GFP_ATOMIC;
-	}
+	} else
+		might_sleep();
 
 	if (pgd_none(*pgd)) {
 		WARN_ONCE(1, "All shadow pgds should have been populated");
@@ -160,13 +160,17 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
 	unsigned long target_address;
 
-	for (;address < end_addr; address += PAGE_SIZE) {
+	for (; address < end_addr; address += PAGE_SIZE) {
 		target_address = get_pa_from_mapping(address);
 		if (target_address == -1) {
 			ret = -EIO;
 			break;
 		}
 		pte = kaiser_pagetable_walk(address, false);
+		if (!pte) {
+			ret = -ENOMEM;
+			break;
+		}
 		if (pte_none(*pte)) {
 			set_pte(pte, __pte(flags | target_address));
 		} else {

From 5fbd46c4be78174656b52e1b04d3057a5dd7af66 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 19:18:07 -0700
Subject: [PATCH 13/44] kaiser: tidied up asm/kaiser.h somewhat

Mainly deleting a surfeit of blank lines, and reflowing header comment.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/kaiser.h | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 0703f48777f3..7394ba9f9951 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -1,15 +1,17 @@
 #ifndef _ASM_X86_KAISER_H
 #define _ASM_X86_KAISER_H
-
-/* This file includes the definitions for the KAISER feature.
- * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory.
- * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped,
- * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
- * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
- * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory.
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+ * the kernel virtual memory.  It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
  *
- * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions
- * of the user space, or the stacks.
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
  */
 #ifdef __ASSEMBLY__
 #ifdef CONFIG_KAISER
@@ -48,13 +50,10 @@ _SWITCH_TO_KERNEL_CR3 %rax
 movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 .endm
 
-
 .macro SWITCH_USER_CR3_NO_STACK
-
 movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
 _SWITCH_TO_USER_CR3 %rax
 movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
-
 .endm
 
 #else /* CONFIG_KAISER */
@@ -72,7 +71,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 
 #else /* __ASSEMBLY__ */
 
-
 #ifdef CONFIG_KAISER
 /*
  * Upon kernel/user mode switch, it may happen that the address
@@ -80,7 +78,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
  * stored.  To change the address space, another register is
  * needed.  A register therefore has to be stored/restored.
 */
-
 DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
 /**
@@ -95,7 +92,6 @@ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
  */
 extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 
-
 /**
  *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
  *  @addr: the start address of the range
@@ -104,12 +100,12 @@ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned l
 extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
 
 /**
- *  kaiser_initialize_mapping - Initalize the shadow mapping
+ *  kaiser_init - Initialize the shadow mapping
  *
  *  Most parts of the shadow mapping can be mapped upon boot
  *  time.  Only per-process things like the thread stacks
  *  or a new LDT have to be mapped at runtime.  These boot-
- *  time mappings are permanent and nevertunmapped.
+ *  time mappings are permanent and never unmapped.
  */
 extern void kaiser_init(void);
 
@@ -117,6 +113,4 @@ extern void kaiser_init(void);
 
 #endif /* __ASSEMBLY */
 
-
-
 #endif /* _ASM_X86_KAISER_H */

From 0c68228f7b39c96cabd89bee3e1d6bd55926df80 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 19:23:08 -0700
Subject: [PATCH 14/44] kaiser: tidied up kaiser_add/remove_mapping slightly

Yes, unmap_pud_range_nofree()'s declaration ought to be in a
header file really, but I'm not sure we want to use it anyway:
so for now just declare it inside kaiser_remove_mapping().
And there doesn't seem to be such a thing as unmap_p4d_range(),
even in a 5-level paging tree.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 57e3a7dd4708..34e1d48e1339 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -286,8 +286,7 @@ void __init kaiser_init(void)
 				  __PAGE_KERNEL);
 }
 
-extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
-// add a mapping to the shadow-mapping, and synchronize the mappings
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
 int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 {
 	return kaiser_add_user_map((const void *)addr, size, flags);
@@ -295,15 +294,13 @@ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long fla
 
 void kaiser_remove_mapping(unsigned long start, unsigned long size)
 {
+	extern void unmap_pud_range_nofree(pgd_t *pgd,
+				unsigned long start, unsigned long end);
 	unsigned long end = start + size;
 	unsigned long addr;
 
 	for (addr = start; addr < end; addr += PGDIR_SIZE) {
 		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
-		/*
-		 * unmap_p4d_range() handles > P4D_SIZE unmaps,
-		 * so no need to trim 'end'.
-		 */
 		unmap_pud_range_nofree(pgd, addr, end);
 	}
 }

From f127705d26b34c053e59b47aef84b3ea564dd743 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 2 Oct 2017 10:57:24 -0700
Subject: [PATCH 15/44] kaiser: kaiser_remove_mapping() move along the pgd

When removing the bogus comment from kaiser_remove_mapping(),
I really ought to have checked the extent of its bogosity: as
Neel points out, there is nothing to stop unmap_pud_range_nofree()
from continuing beyond the end of a pud (and starting in the wrong
position on the next).

Fix kaiser_remove_mapping() to constrain the extent and advance pgd
pointer correctly: use pgd_addr_end() macro as used throughout base
mm (but don't assume page-rounded start and size in this case).

But this bug was very unlikely to trigger in this backport: since
any buddy allocation is contained within a single pud extent, and
we are not using vmapped stacks (and are only mapping one page of
stack anyway): the only way to hit this bug here would be when
freeing a large modified ldt.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 34e1d48e1339..1d50c44ff6a7 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -297,11 +297,13 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size)
 	extern void unmap_pud_range_nofree(pgd_t *pgd,
 				unsigned long start, unsigned long end);
 	unsigned long end = start + size;
-	unsigned long addr;
+	unsigned long addr, next;
+	pgd_t *pgd;
 
-	for (addr = start; addr < end; addr += PGDIR_SIZE) {
-		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
-		unmap_pud_range_nofree(pgd, addr, end);
+	pgd = native_get_shadow_pgd(pgd_offset_k(start));
+	for (addr = start; addr < end; pgd++, addr = next) {
+		next = pgd_addr_end(addr, end);
+		unmap_pud_range_nofree(pgd, addr, next);
 	}
 }
 

From c52e55a2a82d3a44189810d35717d81cb4cf61d4 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 21 Aug 2017 20:11:43 -0700
Subject: [PATCH 16/44] kaiser: cleanups while trying for gold link

While trying to get our gold link to work, four cleanups:
matched the gdt_page declaration to its definition;
in fiddling unsuccessfully with PERCPU_INPUT(), lined up backslashes;
lined up the backslashes according to convention in percpu-defs.h;
deleted the unused irq_stack_pointer addition to irq_stack_union.

Sad to report that aligning backslashes does not appear to help gold
align to 8192: but while these did not help, they are worth keeping.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/desc.h       |  2 +-
 arch/x86/include/asm/processor.h  |  5 -----
 include/asm-generic/vmlinux.lds.h | 18 ++++++++----------
 include/linux/percpu-defs.h       | 22 +++++++++++-----------
 4 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73cf018..880db91d9457 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -43,7 +43,7 @@ struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6a2e0a0b4a96..f3bdaed0188f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -332,11 +332,6 @@ union irq_stack_union {
 		char gs_base[40];
 		unsigned long stack_canary;
 	};
-
-	struct {
-		char irq_stack_pointer[64];
-		char unused[IRQ_STACK_SIZE - 64];
-	};
 };
 
 DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index cc1c6628e0ae..a461b6604fd9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -725,16 +725,14 @@
  */
 #define PERCPU_INPUT(cacheline)						\
 	VMLINUX_SYMBOL(__per_cpu_start) = .;				\
-	\
-	VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;        \
-	*(.data..percpu..first)           \
-	. = ALIGN(cacheline);           \
-	*(.data..percpu..user_mapped)            \
-	*(.data..percpu..user_mapped..shared_aligned)        \
-	. = ALIGN(PAGE_SIZE);           \
-	*(.data..percpu..user_mapped..page_aligned)          \
-	VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;        \
-	\
+	VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;		\
+	*(.data..percpu..first)						\
+	. = ALIGN(cacheline);						\
+	*(.data..percpu..user_mapped)					\
+	*(.data..percpu..user_mapped..shared_aligned)			\
+	. = ALIGN(PAGE_SIZE);						\
+	*(.data..percpu..user_mapped..page_aligned)			\
+	VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;			\
 	. = ALIGN(PAGE_SIZE);						\
 	*(.data..percpu..page_aligned)					\
 	. = ALIGN(cacheline);						\
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8ea945f63a05..cfe13cb4ec63 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -121,10 +121,10 @@
 #define DEFINE_PER_CPU(type, name)					\
 	DEFINE_PER_CPU_SECTION(type, name, "")
 
-#define DECLARE_PER_CPU_USER_MAPPED(type, name)         \
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)				\
 	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
 
-#define DEFINE_PER_CPU_USER_MAPPED(type, name)          \
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)				\
 	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
 
 /*
@@ -156,11 +156,11 @@
 	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
-#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)			\
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)		\
 	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
-#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)			\
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)		\
 	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
@@ -185,18 +185,18 @@
 /*
  * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
  */
-#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)      \
-  DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")   \
-  __aligned(PAGE_SIZE)
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)		\
+	DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+	__aligned(PAGE_SIZE)
 
-#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)       \
-  DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")    \
-  __aligned(PAGE_SIZE)
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)		\
+	DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+	__aligned(PAGE_SIZE)
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
  */
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name)			\
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)				\
 	DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
 
 #define DEFINE_PER_CPU_READ_MOSTLY(type, name)				\

From aeda21d77e22fb382c51fd3f6bbb18df69bc032f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 9 Sep 2017 17:31:18 -0700
Subject: [PATCH 17/44] kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET

There's a 0x1000 in various places, which looks better with a name.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S     | 4 ++--
 arch/x86/include/asm/kaiser.h | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 0942aa6d16a8..82661cd1cd13 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1292,7 +1292,7 @@ ENTRY(nmi)
 	movq	%cr3, %rax
 	pushq	%rax
 #ifdef CONFIG_KAISER_REAL_SWITCH
-	andq	$(~0x1000), %rax
+	andq	$(~KAISER_SHADOW_PGD_OFFSET), %rax
 #endif
 	movq	%rax, %cr3
 #endif
@@ -1535,7 +1535,7 @@ end_repeat_nmi:
 	movq	%cr3, %rax
 	pushq	%rax
 #ifdef CONFIG_KAISER_REAL_SWITCH
-	andq	$(~0x1000), %rax
+	andq	$(~KAISER_SHADOW_PGD_OFFSET), %rax
 #endif
 	movq	%rax, %cr3
 #endif
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 7394ba9f9951..051acf678cda 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -13,13 +13,16 @@
  * A minimalistic kernel mapping holds the parts needed to be mapped in user
  * mode, such as the entry/exit functions of the user space, or the stacks.
  */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
 #ifdef __ASSEMBLY__
 #ifdef CONFIG_KAISER
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
 #ifdef CONFIG_KAISER_REAL_SWITCH
-andq $(~0x1000), \reg
+andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
 #endif
 movq \reg, %cr3
 .endm
@@ -27,7 +30,7 @@ movq \reg, %cr3
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
 #ifdef CONFIG_KAISER_REAL_SWITCH
-orq $(0x1000), \reg
+orq $(KAISER_SHADOW_PGD_OFFSET), \reg
 #endif
 movq \reg, %cr3
 .endm

From b9d2ccc54e17b5aa50dd0c036d3f4fb4e5248d54 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 18:30:43 -0700
Subject: [PATCH 18/44] kaiser: delete KAISER_REAL_SWITCH option

We fail to see what CONFIG_KAISER_REAL_SWITCH is for: it seems to be
left over from early development, and now just obscures tricky parts
of the code.  Delete it before adding PCIDs, or nokaiser boot option.

(Or if there is some good reason to keep the option, then it needs
a help text - and a "depends on KAISER", so that all those without
KAISER are not asked the question.)

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S     | 4 ----
 arch/x86/include/asm/kaiser.h | 4 ----
 security/Kconfig              | 4 ----
 3 files changed, 12 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 82661cd1cd13..a058e0fd99e3 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1291,9 +1291,7 @@ ENTRY(nmi)
 	/* %rax is saved above, so OK to clobber here */
 	movq	%cr3, %rax
 	pushq	%rax
-#ifdef CONFIG_KAISER_REAL_SWITCH
 	andq	$(~KAISER_SHADOW_PGD_OFFSET), %rax
-#endif
 	movq	%rax, %cr3
 #endif
 	call	do_nmi
@@ -1534,9 +1532,7 @@ end_repeat_nmi:
 	/* %rax is saved above, so OK to clobber here */
 	movq	%cr3, %rax
 	pushq	%rax
-#ifdef CONFIG_KAISER_REAL_SWITCH
 	andq	$(~KAISER_SHADOW_PGD_OFFSET), %rax
-#endif
 	movq	%rax, %cr3
 #endif
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 051acf678cda..e0fc45e77aee 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -21,17 +21,13 @@
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
-#ifdef CONFIG_KAISER_REAL_SWITCH
 andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
-#endif
 movq \reg, %cr3
 .endm
 
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
-#ifdef CONFIG_KAISER_REAL_SWITCH
 orq $(KAISER_SHADOW_PGD_OFFSET), \reg
-#endif
 movq \reg, %cr3
 .endm
 
diff --git a/security/Kconfig b/security/Kconfig
index 0d54e3cc4586..8d5d2407be7e 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -41,10 +41,6 @@ config KAISER
 
 	  If you are unsure how to answer this question, answer Y.
 
-config KAISER_REAL_SWITCH
-	bool "KAISER: actually switch page tables"
-	default y
-
 config SECURITYFS
 	bool "Enable the securityfs filesystem"
 	help

From 3e3d38fd9832e82a8cb1a5b1154acfa43ac08d15 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 9 Sep 2017 21:27:32 -0700
Subject: [PATCH 19/44] kaiser: vmstat show NR_KAISERTABLE as nr_overhead

The kaiser update made an interesting choice, never to free any shadow
page tables.  Contention on global spinlock was worrying, particularly
with it held across page table scans when freeing.  Something had to be
done: I was going to add refcounting; but simply never to free them is
an appealing choice, minimizing contention without complicating the code
(the more a page table is found already, the less the spinlock is used).

But leaking pages in this way is also a worry: can we get away with it?
At the very least, we need a count to show how bad it actually gets:
in principle, one might end up wasting about 1/256 of memory that way
(1/512 for when direct-mapped pages have to be user-mapped, plus 1/512
for when they are user-mapped from the vmalloc area on another occasion
(but we don't have vmalloc'ed stacks, so only large ldts are vmalloc'ed).

Add per-cpu stat NR_KAISERTABLE: including 256 at startup for the
shared pgd entries, and 1 for each intermediate page table added
thereafter for user-mapping - but leave out the 1 per mm, for its
shadow pgd, because that distracts from the monotonic increase.
Shown in /proc/vmstat as nr_overhead (0 if kaiser not enabled).

In practice, it doesn't look so bad so far: more like 1/12000 after
nine hours of gtests below; and movable pageblock segregation should
tend to cluster the kaiser tables into a subset of the address space
(if not, they will be bad for compaction too).  But production may
tell a different story: keep an eye on this number, and bring back
lighter freeing if it gets out of control (maybe a shrinker).

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c   | 16 +++++++++++-----
 include/linux/mmzone.h |  3 ++-
 mm/vmstat.c            |  1 +
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 1d50c44ff6a7..bf48bf0df8c5 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -122,9 +122,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 		if (!new_pmd_page)
 			return NULL;
 		spin_lock(&shadow_table_allocation_lock);
-		if (pud_none(*pud))
+		if (pud_none(*pud)) {
 			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
-		else
+			__inc_zone_page_state(virt_to_page((void *)
+						new_pmd_page), NR_KAISERTABLE);
+		} else
 			free_page(new_pmd_page);
 		spin_unlock(&shadow_table_allocation_lock);
 	}
@@ -140,9 +142,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 		if (!new_pte_page)
 			return NULL;
 		spin_lock(&shadow_table_allocation_lock);
-		if (pmd_none(*pmd))
+		if (pmd_none(*pmd)) {
 			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
-		else
+			__inc_zone_page_state(virt_to_page((void *)
+						new_pte_page), NR_KAISERTABLE);
+		} else
 			free_page(new_pte_page);
 		spin_unlock(&shadow_table_allocation_lock);
 	}
@@ -206,11 +210,13 @@ static void __init kaiser_init_all_pgds(void)
 	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
 		pgd_t new_pgd;
-		pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+		pud_t *pud = pud_alloc_one(&init_mm,
+					   PAGE_OFFSET + i * PGDIR_SIZE);
 		if (!pud) {
 			WARN_ON(1);
 			break;
 		}
+		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
 		new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
 		/*
 		 * Make sure not to stomp on some other pgd entry.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ff88d6189411..b93b578cfa42 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -131,8 +131,9 @@ enum zone_stat_item {
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
-	NR_KERNEL_STACK,
 	/* Second 128 byte cacheline */
+	NR_KERNEL_STACK,
+	NR_KAISERTABLE,
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c344e3609c53..324b7e90b4c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -736,6 +736,7 @@ const char * const vmstat_text[] = {
 	"nr_slab_unreclaimable",
 	"nr_page_table_pages",
 	"nr_kernel_stack",
+	"nr_overhead",
 	"nr_unstable",
 	"nr_bounce",
 	"nr_vmscan_write",

From eb82151d0b1df53d1ad8d060ecd554ca12eb552a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 30 Aug 2017 16:23:00 -0700
Subject: [PATCH 20/44] kaiser: enhanced by kernel and user PCIDs

Merged performance improvements to Kaiser, using distinct kernel
and user Process Context Identifiers to minimize the TLB flushing.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S                   | 10 +++-
 arch/x86/entry/entry_64_compat.S            |  1 +
 arch/x86/include/asm/cpufeature.h           |  1 +
 arch/x86/include/asm/kaiser.h               | 15 +++++-
 arch/x86/include/asm/pgtable_types.h        | 26 +++++++++++
 arch/x86/include/asm/tlbflush.h             | 52 +++++++++++++++++----
 arch/x86/include/uapi/asm/processor-flags.h |  3 +-
 arch/x86/kernel/cpu/common.c                | 34 ++++++++++++++
 arch/x86/kvm/x86.c                          |  3 +-
 arch/x86/mm/kaiser.c                        |  7 +++
 arch/x86/mm/tlb.c                           | 46 ++++++++++++++++--
 11 files changed, 181 insertions(+), 17 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a058e0fd99e3..6a18d787aed4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1291,7 +1291,10 @@ ENTRY(nmi)
 	/* %rax is saved above, so OK to clobber here */
 	movq	%cr3, %rax
 	pushq	%rax
-	andq	$(~KAISER_SHADOW_PGD_OFFSET), %rax
+	/* mask off "user" bit of pgd address and 12 PCID bits: */
+	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+	/* Add back kernel PCID and "no flush" bit */
+	orq	X86_CR3_PCID_KERN_VAR, %rax
 	movq	%rax, %cr3
 #endif
 	call	do_nmi
@@ -1532,7 +1535,10 @@ end_repeat_nmi:
 	/* %rax is saved above, so OK to clobber here */
 	movq	%cr3, %rax
 	pushq	%rax
-	andq	$(~KAISER_SHADOW_PGD_OFFSET), %rax
+	/* mask off "user" bit of pgd address and 12 PCID bits: */
+	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+	/* Add back kernel PCID and "no flush" bit */
+	orq	X86_CR3_PCID_KERN_VAR, %rax
 	movq	%rax, %cr3
 #endif
 
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index fe1911930b52..d03bf0e28b8b 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
 #include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f7ba9fbf12ee..72a4343b774f 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -187,6 +187,7 @@
 #define X86_FEATURE_ARAT	( 7*32+ 1) /* Always Running APIC Timer */
 #define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
 #define X86_FEATURE_PLN		( 7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS		( 7*32+ 6) /* Intel Package Thermal Status */
 #define X86_FEATURE_DTHERM	( 7*32+ 7) /* Digital Thermal Sensor */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index e0fc45e77aee..360ff3bc44a9 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -1,5 +1,8 @@
 #ifndef _ASM_X86_KAISER_H
 #define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
 /*
  * This file includes the definitions for the KAISER feature.
  * KAISER is a counter measure against x86_64 side channel attacks on
@@ -21,13 +24,21 @@
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
-andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq  X86_CR3_PCID_KERN_VAR, \reg
 movq \reg, %cr3
 .endm
 
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
-orq $(KAISER_SHADOW_PGD_OFFSET), \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/*
+ * This can obviously be one instruction by putting the
+ * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
+ * But, just leave it now for simplicity.
+ */
+orq  X86_CR3_PCID_USER_VAR, \reg
+orq  $(KAISER_SHADOW_PGD_OFFSET), \reg
 movq \reg, %cr3
 .endm
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a70d6100b3df..79319174e27b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -106,6 +106,32 @@
 			 _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x4,UL))
+#define X86_CR3_PCID_ASID_USER  (_AC(0x6,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH		(X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH		(X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH		(0)
+#define X86_CR3_PCID_USER_FLUSH		(0)
+#define X86_CR3_PCID_KERN_NOFLUSH	(0)
+#define X86_CR3_PCID_USER_NOFLUSH	(0)
+#endif
+
 /*
  * The cache modes defined here are used to translate between pure SW usage
  * and the HW defined cache mode bits and/or PAT entries.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 9fc5968da820..48ef37079bc2 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -12,7 +12,6 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr,
 			     unsigned long type)
 {
 	struct { u64 d[2]; } desc = { { pcid, addr } };
-
 	/*
 	 * The memory clobber is because the whole point is to invalidate
 	 * stale TLB entries and, especially if we're flushing global
@@ -133,14 +132,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 static inline void __native_flush_tlb(void)
 {
+	if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+	 	/*
+		 * If current->mm == NULL then we borrow a mm which may change during a
+		 * task switch and therefore we must not be preempted while we write CR3
+		 * back:
+		 */
+		preempt_disable();
+		native_write_cr3(native_read_cr3());
+		preempt_enable();
+		return;
+	}
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * We are no longer using globals with KAISER, so a
+	 * "nonglobals" flush would work too. But, this is more
+	 * conservative.
+	 *
+	 * Note, this works with CR4.PCIDE=0 or 1.
 	 */
-	preempt_disable();
-	native_write_cr3(native_read_cr3());
-	preempt_enable();
+	invpcid_flush_all();
 }
 
 static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -162,6 +172,8 @@ static inline void __native_flush_tlb_global(void)
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+	 	 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
 		invpcid_flush_all();
 		return;
@@ -181,7 +193,31 @@ static inline void __native_flush_tlb_global(void)
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+	/*
+	 * SIMICS #GP's if you run INVPCID with type 2/3
+	 * and X86_CR4_PCIDE clear.  Shame!
+	 *
+	 * The ASIDs used below are hard-coded.  But, we must not
+	 * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
+	 * invpcid in the case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+		return;
+	}
+	/* Flush the address out of both PCIDs. */
+	/*
+	 * An optimization here might be to determine addresses
+	 * that are only kernel-mapped and only flush the kernel
+	 * ASID.  But, userspace flushes are probably much more
+	 * important performance-wise.
+	 *
+	 * Make sure to do only a single invpcid when KAISER is
+	 * disabled and we have only a single ASID.
+	 */
+	if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+		invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+	invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
 }
 
 static inline void __flush_tlb_all(void)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 79887abcb5e1..1361779f44fe 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e5ba9701ee6c..07b7f2816567 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -321,11 +321,45 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 	}
 }
 
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these
+ * instead.
+ *
+ * This is also handy because systems that do not support
+ * PCIDs just end up or'ing a 0 into their CR3, which does
+ * no harm.
+ */
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
+
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_PCID)) {
 		if (cpu_has(c, X86_FEATURE_PGE)) {
 			cr4_set_bits(X86_CR4_PCIDE);
+			/*
+			 * These variables are used by the entry/exit
+			 * code to change PCIDs.
+			 */
+#ifdef CONFIG_KAISER
+			X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
+			X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
+#endif
+			/*
+			 * INVPCID has two "groups" of types:
+			 * 1/2: Invalidate an individual address
+			 * 3/4: Invalidate all contexts
+			 *
+			 * 1/2 take a PCID, but 3/4 do not.  So, 3/4
+			 * ignore the PCID argument in the descriptor.
+			 * But, we have to be careful not to call 1/2
+			 * with an actual non-zero PCID in them before
+			 * we do the above cr4_set_bits().
+			 */
+			if (cpu_has(c, X86_FEATURE_INVPCID))
+				set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
 		} else {
 			/*
 			 * flush_tlb_all(), as currently implemented, won't
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 796f1ec67469..ccf17dbfea09 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 			return 1;
 
 		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
-		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+		    !is_long_mode(vcpu))
 			return 1;
 	}
 
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index bf48bf0df8c5..290a52e6017d 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -240,6 +240,8 @@ static void __init kaiser_init_all_pgds(void)
 } while (0)
 
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+extern unsigned long X86_CR3_PCID_KERN_VAR;
+extern unsigned long X86_CR3_PCID_USER_VAR;
 /*
  * If anything in here fails, we will likely die on one of the
  * first kernel->user transitions and init will die.  But, we
@@ -290,6 +292,11 @@ void __init kaiser_init(void)
 	kaiser_add_user_map_early(&debug_idt_table,
 				  sizeof(gate_desc) * NR_VECTORS,
 				  __PAGE_KERNEL);
+
+	kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
+				  __PAGE_KERNEL);
 }
 
 /* Add a mapping to the shadow mapping, and synchronize the mappings */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7a4cdb632508..aed0b704de3d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -34,6 +34,46 @@ struct flush_tlb_info {
 	unsigned long flush_end;
 };
 
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+	unsigned long new_mm_cr3 = __pa(pgdir);
+
+	/*
+	 * KAISER, plus PCIDs needs some extra work here.  But,
+	 * if either of features is not present, we need no
+	 * PCIDs here and just do a normal, full TLB flush with
+	 * the write_cr3()
+	 */
+	if (!IS_ENABLED(CONFIG_KAISER) ||
+	    !cpu_feature_enabled(X86_FEATURE_PCID))
+		goto out_set_cr3;
+	/*
+	 * We reuse the same PCID for different tasks, so we must
+	 * flush all the entires for the PCID out when we change
+	 * tasks.
+	 */
+	new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
+
+	/*
+	 * The flush from load_cr3() may leave old TLB entries
+	 * for userspace in place.  We must flush that context
+	 * separately.  We can theoretically delay doing this
+	 * until we actually load up the userspace CR3, but
+	 * that's a bit tricky.  We have to have the "need to
+	 * flush userspace PCID" bit per-cpu and check it in the
+	 * exit-to-userspace paths.
+	 */
+	invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
+
+out_set_cr3:
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
@@ -45,7 +85,7 @@ void leave_mm(int cpu)
 		BUG();
 	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
 		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
-		load_cr3(swapper_pg_dir);
+		load_new_mm_cr3(swapper_pg_dir);
 		/*
 		 * This gets called in the idle path where RCU
 		 * functions differently.  Tracing normally
@@ -105,7 +145,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * ordering guarantee we need.
 		 *
 		 */
-		load_cr3(next->pgd);
+		load_new_mm_cr3(next->pgd);
 
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
@@ -152,7 +192,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 * As above, load_cr3() is serializing and orders TLB
 			 * fills with respect to the mm_cpumask write.
 			 */
-			load_cr3(next->pgd);
+			load_new_mm_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 			load_mm_cr4(next);
 			load_mm_ldt(next);

From 0731188fc74cc2237975a2b5bedd36e2463ef10b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 17 Aug 2017 15:00:37 -0700
Subject: [PATCH 21/44] kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush
 user

We have many machines (Westmere, Sandybridge, Ivybridge) supporting
PCID but not INVPCID: on these load_new_mm_cr3() simply crashed.

Flushing user context inside load_new_mm_cr3() without the use of
invpcid is difficult: momentarily switch from kernel to user context
and back to do so?  I'm not sure whether that can be safely done at
all, and would risk polluting user context with kernel internals,
and kernel context with stale user externals.

Instead, follow the hint in the comment that was there: change
X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3()
can leave a note in it, for SWITCH_USER_CR3 on return to userspace to
flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH.

Which works well enough that there's no need to do it this way only
when invpcid is unsupported: it's a good alternative to invpcid here.
But there's a couple of inlines in asm/tlbflush.h that need to do the
same trick, so it's best to localize all this per-cpu business in
mm/kaiser.c: moving that part of the initialization from setup_pcid()
to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the
function for noting an X86_CR3_PCID_USER_FLUSH.  And let's keep a
KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit.

I did try to make the feature tests in asm/tlbflush.h more consistent
with each other: there seem to be far too many ways of performing such
tests, and I don't have a good grasp of their differences.  At first
I converted them all to be static_cpu_has(): but that proved to be a
mistake, as the comment in __native_flush_tlb_single() hints; so then
I reversed and made them all this_cpu_has().  Probably all gratuitous
change, but that's the way it's working at present.

I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR
gets re-initialized by each cpu (before and after these changes):
no problem when (as usual) all cpus on a machine have the same
features, but in principle incorrect.  However, my experiment
to per-cpu-ify that one did not end well...

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/kaiser.h   | 18 ++++++----
 arch/x86/include/asm/tlbflush.h | 58 +++++++++++++++++++++++----------
 arch/x86/kernel/cpu/common.c    | 22 +------------
 arch/x86/mm/kaiser.c            | 52 +++++++++++++++++++++++++----
 arch/x86/mm/tlb.c               | 46 ++++++++++----------------
 5 files changed, 115 insertions(+), 81 deletions(-)

diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 360ff3bc44a9..009bca514c20 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -32,13 +32,12 @@ movq \reg, %cr3
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
 andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
-/*
- * This can obviously be one instruction by putting the
- * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
- * But, just leave it now for simplicity.
- */
-orq  X86_CR3_PCID_USER_VAR, \reg
-orq  $(KAISER_SHADOW_PGD_OFFSET), \reg
+orq  PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
+js   9f
+// FLUSH this time, reset to NOFLUSH for next time
+// But if nopcid?  Consider using 0x80 for user pcid?
+movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
+9:
 movq \reg, %cr3
 .endm
 
@@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 */
 DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
+extern unsigned long X86_CR3_PCID_KERN_VAR;
+DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
 /**
  *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
  *  @addr: the start address of the range
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 48ef37079bc2..ff8c5eb95caf 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -12,6 +12,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr,
 			     unsigned long type)
 {
 	struct { u64 d[2]; } desc = { { pcid, addr } };
+
 	/*
 	 * The memory clobber is because the whole point is to invalidate
 	 * stale TLB entries and, especially if we're flushing global
@@ -130,27 +131,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 	cr4_set_bits(mask);
 }
 
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
 static inline void __native_flush_tlb(void)
 {
-	if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
-	 	/*
-		 * If current->mm == NULL then we borrow a mm which may change during a
-		 * task switch and therefore we must not be preempted while we write CR3
-		 * back:
+	if (this_cpu_has(X86_FEATURE_INVPCID)) {
+		/*
+		 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
-		preempt_disable();
-		native_write_cr3(native_read_cr3());
-		preempt_enable();
+		invpcid_flush_all_nonglobals();
 		return;
 	}
+
 	/*
-	 * We are no longer using globals with KAISER, so a
-	 * "nonglobals" flush would work too. But, this is more
-	 * conservative.
-	 *
-	 * Note, this works with CR4.PCIDE=0 or 1.
+	 * If current->mm == NULL then we borrow a mm which may change during a
+	 * task switch and therefore we must not be preempted while we write CR3
+	 * back:
 	 */
-	invpcid_flush_all();
+	preempt_disable();
+	if (this_cpu_has(X86_FEATURE_PCID))
+		kaiser_flush_tlb_on_return_to_user();
+	native_write_cr3(native_read_cr3());
+	preempt_enable();
 }
 
 static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -166,9 +182,13 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
 
 static inline void __native_flush_tlb_global(void)
 {
+#ifdef CONFIG_KAISER
+	/* Globals are not used at all */
+	__native_flush_tlb();
+#else
 	unsigned long flags;
 
-	if (static_cpu_has(X86_FEATURE_INVPCID)) {
+	if (this_cpu_has(X86_FEATURE_INVPCID)) {
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
@@ -185,10 +205,9 @@ static inline void __native_flush_tlb_global(void)
 	 * be called from deep inside debugging code.)
 	 */
 	raw_local_irq_save(flags);
-
 	__native_flush_tlb_global_irq_disabled();
-
 	raw_local_irq_restore(flags);
+#endif
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
@@ -199,9 +218,12 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	 *
 	 * The ASIDs used below are hard-coded.  But, we must not
 	 * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
-	 * invpcid in the case we are called early.
+	 * invlpg in the case we are called early.
 	 */
+
 	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+		if (this_cpu_has(X86_FEATURE_PCID))
+			kaiser_flush_tlb_on_return_to_user();
 		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 		return;
 	}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 07b7f2816567..46ad2faca9ab 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -321,32 +321,11 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 	}
 }
 
-/*
- * These can have bit 63 set, so we can not just use a plain "or"
- * instruction to get their value or'd into CR3.  It would take
- * another register.  So, we use a memory reference to these
- * instead.
- *
- * This is also handy because systems that do not support
- * PCIDs just end up or'ing a 0 into their CR3, which does
- * no harm.
- */
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
-
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_PCID)) {
 		if (cpu_has(c, X86_FEATURE_PGE)) {
 			cr4_set_bits(X86_CR4_PCIDE);
-			/*
-			 * These variables are used by the entry/exit
-			 * code to change PCIDs.
-			 */
-#ifdef CONFIG_KAISER
-			X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
-			X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
-#endif
 			/*
 			 * INVPCID has two "groups" of types:
 			 * 1/2: Invalidate an individual address
@@ -372,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
 			clear_cpu_cap(c, X86_FEATURE_PCID);
 		}
 	}
+	kaiser_setup_pcid();
 }
 
 /*
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 290a52e6017d..bf3ec221a90b 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -12,12 +12,26 @@
 #include <linux/ftrace.h>
 
 #include <asm/kaiser.h>
+#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
-#ifdef CONFIG_KAISER
 
-__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+#ifdef CONFIG_KAISER
+__visible
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR;
+DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
+
 /*
  * At runtime, the only things we map are some things for CPU
  * hotplug, and stacks for new processes.  No two CPUs will ever
@@ -239,9 +253,6 @@ static void __init kaiser_init_all_pgds(void)
 	WARN_ON(__ret);							\
 } while (0)
 
-extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-extern unsigned long X86_CR3_PCID_KERN_VAR;
-extern unsigned long X86_CR3_PCID_USER_VAR;
 /*
  * If anything in here fails, we will likely die on one of the
  * first kernel->user transitions and init will die.  But, we
@@ -295,8 +306,6 @@ void __init kaiser_init(void)
 
 	kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
 				  __PAGE_KERNEL);
-	kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
-				  __PAGE_KERNEL);
 }
 
 /* Add a mapping to the shadow mapping, and synchronize the mappings */
@@ -361,4 +370,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 	}
 	return pgd;
 }
+
+void kaiser_setup_pcid(void)
+{
+	unsigned long kern_cr3 = 0;
+	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+	if (this_cpu_has(X86_FEATURE_PCID)) {
+		kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+	}
+	/*
+	 * These variables are used by the entry/exit
+	 * code to change PCID and pgd and TLB flushing.
+	 */
+	X86_CR3_PCID_KERN_VAR = kern_cr3;
+	this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if cpu does not, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+	this_cpu_write(X86_CR3_PCID_USER_VAR,
+			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
 #endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index aed0b704de3d..99a5d1b95527 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,13 +6,14 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/cpu.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
-#include <linux/debugfs.h>
+#include <asm/kaiser.h>
 
 /*
  *	TLB flushing, formerly SMP-only
@@ -38,34 +39,23 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 {
 	unsigned long new_mm_cr3 = __pa(pgdir);
 
-	/*
-	 * KAISER, plus PCIDs needs some extra work here.  But,
-	 * if either of features is not present, we need no
-	 * PCIDs here and just do a normal, full TLB flush with
-	 * the write_cr3()
-	 */
-	if (!IS_ENABLED(CONFIG_KAISER) ||
-	    !cpu_feature_enabled(X86_FEATURE_PCID))
-		goto out_set_cr3;
-	/*
-	 * We reuse the same PCID for different tasks, so we must
-	 * flush all the entires for the PCID out when we change
-	 * tasks.
-	 */
-	new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
+#ifdef CONFIG_KAISER
+	if (this_cpu_has(X86_FEATURE_PCID)) {
+		/*
+		 * We reuse the same PCID for different tasks, so we must
+		 * flush all the entries for the PCID out when we change tasks.
+		 * Flush KERN below, flush USER when returning to userspace in
+		 * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+		 *
+		 * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+		 * do it here, but can only be used if X86_FEATURE_INVPCID is
+		 * available - and many machines support pcid without invpcid.
+		 */
+		new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+		kaiser_flush_tlb_on_return_to_user();
+	}
+#endif /* CONFIG_KAISER */
 
-	/*
-	 * The flush from load_cr3() may leave old TLB entries
-	 * for userspace in place.  We must flush that context
-	 * separately.  We can theoretically delay doing this
-	 * until we actually load up the userspace CR3, but
-	 * that's a bit tricky.  We have to have the "need to
-	 * flush userspace PCID" bit per-cpu and check it in the
-	 * exit-to-userspace paths.
-	 */
-	invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
-
-out_set_cr3:
 	/*
 	 * Caution: many callers of this function expect
 	 * that load_cr3() is serializing and orders TLB

From 3b4ce0e1a17228eec71815d7997e49e403ebf2a7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 8 Sep 2017 19:26:30 -0700
Subject: [PATCH 22/44] kaiser: PCID 0 for kernel and 128 for user

Why was 4 chosen for kernel PCID and 6 for user PCID?
No good reason in a backport where PCIDs are only used for Kaiser.

If we continue with those, then we shall need to add Andy Lutomirski's
4.13 commit 6c690ee1039b ("x86/mm: Split read_cr3() into read_cr3_pa()
and __read_cr3()"), which deals with the problem of read_cr3() callers
finding stray bits in the cr3 that they expected to be page-aligned;
and for hibernation, his 4.14 commit f34902c5c6c0 ("x86/hibernate/64:
Mask off CR3's PCID bits in the saved CR3").

But if 0 is used for kernel PCID, then there's no need to add in those
commits - whenever the kernel looks, it sees 0 in the lower bits; and
0 for kernel seems an obvious choice.

And I naughtily propose 128 for user PCID.  Because there's a place
in _SWITCH_TO_USER_CR3 where it takes note of the need for TLB FLUSH,
but needs to reset that to NOFLUSH for the next occasion.  Currently
it does so with a "movb $(0x80)" into the high byte of the per-cpu
quadword, but that will cause a machine without PCID support to crash.
Now, if %al just happened to have 0x80 in it at that point, on a
machine with PCID support, but 0 on a machine without PCID support...

(That will go badly wrong once the pgd can be at a physical address
above 2^56, but even with 5-level paging, physical goes up to 2^52.)

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/kaiser.h        | 19 ++++++++++++-------
 arch/x86/include/asm/pgtable_types.h |  7 ++++---
 arch/x86/mm/tlb.c                    |  3 +++
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 009bca514c20..110a73e0572d 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -29,14 +29,19 @@ orq  X86_CR3_PCID_KERN_VAR, \reg
 movq \reg, %cr3
 .endm
 
-.macro _SWITCH_TO_USER_CR3 reg
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
 movq %cr3, \reg
 andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
 orq  PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
 js   9f
-// FLUSH this time, reset to NOFLUSH for next time
-// But if nopcid?  Consider using 0x80 for user pcid?
-movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
+/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
 9:
 movq \reg, %cr3
 .endm
@@ -49,7 +54,7 @@ popq %rax
 
 .macro SWITCH_USER_CR3
 pushq %rax
-_SWITCH_TO_USER_CR3 %rax
+_SWITCH_TO_USER_CR3 %rax %al
 popq %rax
 .endm
 
@@ -61,7 +66,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 
 .macro SWITCH_USER_CR3_NO_STACK
 movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
-_SWITCH_TO_USER_CR3 %rax
+_SWITCH_TO_USER_CR3 %rax %al
 movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 .endm
 
@@ -69,7 +74,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 
 .macro SWITCH_KERNEL_CR3 reg
 .endm
-.macro SWITCH_USER_CR3 reg
+.macro SWITCH_USER_CR3 reg regb
 .endm
 .macro SWITCH_USER_CR3_NO_STACK
 .endm
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 79319174e27b..22704eca1900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -111,16 +111,17 @@
 
 /* Mask for all the PCID-related bits in CR3: */
 #define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+
 #if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
-#define X86_CR3_PCID_ASID_KERN  (_AC(0x4,UL))
-#define X86_CR3_PCID_ASID_USER  (_AC(0x6,UL))
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER	(_AC(0x80,UL))
 
 #define X86_CR3_PCID_KERN_FLUSH		(X86_CR3_PCID_ASID_KERN)
 #define X86_CR3_PCID_USER_FLUSH		(X86_CR3_PCID_ASID_USER)
 #define X86_CR3_PCID_KERN_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
 #define X86_CR3_PCID_USER_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
 #else
-#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
 #define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
 /*
  * PCIDs are unsupported on 32-bit and none of these bits can be
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 99a5d1b95527..90ef67a9e34b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -50,6 +50,9 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 		 * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
 		 * do it here, but can only be used if X86_FEATURE_INVPCID is
 		 * available - and many machines support pcid without invpcid.
+		 *
+		 * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+		 * but keep that line in there in case something changes.
 		 */
 		new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
 		kaiser_flush_tlb_on_return_to_user();

From 20268a10ffecd9fcc04880b21fc99a9192394599 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 27 Aug 2017 16:24:27 -0700
Subject: [PATCH 23/44] kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user

Mostly this commit is just unshouting X86_CR3_PCID_KERN_VAR and
X86_CR3_PCID_USER_VAR: we usually name variables in lower-case.

But why does x86_cr3_pcid_noflush need to be __aligned(PAGE_SIZE)?
Ah, it's a leftover from when kaiser_add_user_map() once complained
about mapping the same page twice.  Make it __read_mostly instead.
(I'm a little uneasy about all the unrelated data which shares its
page getting user-mapped too, but that was so before, and not a big
deal: though we call it user-mapped, it's not mapped with _PAGE_USER.)

And there is a little change around the two calls to do_nmi().
Previously they set the NOFLUSH bit (if PCID supported) when
forcing to kernel context before do_nmi(); now they also have the
NOFLUSH bit set (if PCID supported) when restoring context after:
nothing done in do_nmi() should require a TLB to be flushed here.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S     |  8 ++++----
 arch/x86/include/asm/kaiser.h | 11 +++++------
 arch/x86/mm/kaiser.c          | 13 +++++++------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6a18d787aed4..efd96f02ac9e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1290,11 +1290,11 @@ ENTRY(nmi)
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
 	movq	%cr3, %rax
+	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+	orq	x86_cr3_pcid_noflush, %rax
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
-	/* Add back kernel PCID and "no flush" bit */
-	orq	X86_CR3_PCID_KERN_VAR, %rax
 	movq	%rax, %cr3
 #endif
 	call	do_nmi
@@ -1534,11 +1534,11 @@ end_repeat_nmi:
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
 	movq	%cr3, %rax
+	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+	orq	x86_cr3_pcid_noflush, %rax
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
-	/* Add back kernel PCID and "no flush" bit */
-	orq	X86_CR3_PCID_KERN_VAR, %rax
 	movq	%rax, %cr3
 #endif
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 110a73e0572d..48d8d70dd8c7 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -25,7 +25,7 @@
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
 andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
-orq  X86_CR3_PCID_KERN_VAR, \reg
+orq  x86_cr3_pcid_noflush, \reg
 movq \reg, %cr3
 .endm
 
@@ -37,11 +37,10 @@ movq \reg, %cr3
  * not enabled): so that the one register can update both memory and cr3.
  */
 movq %cr3, \reg
-andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
-orq  PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
+orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
 js   9f
 /* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
-movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
 9:
 movq \reg, %cr3
 .endm
@@ -94,8 +93,8 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 */
 DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
-extern unsigned long X86_CR3_PCID_KERN_VAR;
-DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
+extern unsigned long x86_cr3_pcid_noflush;
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
 
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index bf3ec221a90b..8a5eb5993d3c 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -29,8 +29,8 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
  * This is also handy because systems that do not support PCIDs
  * just end up or'ing a 0 into their CR3, which does no harm.
  */
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR;
-DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
+unsigned long x86_cr3_pcid_noflush __read_mostly;
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 
 /*
  * At runtime, the only things we map are some things for CPU
@@ -304,7 +304,8 @@ void __init kaiser_init(void)
 				  sizeof(gate_desc) * NR_VECTORS,
 				  __PAGE_KERNEL);
 
-	kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
+	kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
+				  sizeof(x86_cr3_pcid_noflush),
 				  __PAGE_KERNEL);
 }
 
@@ -384,8 +385,8 @@ void kaiser_setup_pcid(void)
 	 * These variables are used by the entry/exit
 	 * code to change PCID and pgd and TLB flushing.
 	 */
-	X86_CR3_PCID_KERN_VAR = kern_cr3;
-	this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3);
+	x86_cr3_pcid_noflush = kern_cr3;
+	this_cpu_write(x86_cr3_pcid_user, user_cr3);
 }
 
 /*
@@ -395,7 +396,7 @@ void kaiser_setup_pcid(void)
  */
 void kaiser_flush_tlb_on_return_to_user(void)
 {
-	this_cpu_write(X86_CR3_PCID_USER_VAR,
+	this_cpu_write(x86_cr3_pcid_user,
 			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 }
 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);

From fc8334e6b3e5d28afd4eec8a74493933f73b2784 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 26 Sep 2017 18:43:07 -0700
Subject: [PATCH 24/44] kaiser: paranoid_entry pass cr3 need to paranoid_exit

Neel Natu points out that paranoid_entry() was wrong to assume that
an entry that did not need swapgs would not need SWITCH_KERNEL_CR3:
paranoid_entry (used for debug breakpoint, int3, double fault or MCE;
though I think it's only the MCE case that is cause for concern here)
can break in at an awkward time, between cr3 switch and swapgs, but
its handling always needs kernel gs and kernel cr3.

Easy to fix in itself, but paranoid_entry() also needs to convey to
paranoid_exit() (and my reading of macro idtentry says paranoid_entry
and paranoid_exit are always paired) how to restore the prior state.
The swapgs state is already conveyed by %ebx (0 or 1), so extend that
also to convey when SWITCH_USER_CR3 will be needed (2 or 3).

(Yes, I'd much prefer that 0 meant no swapgs, whereas it's the other
way round: and a convention shared with error_entry() and error_exit(),
which I don't want to touch.  Perhaps I should have inverted the bit
for switch cr3 too, but did not.)

paranoid_exit() would be straightforward, except for TRACE_IRQS: it
did TRACE_IRQS_IRETQ when doing swapgs, but TRACE_IRQS_IRETQ_DEBUG
when not: which is it supposed to use when SWITCH_USER_CR3 is split
apart from that?  As best as I can determine, commit 5963e317b1e9
("ftrace/x86: Do not change stacks in DEBUG when calling lockdep")
missed the swapgs case, and should have used TRACE_IRQS_IRETQ_DEBUG
there too (the discrepancy has nothing to do with the liberal use
of _NO_STACK and _UNSAFE_STACK hereabouts: TRACE_IRQS_OFF_DEBUG has
just been used in all cases); discrepancy lovingly preserved across
several paranoid_exit() cleanups, but I'm now removing it.

Neel further indicates that to use SWITCH_USER_CR3_NO_STACK there in
paranoid_exit() is now not only unnecessary but unsafe: might corrupt
syscall entry's unsafe_stack_register_backup of %rax.  Just use
SWITCH_USER_CR3: and delete SWITCH_USER_CR3_NO_STACK altogether,
before we make the mistake of using it again.

hughd adds: this commit fixes an issue in the Kaiser-without-PCIDs
part of the series, and ought to be moved earlier, if you decided
to make a release of Kaiser-without-PCIDs.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S     | 50 ++++++++++++++++++++++++++---------
 arch/x86/include/asm/kaiser.h |  8 ------
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index efd96f02ac9e..59d9e5d8c05b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1025,7 +1025,11 @@ idtentry machine_check					has_error_code=0	paranoid=1 do_sym=*machine_check_vec
 /*
  * Save all registers in pt_regs, and switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
  */
 ENTRY(paranoid_entry)
 	cld
@@ -1037,9 +1041,26 @@ ENTRY(paranoid_entry)
 	testl	%edx, %edx
 	js	1f				/* negative -> in kernel */
 	SWAPGS
-	SWITCH_KERNEL_CR3
 	xorl	%ebx, %ebx
-1:	ret
+1:
+#ifdef CONFIG_KAISER
+	/*
+	 * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+	 * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+	 * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+	 * unconditionally, but we need to find out whether the reverse
+	 * should be done on return (conveyed to paranoid_exit in %ebx).
+	 */
+	movq	%cr3, %rax
+	testl	$KAISER_SHADOW_PGD_OFFSET, %eax
+	jz	2f
+	orl	$2, %ebx
+	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+	orq	x86_cr3_pcid_noflush, %rax
+	movq	%rax, %cr3
+2:
+#endif
+	ret
 END(paranoid_entry)
 
 /*
@@ -1052,20 +1073,25 @@ END(paranoid_entry)
  * be complicated.  Fortunately, we there's no good reason
  * to try to handle preemption here.
  *
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
+ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
  */
 ENTRY(paranoid_exit)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
-	testl	%ebx, %ebx			/* swapgs needed? */
-	jnz	paranoid_exit_no_swapgs
-	TRACE_IRQS_IRETQ
-	SWITCH_USER_CR3_NO_STACK
-	SWAPGS_UNSAFE_STACK
-	jmp	paranoid_exit_restore
-paranoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
+#ifdef CONFIG_KAISER
+	testl	$2, %ebx			/* SWITCH_USER_CR3 needed? */
+	jz	paranoid_exit_no_switch
+	SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+	testl	$1, %ebx			/* swapgs needed? */
+	jnz	paranoid_exit_no_swapgs
+	SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
 	RESTORE_EXTRA_REGS
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 48d8d70dd8c7..3dc5f4c39b3e 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -63,20 +63,12 @@ _SWITCH_TO_KERNEL_CR3 %rax
 movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 .endm
 
-.macro SWITCH_USER_CR3_NO_STACK
-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
-_SWITCH_TO_USER_CR3 %rax %al
-movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
-.endm
-
 #else /* CONFIG_KAISER */
 
 .macro SWITCH_KERNEL_CR3 reg
 .endm
 .macro SWITCH_USER_CR3 reg regb
 .endm
-.macro SWITCH_USER_CR3_NO_STACK
-.endm
 .macro SWITCH_KERNEL_CR3_NO_STACK
 .endm
 

From d41f46f778951b0ea851ca52b88b2549c6336b47 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 13 Oct 2017 12:10:00 -0700
Subject: [PATCH 25/44] kaiser: _pgd_alloc() without __GFP_REPEAT to avoid
 stalls

Synthetic filesystem mempressure testing has shown softlockups, with
hour-long page allocation stalls, and pgd_alloc() trying for order:1
with __GFP_REPEAT in one of the backtraces each time.

That's _pgd_alloc() going for a Kaiser double-pgd, using the __GFP_REPEAT
common to all page table allocations, but actually having no effect on
order:0 (see should_alloc_oom() and should_continue_reclaim() in this
tree, but beware that ports to another tree might behave differently).

Order:1 stack allocation has been working satisfactorily without
__GFP_REPEAT forever, and page table allocation only asks __GFP_REPEAT
for awkward occasions in a long-running process: it's not appropriate
at fork or exec time, and seems to be doing much more harm than good:
getting those contiguous pages under very heavy mempressure can be
hard (though even without it, Kaiser does generate more mempressure).

Mask out that __GFP_REPEAT inside _pgd_alloc().  Why not take it out
of the PGALLOG_GFP altogether, as v4.7 commit a3a9a59d2067 ("x86: get
rid of superfluous __GFP_REPEAT") did?  Because I think that might
make a difference to our page table memcg charging, which I'd prefer
not to interfere with at this time.

hughd adds: __alloc_pages_slowpath() in the 4.4.89-stable tree handles
__GFP_REPEAT a little differently than in prod kernel or 3.18.72-stable,
so it may not always be exactly a no-op on order:0 pages, as said above;
but I think still appropriate to omit it from Kaiser or non-Kaiser pgd.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/pgtable.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e2bd5c81279e..d0a424988f82 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -354,7 +354,9 @@ static inline void _pgd_free(pgd_t *pgd)
 
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+	/* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+					 PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)

From 500943e57db8d3e298e98f595f835c5b613e843b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 4 Dec 2017 20:13:35 -0800
Subject: [PATCH 26/44] kaiser: fix unlikely error in alloc_ldt_struct()

An error from kaiser_add_mapping() here is not at all likely, but
Eric Biggers rightly points out that __free_ldt_struct() relies on
new_ldt->size being initialized: move that up.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/ldt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 89ee37469d41..bc429365b72a 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -79,11 +79,11 @@ static struct ldt_struct *alloc_ldt_struct(int size)
 
 	ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
 				 __PAGE_KERNEL);
+	new_ldt->size = size;
 	if (ret) {
 		__free_ldt_struct(new_ldt);
 		return NULL;
 	}
-	new_ldt->size = size;
 	return new_ldt;
 }
 

From e345dcc9481543edf4a0a5df4c4c2f9597b0a997 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 24 Sep 2017 16:59:49 -0700
Subject: [PATCH 27/44] kaiser: add "nokaiser" boot option, using ALTERNATIVE

Added "nokaiser" boot option: an early param like "noinvpcid".
Most places now check int kaiser_enabled (#defined 0 when not
CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
and entry_64_compat.S are using the ALTERNATIVE technique, which
patches in the preferred instructions at runtime.  That technique
is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated.

Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
won't get set in some obscure corner, or something add PGE into CR4.
By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
all page table setup which uses pte_pfn() masks it out of the ptes.

It's slightly shameful that the same declaration versus definition of
kaiser_enabled appears in not one, not two, but in three header files
(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h).  I felt safer that way,
than with #including any of those in any of the others; and did not
feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
them all, so we shall hear about it if they get out of synch.

Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
from kaiser.c; removed the unused native_get_normal_pgd(); removed
the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
comments.  But more interestingly, set CR4.PSE in secondary_startup_64:
the manual is clear that it does not matter whether it's 0 or 1 when
4-level-pts are enabled, but I was distracted to find cr4 different on
BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask().

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/kernel-parameters.txt  |  2 ++
 arch/x86/entry/entry_64.S            | 15 ++++++-----
 arch/x86/include/asm/cpufeature.h    |  3 +++
 arch/x86/include/asm/kaiser.h        | 27 ++++++++++++++-----
 arch/x86/include/asm/pgtable.h       | 20 +++++++++-----
 arch/x86/include/asm/pgtable_64.h    | 13 +++-------
 arch/x86/include/asm/pgtable_types.h |  4 ---
 arch/x86/include/asm/tlbflush.h      | 39 ++++++++++++++++++----------
 arch/x86/kernel/cpu/common.c         | 28 +++++++++++++++++++-
 arch/x86/kernel/espfix_64.c          |  3 ++-
 arch/x86/kernel/head_64.S            |  4 +--
 arch/x86/mm/init.c                   |  2 +-
 arch/x86/mm/init_64.c                | 10 +++++++
 arch/x86/mm/kaiser.c                 | 26 ++++++++++++++++---
 arch/x86/mm/pgtable.c                |  8 ++----
 arch/x86/mm/tlb.c                    |  4 +--
 16 files changed, 143 insertions(+), 65 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index b4a83a490212..67d4f67f56ba 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
+	nokaiser	[X86-64] Disable KAISER isolation of kernel from user.
+
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
 
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 59d9e5d8c05b..85e30957e494 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1051,7 +1051,7 @@ ENTRY(paranoid_entry)
 	 * unconditionally, but we need to find out whether the reverse
 	 * should be done on return (conveyed to paranoid_exit in %ebx).
 	 */
-	movq	%cr3, %rax
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	testl	$KAISER_SHADOW_PGD_OFFSET, %eax
 	jz	2f
 	orl	$2, %ebx
@@ -1083,6 +1083,7 @@ ENTRY(paranoid_exit)
 	TRACE_IRQS_OFF_DEBUG
 	TRACE_IRQS_IRETQ_DEBUG
 #ifdef CONFIG_KAISER
+	/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
 	testl	$2, %ebx			/* SWITCH_USER_CR3 needed? */
 	jz	paranoid_exit_no_switch
 	SWITCH_USER_CR3
@@ -1315,13 +1316,14 @@ ENTRY(nmi)
 #ifdef CONFIG_KAISER
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
-	movq	%cr3, %rax
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
 	orq	x86_cr3_pcid_noflush, %rax
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
 	movq	%rax, %cr3
+2:
 #endif
 	call	do_nmi
 
@@ -1331,8 +1333,7 @@ ENTRY(nmi)
 	 * kernel code that needs user CR3, but do we ever return
 	 * to "user mode" where we need the kernel CR3?
 	 */
-	popq	%rax
-	mov	%rax, %cr3
+	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
 #endif
 
 	/*
@@ -1559,13 +1560,14 @@ end_repeat_nmi:
 #ifdef CONFIG_KAISER
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
-	movq	%cr3, %rax
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
 	orq	x86_cr3_pcid_noflush, %rax
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
 	movq	%rax, %cr3
+2:
 #endif
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
@@ -1577,8 +1579,7 @@ end_repeat_nmi:
 	 * kernel code that needs user CR3, like just just before
 	 * a sysret.
 	 */
-	popq	%rax
-	mov	%rax, %cr3
+	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
 #endif
 
 	testl	%ebx, %ebx			/* swapgs needed? */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 72a4343b774f..3fc40a43578f 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -200,6 +200,9 @@
 #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
 #define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
 
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 3dc5f4c39b3e..96643a9c194c 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -46,28 +46,33 @@ movq \reg, %cr3
 .endm
 
 .macro SWITCH_KERNEL_CR3
-pushq %rax
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
 _SWITCH_TO_KERNEL_CR3 %rax
 popq %rax
+8:
 .endm
 
 .macro SWITCH_USER_CR3
-pushq %rax
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
 _SWITCH_TO_USER_CR3 %rax %al
 popq %rax
+8:
 .endm
 
 .macro SWITCH_KERNEL_CR3_NO_STACK
-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+ALTERNATIVE "jmp 8f", \
+	__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+	X86_FEATURE_KAISER
 _SWITCH_TO_KERNEL_CR3 %rax
 movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
 .endm
 
 #else /* CONFIG_KAISER */
 
-.macro SWITCH_KERNEL_CR3 reg
+.macro SWITCH_KERNEL_CR3
 .endm
-.macro SWITCH_USER_CR3 reg regb
+.macro SWITCH_USER_CR3
 .endm
 .macro SWITCH_KERNEL_CR3_NO_STACK
 .endm
@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
 
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled	0
+#endif /* CONFIG_KAISER */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
 /**
  *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
  *  @addr: the start address of the range
@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
  */
 extern void kaiser_init(void);
 
-#endif /* CONFIG_KAISER */
-
 #endif /* __ASSEMBLY */
 
 #endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 176035fa057e..051beec179f4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -660,7 +666,7 @@ static inline int pgd_bad(pgd_t pgd)
 	 * page table by accident; it will fault on the first
 	 * instruction it tries to run.  See native_set_pgd().
 	 */
-	if (IS_ENABLED(CONFIG_KAISER))
+	if (kaiser_enabled)
 		ignore_flags |= _PAGE_NX;
 
 	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
@@ -865,12 +871,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
 #ifdef CONFIG_KAISER
-	/* Clone the shadow pgd part as well */
-	memcpy(native_get_shadow_pgd(dst),
-	       native_get_shadow_pgd(src),
-	       count * sizeof(pgd_t));
+	if (kaiser_enabled) {
+		/* Clone the shadow pgd part as well */
+		memcpy(native_get_shadow_pgd(dst),
+			native_get_shadow_pgd(src),
+			count * sizeof(pgd_t));
+	}
 #endif
 }
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 8629be9e6649..233d19c9f22e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
 
 static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 {
+#ifdef CONFIG_DEBUG_VM
+	/* linux/mmdebug.h may not have been included at this point */
+	BUG_ON(!kaiser_enabled);
+#endif
 	return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
 }
-
-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
-{
-	return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
-}
 #else
 static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 	BUILD_BUG_ON(1);
 	return NULL;
 }
-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
-{
-	return pgdp;
-}
 #endif /* CONFIG_KAISER */
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 22704eca1900..22547baa458a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -39,11 +39,7 @@
 #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#ifdef CONFIG_KAISER
-#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
-#else
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index ff8c5eb95caf..b376095a1fd9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -136,9 +136,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
  * to avoid the need for asm/kaiser.h in unexpected places.
  */
 #ifdef CONFIG_KAISER
+extern int kaiser_enabled;
 extern void kaiser_setup_pcid(void);
 extern void kaiser_flush_tlb_on_return_to_user(void);
 #else
+#define kaiser_enabled 0
 static inline void kaiser_setup_pcid(void)
 {
 }
@@ -163,7 +165,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	if (this_cpu_has(X86_FEATURE_PCID))
+	if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
 		kaiser_flush_tlb_on_return_to_user();
 	native_write_cr3(native_read_cr3());
 	preempt_enable();
@@ -174,20 +176,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
 	unsigned long cr4;
 
 	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* clear PGE */
-	native_write_cr4(cr4 & ~X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
+	if (cr4 & X86_CR4_PGE) {
+		/* clear PGE and flush TLB of all entries */
+		native_write_cr4(cr4 & ~X86_CR4_PGE);
+		/* restore PGE as it was before */
+		native_write_cr4(cr4);
+	} else {
+		/*
+		 * x86_64 microcode update comes this way when CR4.PGE is not
+		 * enabled, and it's safer for all callers to allow this case.
+		 */
+		native_write_cr3(native_read_cr3());
+	}
 }
 
 static inline void __native_flush_tlb_global(void)
 {
-#ifdef CONFIG_KAISER
-	/* Globals are not used at all */
-	__native_flush_tlb();
-#else
 	unsigned long flags;
 
+	if (kaiser_enabled) {
+		/* Globals are not used at all */
+		__native_flush_tlb();
+		return;
+	}
+
 	if (this_cpu_has(X86_FEATURE_INVPCID)) {
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
@@ -207,7 +219,6 @@ static inline void __native_flush_tlb_global(void)
 	raw_local_irq_save(flags);
 	__native_flush_tlb_global_irq_disabled();
 	raw_local_irq_restore(flags);
-#endif
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
@@ -222,7 +233,7 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	 */
 
 	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
-		if (this_cpu_has(X86_FEATURE_PCID))
+		if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
 			kaiser_flush_tlb_on_return_to_user();
 		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 		return;
@@ -237,9 +248,9 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	 * Make sure to do only a single invpcid when KAISER is
 	 * disabled and we have only a single ASID.
 	 */
-	if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
-		invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
-	invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+	if (kaiser_enabled)
+		invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+	invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
 }
 
 static inline void __flush_tlb_all(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 46ad2faca9ab..86db04f0ec78 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -178,6 +178,20 @@ static int __init x86_pcid_setup(char *s)
 	return 1;
 }
 __setup("nopcid", x86_pcid_setup);
+
+static int __init x86_nokaiser_setup(char *s)
+{
+	/* nokaiser doesn't accept parameters */
+	if (s)
+		return -EINVAL;
+#ifdef CONFIG_KAISER
+	kaiser_enabled = 0;
+	setup_clear_cpu_cap(X86_FEATURE_KAISER);
+	pr_info("nokaiser: KAISER feature disabled\n");
+#endif
+	return 0;
+}
+early_param("nokaiser", x86_nokaiser_setup);
 #endif
 
 static int __init x86_noinvpcid_setup(char *s)
@@ -324,7 +338,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_PCID)) {
-		if (cpu_has(c, X86_FEATURE_PGE)) {
+		if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
 			cr4_set_bits(X86_CR4_PCIDE);
 			/*
 			 * INVPCID has two "groups" of types:
@@ -747,6 +761,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_power = cpuid_edx(0x80000007);
 
 	init_scattered_cpuid_features(c);
+#ifdef CONFIG_KAISER
+	if (kaiser_enabled)
+		set_cpu_cap(c, X86_FEATURE_KAISER);
+#endif
 }
 
 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1406,6 +1424,14 @@ void cpu_init(void)
 	 * try to read it.
 	 */
 	cr4_init_shadow();
+	if (!kaiser_enabled) {
+		/*
+		 * secondary_startup_64() deferred setting PGE in cr4:
+		 * probe_page_size_mask() sets it on the boot cpu,
+		 * but it needs to be set on each secondary cpu.
+		 */
+		cr4_set_bits(X86_CR4_PGE);
+	}
 
 	/*
 	 * Load microcode on this cpu if a valid microcode is available.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 54de4c70cbd6..b02cb2ec6726 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void)
 	 * area to ensure it is mapped into the shadow user page
 	 * tables.
 	 */
-	if (IS_ENABLED(CONFIG_KAISER))
+	if (kaiser_enabled) {
 		set_pgd(native_get_shadow_pgd(pgd_p),
 			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+	}
 
 	/* Randomize the locations */
 	init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index abbbcfbb5e26..9af949ce8a69 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 1:
 
-	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	/* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+	movl	$(X86_CR4_PAE | X86_CR4_PSE), %ecx
 	movq	%rcx, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ed4b372860e4..2bd45ae91eb3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
-	if (cpu_has_pge) {
+	if (cpu_has_pge && !kaiser_enabled) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	} else
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..d76ec9348cff 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
 			continue;
 		if (vaddr < (unsigned long) _text || vaddr > end)
 			set_pmd(pmd, __pmd(0));
+		else if (kaiser_enabled) {
+			/*
+			 * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+			 * clear that now.  This is not important, so long as
+			 * CR4.PGE remains clear, but it removes an anomaly.
+			 * Physical mapping setup below avoids _PAGE_GLOBAL
+			 * by use of massage_pgprot() inside pfn_pte() etc.
+			 */
+			set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+		}
 	}
 }
 
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 8a5eb5993d3c..45fd51c08a0e 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -17,7 +17,9 @@
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
 
-#ifdef CONFIG_KAISER
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */
+
 __visible
 DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
@@ -168,8 +170,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 	return pte_offset_kernel(pmd, address);
 }
 
-int kaiser_add_user_map(const void *__start_addr, unsigned long size,
-			unsigned long flags)
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+			       unsigned long flags)
 {
 	int ret = 0;
 	pte_t *pte;
@@ -178,6 +180,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
 	unsigned long target_address;
 
+	/*
+	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
+	 * and there is no actual harm from setting _PAGE_GLOBAL, so
+	 * long as CR4.PGE is not set.  But it is nonetheless troubling
+	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+	 * requires that not to be #defined to 0): so mask it off here.
+	 */
+	flags &= ~_PAGE_GLOBAL;
+
 	for (; address < end_addr; address += PAGE_SIZE) {
 		target_address = get_pa_from_mapping(address);
 		if (target_address == -1) {
@@ -264,6 +275,8 @@ void __init kaiser_init(void)
 {
 	int cpu;
 
+	if (!kaiser_enabled)
+		return;
 	kaiser_init_all_pgds();
 
 	for_each_possible_cpu(cpu) {
@@ -312,6 +325,8 @@ void __init kaiser_init(void)
 /* Add a mapping to the shadow mapping, and synchronize the mappings */
 int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 {
+	if (!kaiser_enabled)
+		return 0;
 	return kaiser_add_user_map((const void *)addr, size, flags);
 }
 
@@ -323,6 +338,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size)
 	unsigned long addr, next;
 	pgd_t *pgd;
 
+	if (!kaiser_enabled)
+		return;
 	pgd = native_get_shadow_pgd(pgd_offset_k(start));
 	for (addr = start; addr < end; pgd++, addr = next) {
 		next = pgd_addr_end(addr, end);
@@ -344,6 +361,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp)
 
 pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+	if (!kaiser_enabled)
+		return pgd;
 	/*
 	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
 	 * skip cases like kexec and EFI which make temporary low mappings.
@@ -400,4 +419,3 @@ void kaiser_flush_tlb_on_return_to_user(void)
 			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 }
 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
-#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d0a424988f82..dbc27a2b4ad5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -341,16 +341,12 @@ static inline void _pgd_free(pgd_t *pgd)
 }
 #else
 
-#ifdef CONFIG_KAISER
 /*
- * Instead of one pmd, we aquire two pmds.  Being order-1, it is
+ * Instead of one pgd, Kaiser acquires two pgds.  Being order-1, it is
  * both 8k in size and 8k-aligned.  That lets us just flip bit 12
  * in a pointer to swap between the two 4k halves.
  */
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
+#define PGD_ALLOCATION_ORDER	kaiser_enabled
 
 static inline pgd_t *_pgd_alloc(void)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90ef67a9e34b..6ac065d4230c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 {
 	unsigned long new_mm_cr3 = __pa(pgdir);
 
-#ifdef CONFIG_KAISER
-	if (this_cpu_has(X86_FEATURE_PCID)) {
+	if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
 		/*
 		 * We reuse the same PCID for different tasks, so we must
 		 * flush all the entries for the PCID out when we change tasks.
@@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 		new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
 		kaiser_flush_tlb_on_return_to_user();
 	}
-#endif /* CONFIG_KAISER */
 
 	/*
 	 * Caution: many callers of this function expect

From dea9aa9ffae11c91285335cc3215b4f0e48e8139 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 2 Jan 2018 14:19:48 +0100
Subject: [PATCH 28/44] x86/kaiser: Rename and simplify X86_FEATURE_KAISER
 handling

Concentrate it in arch/x86/mm/kaiser.c and use the upstream string "nopti".

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/kernel-parameters.txt |  2 +-
 arch/x86/kernel/cpu/common.c        | 18 ------------------
 arch/x86/mm/kaiser.c                | 20 +++++++++++++++++++-
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 67d4f67f56ba..453ceddda862 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2523,7 +2523,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
-	nokaiser	[X86-64] Disable KAISER isolation of kernel from user.
+	nopti		[X86-64] Disable KAISER isolation of kernel from user.
 
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 86db04f0ec78..cc154ac64f00 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -178,20 +178,6 @@ static int __init x86_pcid_setup(char *s)
 	return 1;
 }
 __setup("nopcid", x86_pcid_setup);
-
-static int __init x86_nokaiser_setup(char *s)
-{
-	/* nokaiser doesn't accept parameters */
-	if (s)
-		return -EINVAL;
-#ifdef CONFIG_KAISER
-	kaiser_enabled = 0;
-	setup_clear_cpu_cap(X86_FEATURE_KAISER);
-	pr_info("nokaiser: KAISER feature disabled\n");
-#endif
-	return 0;
-}
-early_param("nokaiser", x86_nokaiser_setup);
 #endif
 
 static int __init x86_noinvpcid_setup(char *s)
@@ -761,10 +747,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_power = cpuid_edx(0x80000007);
 
 	init_scattered_cpuid_features(c);
-#ifdef CONFIG_KAISER
-	if (kaiser_enabled)
-		set_cpu_cap(c, X86_FEATURE_KAISER);
-#endif
 }
 
 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 45fd51c08a0e..fb812fd865d8 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -275,8 +275,13 @@ void __init kaiser_init(void)
 {
 	int cpu;
 
-	if (!kaiser_enabled)
+	if (!kaiser_enabled) {
+		setup_clear_cpu_cap(X86_FEATURE_KAISER);
 		return;
+	}
+
+	setup_force_cpu_cap(X86_FEATURE_KAISER);
+
 	kaiser_init_all_pgds();
 
 	for_each_possible_cpu(cpu) {
@@ -419,3 +424,16 @@ void kaiser_flush_tlb_on_return_to_user(void)
 			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 }
 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+
+static int __init x86_nokaiser_setup(char *s)
+{
+	/* nopti doesn't accept parameters */
+	if (s)
+		return -EINVAL;
+
+	kaiser_enabled = 0;
+	pr_info("Kernel/User page tables isolation: disabled\n");
+
+	return 0;
+}
+early_param("nopti", x86_nokaiser_setup);

From e405a064bd7d6eca88935342ddb71057a9d6ceab Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 2 Jan 2018 14:19:48 +0100
Subject: [PATCH 29/44] x86/kaiser: Check boottime cmdline params

AMD (and possibly other vendors) are not affected by the leak
KAISER is protecting against.

Keep the "nopti" for traditional reasons and add pti=<on|off|auto>
like upstream.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/kernel-parameters.txt |  6 +++
 arch/x86/mm/kaiser.c                | 59 ++++++++++++++++++++---------
 2 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 453ceddda862..5977c4d71356 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3056,6 +3056,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control KAISER user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index fb812fd865d8..604aa48c8946 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -16,6 +16,7 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
+#include <asm/cmdline.h>
 
 int kaiser_enabled __read_mostly = 1;
 EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */
@@ -264,6 +265,43 @@ static void __init kaiser_init_all_pgds(void)
 	WARN_ON(__ret);							\
 } while (0)
 
+void __init kaiser_check_boottime_disable(void)
+{
+	bool enable = true;
+	char arg[5];
+	int ret;
+
+	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+	if (ret > 0) {
+		if (!strncmp(arg, "on", 2))
+			goto enable;
+
+		if (!strncmp(arg, "off", 3))
+			goto disable;
+
+		if (!strncmp(arg, "auto", 4))
+			goto skip;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "nopti"))
+		goto disable;
+
+skip:
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		goto disable;
+
+enable:
+	if (enable)
+		setup_force_cpu_cap(X86_FEATURE_KAISER);
+
+	return;
+
+disable:
+	pr_info("Kernel/User page tables isolation: disabled\n");
+	kaiser_enabled = 0;
+	setup_clear_cpu_cap(X86_FEATURE_KAISER);
+}
+
 /*
  * If anything in here fails, we will likely die on one of the
  * first kernel->user transitions and init will die.  But, we
@@ -275,12 +313,10 @@ void __init kaiser_init(void)
 {
 	int cpu;
 
-	if (!kaiser_enabled) {
-		setup_clear_cpu_cap(X86_FEATURE_KAISER);
-		return;
-	}
+	kaiser_check_boottime_disable();
 
-	setup_force_cpu_cap(X86_FEATURE_KAISER);
+	if (!kaiser_enabled)
+		return;
 
 	kaiser_init_all_pgds();
 
@@ -424,16 +460,3 @@ void kaiser_flush_tlb_on_return_to_user(void)
 			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 }
 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
-
-static int __init x86_nokaiser_setup(char *s)
-{
-	/* nopti doesn't accept parameters */
-	if (s)
-		return -EINVAL;
-
-	kaiser_enabled = 0;
-	pr_info("Kernel/User page tables isolation: disabled\n");
-
-	return 0;
-}
-early_param("nopti", x86_nokaiser_setup);

From 2dff99eb0335f9e0817410696a180dba25ca7371 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2017 20:49:04 -0700
Subject: [PATCH 30/44] kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush

Now that we're playing the ALTERNATIVE game, use that more efficient
method: instead of user-mapping an extra page, and reading an extra
cacheline each time for x86_cr3_pcid_noflush.

Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax)
is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs;
but the one line with $63 in looks clearer, so let's stick with that.

Worried about what happens with an ALTERNATIVE between the jump and
jump label in another ALTERNATIVE?  I was, but have checked the
combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64,
and it does a good job.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S     |  7 ++++---
 arch/x86/include/asm/kaiser.h |  6 +++---
 arch/x86/mm/kaiser.c          | 11 +----------
 3 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 85e30957e494..ad33073ce9d9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1056,7 +1056,8 @@ ENTRY(paranoid_entry)
 	jz	2f
 	orl	$2, %ebx
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
-	orq	x86_cr3_pcid_noflush, %rax
+	/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+	ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
 	movq	%rax, %cr3
 2:
 #endif
@@ -1318,7 +1319,7 @@ ENTRY(nmi)
 	/* %rax is saved above, so OK to clobber here */
 	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
-	orq	x86_cr3_pcid_noflush, %rax
+	ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
@@ -1562,7 +1563,7 @@ end_repeat_nmi:
 	/* %rax is saved above, so OK to clobber here */
 	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
-	orq	x86_cr3_pcid_noflush, %rax
+	ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 96643a9c194c..906150d6094e 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -25,7 +25,8 @@
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
 andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
-orq  x86_cr3_pcid_noflush, \reg
+/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
 movq \reg, %cr3
 .endm
 
@@ -39,7 +40,7 @@ movq \reg, %cr3
 movq %cr3, \reg
 orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
 js   9f
-/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
 movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
 9:
 movq \reg, %cr3
@@ -90,7 +91,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 */
 DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
-extern unsigned long x86_cr3_pcid_noflush;
 DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 604aa48c8946..c7d4a1258d1d 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -32,7 +32,6 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
  * This is also handy because systems that do not support PCIDs
  * just end up or'ing a 0 into their CR3, which does no harm.
  */
-unsigned long x86_cr3_pcid_noflush __read_mostly;
 DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 
 /*
@@ -357,10 +356,6 @@ void __init kaiser_init(void)
 	kaiser_add_user_map_early(&debug_idt_table,
 				  sizeof(gate_desc) * NR_VECTORS,
 				  __PAGE_KERNEL);
-
-	kaiser_add_user_map_early(&x86_cr3_pcid_noflush,
-				  sizeof(x86_cr3_pcid_noflush),
-				  __PAGE_KERNEL);
 }
 
 /* Add a mapping to the shadow mapping, and synchronize the mappings */
@@ -434,18 +429,14 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 
 void kaiser_setup_pcid(void)
 {
-	unsigned long kern_cr3 = 0;
 	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
 
-	if (this_cpu_has(X86_FEATURE_PCID)) {
-		kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+	if (this_cpu_has(X86_FEATURE_PCID))
 		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
-	}
 	/*
 	 * These variables are used by the entry/exit
 	 * code to change PCID and pgd and TLB flushing.
 	 */
-	x86_cr3_pcid_noflush = kern_cr3;
 	this_cpu_write(x86_cr3_pcid_user, user_cr3);
 }
 

From 28c6de5441740f868a5b371804a0e8dde03757fb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 29 Oct 2017 11:36:19 -0700
Subject: [PATCH 31/44] kaiser: drop is_atomic arg to kaiser_pagetable_walk()

I have not observed a might_sleep() warning from setup_fixmap_gdt()'s
use of kaiser_add_mapping() in our tree (why not?), but like upstream
we have not provided a way for that to pass is_atomic true down to
kaiser_pagetable_walk(), and at startup it's far from a likely source
of trouble: so just delete the walk's is_atomic arg and might_sleep().

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index c7d4a1258d1d..c64bfef99ee8 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -108,19 +108,13 @@ static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
  *
  * Returns a pointer to a PTE on success, or NULL on failure.
  */
-static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
+static pte_t *kaiser_pagetable_walk(unsigned long address)
 {
 	pmd_t *pmd;
 	pud_t *pud;
 	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 
-	if (is_atomic) {
-		gfp &= ~GFP_KERNEL;
-		gfp |= __GFP_HIGH | __GFP_ATOMIC;
-	} else
-		might_sleep();
-
 	if (pgd_none(*pgd)) {
 		WARN_ONCE(1, "All shadow pgds should have been populated");
 		return NULL;
@@ -195,7 +189,7 @@ static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 			ret = -EIO;
 			break;
 		}
-		pte = kaiser_pagetable_walk(address, false);
+		pte = kaiser_pagetable_walk(address);
 		if (!pte) {
 			ret = -ENOMEM;
 			break;

From 0651b3ad99dd59269e2ec883338ab8fba617e203 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 4 Nov 2017 18:23:24 -0700
Subject: [PATCH 32/44] kaiser: asm/tlbflush.h handle noPGE at lower level

I found asm/tlbflush.h too twisty, and think it safer not to avoid
__native_flush_tlb_global_irq_disabled() in the kaiser_enabled case,
but instead let it handle kaiser_enabled along with cr3: it can just
use __native_flush_tlb() for that, no harm in re-disabling preemption.

(This is not the same change as Kirill and Dave have suggested for
upstream, flipping PGE in cr4: that's neat, but needs a cpu_has_pge
check; cr3 is enough for kaiser, and thought to be cheaper than cr4.)

Also delete the X86_FEATURE_INVPCID invpcid_flush_all_nonglobals()
preference from __native_flush_tlb(): unlike the invpcid_flush_all()
preference in __native_flush_tlb_global(), it's not seen in upstream
4.14, and was recently reported to be surprisingly slow.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/tlbflush.h | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b376095a1fd9..6fdc8c399601 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -151,14 +151,6 @@ static inline void kaiser_flush_tlb_on_return_to_user(void)
 
 static inline void __native_flush_tlb(void)
 {
-	if (this_cpu_has(X86_FEATURE_INVPCID)) {
-		/*
-		 * Note, this works with CR4.PCIDE=0 or 1.
-		 */
-		invpcid_flush_all_nonglobals();
-		return;
-	}
-
 	/*
 	 * If current->mm == NULL then we borrow a mm which may change during a
 	 * task switch and therefore we must not be preempted while we write CR3
@@ -182,11 +174,8 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
 		/* restore PGE as it was before */
 		native_write_cr4(cr4);
 	} else {
-		/*
-		 * x86_64 microcode update comes this way when CR4.PGE is not
-		 * enabled, and it's safer for all callers to allow this case.
-		 */
-		native_write_cr3(native_read_cr3());
+		/* do it with cr3, letting kaiser flush user PCID */
+		__native_flush_tlb();
 	}
 }
 
@@ -194,12 +183,6 @@ static inline void __native_flush_tlb_global(void)
 {
 	unsigned long flags;
 
-	if (kaiser_enabled) {
-		/* Globals are not used at all */
-		__native_flush_tlb();
-		return;
-	}
-
 	if (this_cpu_has(X86_FEATURE_INVPCID)) {
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
@@ -255,11 +238,7 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 
 static inline void __flush_tlb_all(void)
 {
-	if (cpu_has_pge)
-		__flush_tlb_global();
-	else
-		__flush_tlb();
-
+	__flush_tlb_global();
 	/*
 	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
 	 * we'd end up flushing kernel translations for the current ASID but

From 8eaca4c7d9f167209a9cc568ff028c0a3b0deb2d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 4 Nov 2017 18:43:06 -0700
Subject: [PATCH 33/44] kaiser: kaiser_flush_tlb_on_return_to_user() check PCID

Let kaiser_flush_tlb_on_return_to_user() do the X86_FEATURE_PCID
check, instead of each caller doing it inline first: nobody needs
to optimize for the noPCID case, it's clearer this way, and better
suits later changes.  Replace those no-op X86_CR3_PCID_KERN_FLUSH lines
by a BUILD_BUG_ON() in load_new_mm_cr3(), in case something changes.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/tlbflush.h | 4 ++--
 arch/x86/mm/kaiser.c            | 6 +++---
 arch/x86/mm/tlb.c               | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6fdc8c399601..73865b174090 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -157,7 +157,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+	if (kaiser_enabled)
 		kaiser_flush_tlb_on_return_to_user();
 	native_write_cr3(native_read_cr3());
 	preempt_enable();
@@ -216,7 +216,7 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	 */
 
 	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
-		if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+		if (kaiser_enabled)
 			kaiser_flush_tlb_on_return_to_user();
 		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 		return;
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index c64bfef99ee8..9d6b7517fca5 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -436,12 +436,12 @@ void kaiser_setup_pcid(void)
 
 /*
  * Make a note that this cpu will need to flush USER tlb on return to user.
- * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
- * if cpu does not, then the NOFLUSH bit will never have been set.
+ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
  */
 void kaiser_flush_tlb_on_return_to_user(void)
 {
-	this_cpu_write(x86_cr3_pcid_user,
+	if (this_cpu_has(X86_FEATURE_PCID))
+		this_cpu_write(x86_cr3_pcid_user,
 			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 }
 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6ac065d4230c..7cad01af6dcd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -39,7 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 {
 	unsigned long new_mm_cr3 = __pa(pgdir);
 
-	if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
+	if (kaiser_enabled) {
 		/*
 		 * We reuse the same PCID for different tasks, so we must
 		 * flush all the entries for the PCID out when we change tasks.
@@ -50,10 +50,10 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 		 * do it here, but can only be used if X86_FEATURE_INVPCID is
 		 * available - and many machines support pcid without invpcid.
 		 *
-		 * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
-		 * but keep that line in there in case something changes.
+		 * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
+		 * would be needed in the write_cr3() below - if PCIDs enabled.
 		 */
-		new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+		BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
 		kaiser_flush_tlb_on_return_to_user();
 	}
 

From 3e809caffdd7beeac731feb16788873c3bdb811e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:30 +0100
Subject: [PATCH 34/44] x86/paravirt: Dont patch flush_tlb_single

commit a035795499ca1c2bd1928808d1a156eda1420383 upstream

native_flush_tlb_single() will be changed with the upcoming
PAGE_TABLE_ISOLATION feature. This requires to have more code in
there than INVLPG.

Remove the paravirt patching for it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Cc: michael.schwarz@iaik.tugraz.at
Cc: moritz.lipp@iaik.tugraz.at
Cc: richard.fellner@student.tugraz.at
Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/paravirt_patch_64.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 8aa05583bc42..0677bf8d3a42 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
-		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 		PATCH_SITE(pv_cpu_ops, wbinvd);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):

From 750fb627d764eb66430c36961b94ab0002694c02 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 2 Jan 2018 14:19:49 +0100
Subject: [PATCH 35/44] x86/kaiser: Reenable PARAVIRT

Now that the required bits have been addressed, reenable
PARAVIRT.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 security/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/Kconfig b/security/Kconfig
index 8d5d2407be7e..89f2d35367b2 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -34,7 +34,7 @@ config SECURITY
 config KAISER
 	bool "Remove the kernel mapping in user mode"
 	default y
-	depends on X86_64 && SMP && !PARAVIRT
+	depends on X86_64 && SMP
 	help
 	  This enforces a strict kernel and user space isolation, in order
 	  to close hardware side channels on kernel address information.

From e4ba212ec64109b17fb8653ccfa2ed2c6e3e8217 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Tue, 2 Jan 2018 14:19:49 +0100
Subject: [PATCH 36/44] kaiser: disabled on Xen PV

Kaiser cannot be used on paravirtualized MMUs (namely reading and writing CR3).
This does not work with KAISER as the CR3 switch from and to user space PGD
would require to map the whole XEN_PV machinery into both.

More importantly, enabling KAISER on Xen PV doesn't make too much sense, as PV
guests use distinct %cr3 values for kernel and user already.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 9d6b7517fca5..6a2e00a80105 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -264,6 +264,9 @@ void __init kaiser_check_boottime_disable(void)
 	char arg[5];
 	int ret;
 
+	if (boot_cpu_has(X86_FEATURE_XENPV))
+		goto silent_disable;
+
 	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
 	if (ret > 0) {
 		if (!strncmp(arg, "on", 2))
@@ -291,6 +294,8 @@ enable:
 
 disable:
 	pr_info("Kernel/User page tables isolation: disabled\n");
+
+silent_disable:
 	kaiser_enabled = 0;
 	setup_clear_cpu_cap(X86_FEATURE_KAISER);
 }

From 7f79599df9c4a36130f7a4f6778b334a97632477 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 25 Dec 2017 13:57:16 +0100
Subject: [PATCH 37/44] x86/kaiser: Move feature detection up

... before the first use of kaiser_enabled as otherwise funky
things happen:

  about to get started...
  (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0000]
  (XEN) Pagetable walk from ffff88022a449090:
  (XEN)  L4[0x110] = 0000000229e0e067 0000000000001e0e
  (XEN)  L3[0x008] = 0000000000000000 ffffffffffffffff
  (XEN) domain_crash_sync called from entry.S: fault at ffff82d08033fd08
  entry.o#create_bounce_frame+0x135/0x14d
  (XEN) Domain 0 (vcpu#0) crashed on cpu#0:
  (XEN) ----[ Xen-4.9.1_02-3.21  x86_64  debug=n   Not tainted ]----
  (XEN) CPU:    0
  (XEN) RIP:    e033:[<ffffffff81007460>]
  (XEN) RFLAGS: 0000000000000286   EM: 1   CONTEXT: pv guest (d0v0)

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/kaiser.h | 2 ++
 arch/x86/kernel/setup.c       | 7 +++++++
 arch/x86/mm/kaiser.c          | 2 --
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 906150d6094e..b5e46aa683f4 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -96,8 +96,10 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
 
 extern int kaiser_enabled;
+extern void __init kaiser_check_boottime_disable(void);
 #else
 #define kaiser_enabled	0
+static inline void __init kaiser_check_boottime_disable(void) {}
 #endif /* CONFIG_KAISER */
 
 /*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e67b834279b2..bbaae4cf9e8e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
 #include <asm/alternative.h>
 #include <asm/prom.h>
 #include <asm/microcode.h>
+#include <asm/kaiser.h>
 
 /*
  * max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	init_hypervisor_platform();
 
+	/*
+	 * This needs to happen right after XENPV is set on xen and
+	 * kaiser_enabled is checked below in cleanup_highmap().
+	 */
+	kaiser_check_boottime_disable();
+
 	x86_init.resources.probe_roms();
 
 	/* after parse_early_param, so could debug it */
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 6a2e00a80105..d86d56f5e82e 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -311,8 +311,6 @@ void __init kaiser_init(void)
 {
 	int cpu;
 
-	kaiser_check_boottime_disable();
-
 	if (!kaiser_enabled)
 		return;
 

From 3e1457d6bf26d9ec300781f84cd0057e44deb45d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jan 2018 10:43:15 -0800
Subject: [PATCH 38/44] KPTI: Rename to PAGE_TABLE_ISOLATION

This renames CONFIG_KAISER to CONFIG_PAGE_TABLE_ISOLATION.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/boot/compressed/misc.h           |  2 +-
 arch/x86/entry/entry_64.S                 | 12 ++++++------
 arch/x86/include/asm/cpufeature.h         |  2 +-
 arch/x86/include/asm/kaiser.h             | 12 ++++++------
 arch/x86/include/asm/pgtable.h            |  4 ++--
 arch/x86/include/asm/pgtable_64.h         |  4 ++--
 arch/x86/include/asm/pgtable_types.h      |  2 +-
 arch/x86/include/asm/tlbflush.h           |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |  4 ++--
 arch/x86/kernel/head_64.S                 |  2 +-
 arch/x86/mm/Makefile                      |  2 +-
 include/linux/kaiser.h                    |  6 +++---
 include/linux/percpu-defs.h               |  2 +-
 security/Kconfig                          |  2 +-
 14 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 4bf52d351022..4abb284a5b9c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,7 +9,7 @@
  */
 #undef CONFIG_PARAVIRT
 #undef CONFIG_PARAVIRT_SPINLOCKS
-#undef CONFIG_KAISER
+#undef CONFIG_PAGE_TABLE_ISOLATION
 #undef CONFIG_KASAN
 
 #include <linux/linkage.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ad33073ce9d9..952b23b5d4e9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1043,7 +1043,7 @@ ENTRY(paranoid_entry)
 	SWAPGS
 	xorl	%ebx, %ebx
 1:
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/*
 	 * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
 	 * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
@@ -1083,7 +1083,7 @@ ENTRY(paranoid_exit)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
 	TRACE_IRQS_IRETQ_DEBUG
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
 	testl	$2, %ebx			/* SWITCH_USER_CR3 needed? */
 	jz	paranoid_exit_no_switch
@@ -1314,7 +1314,7 @@ ENTRY(nmi)
 
 	movq	%rsp, %rdi
 	movq	$-1, %rsi
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
 	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
@@ -1328,7 +1328,7 @@ ENTRY(nmi)
 #endif
 	call	do_nmi
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/*
 	 * Unconditionally restore CR3.  I know we return to
 	 * kernel code that needs user CR3, but do we ever return
@@ -1558,7 +1558,7 @@ end_repeat_nmi:
 1:
 	movq	%rsp, %rdi
 	movq	$-1, %rsi
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
 	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
@@ -1574,7 +1574,7 @@ end_repeat_nmi:
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	call	do_nmi
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/*
 	 * Unconditionally restore CR3.  We might be returning to
 	 * kernel code that needs user CR3, like just just before
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3fc40a43578f..f6605712ca90 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -201,7 +201,7 @@
 #define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
 
 /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
-#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index b5e46aa683f4..802bbbdfe143 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -20,7 +20,7 @@
 #define KAISER_SHADOW_PGD_OFFSET 0x1000
 
 #ifdef __ASSEMBLY__
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
@@ -69,7 +69,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 8:
 .endm
 
-#else /* CONFIG_KAISER */
+#else /* CONFIG_PAGE_TABLE_ISOLATION */
 
 .macro SWITCH_KERNEL_CR3
 .endm
@@ -78,11 +78,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
 .macro SWITCH_KERNEL_CR3_NO_STACK
 .endm
 
-#endif /* CONFIG_KAISER */
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
 
 #else /* __ASSEMBLY__ */
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 /*
  * Upon kernel/user mode switch, it may happen that the address
  * space has to be switched before the registers have been
@@ -100,10 +100,10 @@ extern void __init kaiser_check_boottime_disable(void);
 #else
 #define kaiser_enabled	0
 static inline void __init kaiser_check_boottime_disable(void) {}
-#endif /* CONFIG_KAISER */
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
 
 /*
- * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
+ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
  * so as to build with tests on kaiser_enabled instead of #ifdefs.
  */
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 051beec179f4..84c62d950023 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,7 +18,7 @@
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 extern int kaiser_enabled;
 #else
 #define kaiser_enabled 0
@@ -872,7 +872,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
 	memcpy(dst, src, count * sizeof(pgd_t));
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	if (kaiser_enabled) {
 		/* Clone the shadow pgd part as well */
 		memcpy(native_get_shadow_pgd(dst),
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 233d19c9f22e..c810226e741a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,7 +106,7 @@ static inline void native_pud_clear(pud_t *pud)
 	native_set_pud(pud, native_make_pud(0));
 }
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
 
 static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
@@ -127,7 +127,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 	BUILD_BUG_ON(1);
 	return NULL;
 }
-#endif /* CONFIG_KAISER */
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 22547baa458a..8dba273da25a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -109,7 +109,7 @@
 #define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
 #define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
 
-#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
 /* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
 #define X86_CR3_PCID_ASID_USER	(_AC(0x80,UL))
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 73865b174090..a691b66cc40a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -135,7 +135,7 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
  * Declare a couple of kaiser interfaces here for convenience,
  * to avoid the need for asm/kaiser.h in unexpected places.
  */
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 extern int kaiser_enabled;
 extern void kaiser_setup_pcid(void);
 extern void kaiser_flush_tlb_on_return_to_user(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 4e334e3ac16f..f01b3a12dce0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -274,7 +274,7 @@ static DEFINE_PER_CPU(void *, insn_buffer);
 
 static void *dsalloc(size_t size, gfp_t flags, int node)
 {
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	unsigned int order = get_order(size);
 	struct page *page;
 	unsigned long addr;
@@ -295,7 +295,7 @@ static void *dsalloc(size_t size, gfp_t flags, int node)
 
 static void dsfree(const void *buffer, size_t size)
 {
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	if (!buffer)
 		return;
 	kaiser_remove_mapping((unsigned long)buffer, size);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9af949ce8a69..4034e905741a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -441,7 +441,7 @@ early_idt_ripmsg:
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 /*
  * Each PGD needs to be 8k long and 8k aligned.  We do not
  * ever go out to userspace with these, so we do not
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 978156081bed..61e6cead9c4a 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,4 +32,4 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_KAISER)		+= kaiser.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= kaiser.o
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
index 4a4d6d911a14..58c55b1589d0 100644
--- a/include/linux/kaiser.h
+++ b/include/linux/kaiser.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_KAISER_H
 #define _LINUX_KAISER_H
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 #include <asm/kaiser.h>
 
 static inline int kaiser_map_thread_stack(void *stack)
@@ -24,7 +24,7 @@ static inline void kaiser_unmap_thread_stack(void *stack)
 #else
 
 /*
- * These stubs are used whenever CONFIG_KAISER is off, which
+ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
  * includes architectures that support KAISER, but have it disabled.
  */
 
@@ -48,5 +48,5 @@ static inline void kaiser_unmap_thread_stack(void *stack)
 {
 }
 
-#endif /* !CONFIG_KAISER */
+#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
 #endif /* _LINUX_KAISER_H */
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index cfe13cb4ec63..8902f23bb770 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,7 +35,7 @@
 
 #endif
 
-#ifdef CONFIG_KAISER
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 #define USER_MAPPED_SECTION "..user_mapped"
 #else
 #define USER_MAPPED_SECTION ""
diff --git a/security/Kconfig b/security/Kconfig
index 89f2d35367b2..a3ebb6ee5bd5 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -31,7 +31,7 @@ config SECURITY
 
 	  If you are unsure how to answer this question, answer N.
 
-config KAISER
+config PAGE_TABLE_ISOLATION
 	bool "Remove the kernel mapping in user mode"
 	default y
 	depends on X86_64 && SMP

From bfd51a4d715b6ef44bd01b9fbfc13da936f93d76 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jan 2018 10:43:32 -0800
Subject: [PATCH 39/44] KPTI: Report when enabled

Make sure dmesg reports when KPTI is enabled.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index d86d56f5e82e..bf6a915f60e7 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -11,6 +11,9 @@
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
 #include <asm/kaiser.h>
 #include <asm/tlbflush.h>	/* to verify its kaiser declarations */
 #include <asm/pgtable.h>
@@ -293,7 +296,7 @@ enable:
 	return;
 
 disable:
-	pr_info("Kernel/User page tables isolation: disabled\n");
+	pr_info("disabled\n");
 
 silent_disable:
 	kaiser_enabled = 0;
@@ -353,6 +356,8 @@ void __init kaiser_init(void)
 	kaiser_add_user_map_early(&debug_idt_table,
 				  sizeof(gate_desc) * NR_VECTORS,
 				  __PAGE_KERNEL);
+
+	pr_info("enabled\n");
 }
 
 /* Add a mapping to the shadow mapping, and synchronize the mappings */

From 64e239804e21901f1a171681269460878bb5f198 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 10 Dec 2015 19:20:19 -0800
Subject: [PATCH 40/44] x86, vdso, pvclock: Simplify and speed up the vdso
 pvclock reader

commit 6b078f5de7fc0851af4102493c7b5bb07e49c4cb upstream.

The pvclock vdso code was too abstracted to understand easily
and excessively paranoid.  Simplify it for a huge speedup.

This opens the door for additional simplifications, as the vdso
no longer accesses the pvti for any vcpu other than vcpu 0.

Before, vclock_gettime using kvm-clock took about 45ns on my
machine. With this change, it takes 29ns, which is almost as
fast as the pure TSC implementation.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/6b51dcc41f1b101f963945c5ec7093d72bdac429.1449702533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Jamie Iles <jamie.iles@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/vdso/vclock_gettime.c | 79 ++++++++++++++++------------
 1 file changed, 45 insertions(+), 34 deletions(-)

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index ca94fa649251..c325ba1bdddf 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -78,47 +78,58 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
 
 static notrace cycle_t vread_pvclock(int *mode)
 {
-	const struct pvclock_vsyscall_time_info *pvti;
+	const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti;
 	cycle_t ret;
-	u64 last;
-	u32 version;
-	u8 flags;
-	unsigned cpu, cpu1;
-
+	u64 tsc, pvti_tsc;
+	u64 last, delta, pvti_system_time;
+	u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
 
 	/*
-	 * Note: hypervisor must guarantee that:
-	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-	 * 2. that per-CPU pvclock time info is updated if the
-	 *    underlying CPU changes.
-	 * 3. that version is increased whenever underlying CPU
-	 *    changes.
+	 * Note: The kernel and hypervisor must guarantee that cpu ID
+	 * number maps 1:1 to per-CPU pvclock time info.
 	 *
+	 * Because the hypervisor is entirely unaware of guest userspace
+	 * preemption, it cannot guarantee that per-CPU pvclock time
+	 * info is updated if the underlying CPU changes or that that
+	 * version is increased whenever underlying CPU changes.
+	 *
+	 * On KVM, we are guaranteed that pvti updates for any vCPU are
+	 * atomic as seen by *all* vCPUs.  This is an even stronger
+	 * guarantee than we get with a normal seqlock.
+	 *
+	 * On Xen, we don't appear to have that guarantee, but Xen still
+	 * supplies a valid seqlock using the version field.
+
+	 * We only do pvclock vdso timing at all if
+	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+	 * mean that all vCPUs have matching pvti and that the TSC is
+	 * synced, so we can just look at vCPU 0's pvti.
 	 */
-	do {
-		cpu = __getcpu() & VGETCPU_CPU_MASK;
-		/* TODO: We can put vcpu id into higher bits of pvti.version.
-		 * This will save a couple of cycles by getting rid of
-		 * __getcpu() calls (Gleb).
-		 */
 
-		pvti = get_pvti(cpu);
-
-		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-
-		/*
-		 * Test we're still on the cpu as well as the version.
-		 * We could have been migrated just after the first
-		 * vgetcpu but before fetching the version, so we
-		 * wouldn't notice a version change.
-		 */
-		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-	} while (unlikely(cpu != cpu1 ||
-			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version));
-
-	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+	if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
 		*mode = VCLOCK_NONE;
+		return 0;
+	}
+
+	do {
+		version = pvti->version;
+
+		/* This is also a read barrier, so we'll read version first. */
+		tsc = rdtsc_ordered();
+
+		pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
+		pvti_tsc_shift = pvti->tsc_shift;
+		pvti_system_time = pvti->system_time;
+		pvti_tsc = pvti->tsc_timestamp;
+
+		/* Make sure that the version double-check is last. */
+		smp_rmb();
+	} while (unlikely((version & 1) || version != pvti->version));
+
+	delta = tsc - pvti_tsc;
+	ret = pvti_system_time +
+		pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
+				    pvti_tsc_shift);
 
 	/* refer to tsc.c read_tsc() comment for rationale */
 	last = gtod->cycle_last;

From 755bd549d9328d6d1e949a0a213f9a78e84d11fc Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 10 Dec 2015 19:20:20 -0800
Subject: [PATCH 41/44] x86/vdso: Get pvclock data from the vvar VMA instead of
 the fixmap

commit dac16fba6fc590fa7239676b35ed75dae4c4cd2b upstream.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9d37826fdc7e2d2809efe31d5345f97186859284.1449702533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Jamie Iles <jamie.iles@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/vdso/vclock_gettime.c  | 20 ++++++++------------
 arch/x86/entry/vdso/vdso-layout.lds.S |  3 ++-
 arch/x86/entry/vdso/vdso2c.c          |  3 +++
 arch/x86/entry/vdso/vma.c             | 13 +++++++++++++
 arch/x86/include/asm/pvclock.h        |  9 +++++++++
 arch/x86/include/asm/vdso.h           |  1 +
 arch/x86/kernel/kvmclock.c            |  5 +++++
 7 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index c325ba1bdddf..5dd363d54348 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
 }
 #endif
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page
+	__attribute__((visibility("hidden")));
+#endif
+
 #ifndef BUILD_VDSO32
 
 #include <linux/kernel.h>
@@ -62,23 +67,14 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 
 #ifdef CONFIG_PARAVIRT_CLOCK
 
-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
 {
-	const struct pvclock_vsyscall_time_info *pvti_base;
-	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
-	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
-
-	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
-
-	pvti_base = (struct pvclock_vsyscall_time_info *)
-		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
-
-	return &pvti_base[offset];
+	return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
 }
 
 static notrace cycle_t vread_pvclock(int *mode)
 {
-	const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti;
+	const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
 	cycle_t ret;
 	u64 tsc, pvti_tsc;
 	u64 last, delta, pvti_system_time;
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921025f5..4158acc17df0 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
 	 * segment.
 	 */
 
-	vvar_start = . - 2 * PAGE_SIZE;
+	vvar_start = . - 3 * PAGE_SIZE;
 	vvar_page = vvar_start;
 
 	/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
 #undef EMIT_VVAR
 
 	hpet_page = vvar_start + PAGE_SIZE;
+	pvclock_page = vvar_start + 2 * PAGE_SIZE;
 
 	. = SIZEOF_HEADERS;
 
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 785d9922b106..491020b2826d 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -73,6 +73,7 @@ enum {
 	sym_vvar_start,
 	sym_vvar_page,
 	sym_hpet_page,
+	sym_pvclock_page,
 	sym_VDSO_FAKE_SECTION_TABLE_START,
 	sym_VDSO_FAKE_SECTION_TABLE_END,
 };
@@ -80,6 +81,7 @@ enum {
 const int special_pages[] = {
 	sym_vvar_page,
 	sym_hpet_page,
+	sym_pvclock_page,
 };
 
 struct vdso_sym {
@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
 	[sym_vvar_start] = {"vvar_start", true},
 	[sym_vvar_page] = {"vvar_page", true},
 	[sym_hpet_page] = {"hpet_page", true},
+	[sym_pvclock_page] = {"pvclock_page", true},
 	[sym_VDSO_FAKE_SECTION_TABLE_START] = {
 		"VDSO_FAKE_SECTION_TABLE_START", false
 	},
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 64df47148160..aa828191c654 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 		.name = "[vvar]",
 		.pages = no_pages,
 	};
+	struct pvclock_vsyscall_time_info *pvti;
 
 	if (calculate_addr) {
 		addr = vdso_addr(current->mm->start_stack,
@@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 	}
 #endif
 
+	pvti = pvclock_pvti_cpu0_va();
+	if (pvti && image->sym_pvclock_page) {
+		ret = remap_pfn_range(vma,
+				      text_start + image->sym_pvclock_page,
+				      __pa(pvti) >> PAGE_SHIFT,
+				      PAGE_SIZE,
+				      PAGE_READONLY);
+
+		if (ret)
+			goto up_fail;
+	}
+
 up_fail:
 	if (ret)
 		current->mm->context.vdso = NULL;
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index baad72e4c100..6045cef376c2 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -4,6 +4,15 @@
 #include <linux/clocksource.h>
 #include <asm/pvclock-abi.h>
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+	return NULL;
+}
+#endif
+
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
 u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 756de9190aec..deabaf9759b6 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -22,6 +22,7 @@ struct vdso_image {
 
 	long sym_vvar_page;
 	long sym_hpet_page;
+	long sym_pvclock_page;
 	long sym_VDSO32_NOTE_MASK;
 	long sym___kernel_sigreturn;
 	long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2bd81e302427..ec1b06dc82d2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
 static struct pvclock_vsyscall_time_info *hv_clock;
 static struct pvclock_wall_clock wall_clock;
 
+struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+	return hv_clock;
+}
+
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
  * have elapsed since the hypervisor wrote the data. So we try to account for

From 2b24fe5c57af4d20650e1c3157305459fc5f4afc Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Mon, 11 Jan 2016 15:51:18 +0300
Subject: [PATCH 42/44] x86/kasan: Clear kasan_zero_page after TLB flush

commit 69e0210fd01ff157d332102219aaf5c26ca8069b upstream.

Currently we clear kasan_zero_page before __flush_tlb_all(). This
works with current implementation of native_flush_tlb[_global]()
because it doesn't cause do any writes to kasan shadow memory.
But any subtle change made in native_flush_tlb*() could break this.
Also current code seems doesn't work for paravirt guests (lguest).

Only after the TLB flush we can be sure that kasan_zero_page is not
used as early shadow anymore (instrumented code will not write to it).
So it should cleared it only after the TLB flush.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1452516679-32040-2-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Jamie Iles <jamie.iles@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kasan_init_64.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 4e5ac46adc9d..81ec7c02f968 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -121,11 +121,16 @@ void __init kasan_init(void)
 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
 			(void *)KASAN_SHADOW_END);
 
-	memset(kasan_zero_page, 0, PAGE_SIZE);
-
 	load_cr3(init_level4_pgt);
 	__flush_tlb_all();
-	init_task.kasan_depth = 0;
 
+	/*
+	 * kasan_zero_page has been used as early shadow memory, thus it may
+	 * contain some garbage. Now we can clear it, since after the TLB flush
+	 * no one should write to it.
+	 */
+	memset(kasan_zero_page, 0, PAGE_SIZE);
+
+	init_task.kasan_depth = 0;
 	pr_info("KernelAddressSanitizer initialized\n");
 }

From b33c3c64c4786cd724ccde6fa97c87ada49f6a73 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <groeck@chromium.org>
Date: Thu, 4 Jan 2018 13:41:55 -0800
Subject: [PATCH 43/44] kaiser: Set _PAGE_NX only if supported

This resolves a crash if loaded under qemu + haxm under windows.
See https://www.spinics.net/lists/kernel/msg2689835.html for details.
Here is a boot log (the log is from chromeos-4.4, but Tao Wu says that
the same log is also seen with vanilla v4.4.110-rc1).

[    0.712750] Freeing unused kernel memory: 552K
[    0.721821] init: Corrupted page table at address 57b029b332e0
[    0.722761] PGD 80000000bb238067 PUD bc36a067 PMD bc369067 PTE 45d2067
[    0.722761] Bad pagetable: 000b [#1] PREEMPT SMP
[    0.722761] Modules linked in:
[    0.722761] CPU: 1 PID: 1 Comm: init Not tainted 4.4.96 #31
[    0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014
[    0.722761] task: ffff8800bc290000 ti: ffff8800bc28c000 task.ti: ffff8800bc28c000
[    0.722761] RIP: 0010:[<ffffffff83f4129e>]  [<ffffffff83f4129e>] __clear_user+0x42/0x67
[    0.722761] RSP: 0000:ffff8800bc28fcf8  EFLAGS: 00010202
[    0.722761] RAX: 0000000000000000 RBX: 00000000000001a4 RCX: 00000000000001a4
[    0.722761] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000057b029b332e0
[    0.722761] RBP: ffff8800bc28fd08 R08: ffff8800bc290000 R09: ffff8800bb2f4000
[    0.722761] R10: ffff8800bc290000 R11: ffff8800bb2f4000 R12: 000057b029b332e0
[    0.722761] R13: 0000000000000000 R14: 000057b029b33340 R15: ffff8800bb1e2a00
[    0.722761] FS:  0000000000000000(0000) GS:ffff8800bfb00000(0000) knlGS:0000000000000000
[    0.722761] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.722761] CR2: 000057b029b332e0 CR3: 00000000bb2f8000 CR4: 00000000000006e0
[    0.722761] Stack:
[    0.722761]  000057b029b332e0 ffff8800bb95fa80 ffff8800bc28fd18 ffffffff83f4120c
[    0.722761]  ffff8800bc28fe18 ffffffff83e9e7a1 ffff8800bc28fd68 0000000000000000
[    0.722761]  ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 ffff8800bc290000
[    0.722761] Call Trace:
[    0.722761]  [<ffffffff83f4120c>] clear_user+0x2e/0x30
[    0.722761]  [<ffffffff83e9e7a1>] load_elf_binary+0xa7f/0x18f7
[    0.722761]  [<ffffffff83de2088>] search_binary_handler+0x86/0x19c
[    0.722761]  [<ffffffff83de389e>] do_execveat_common.isra.26+0x909/0xf98
[    0.722761]  [<ffffffff844febe0>] ? rest_init+0x87/0x87
[    0.722761]  [<ffffffff83de40be>] do_execve+0x23/0x25
[    0.722761]  [<ffffffff83c002e3>] run_init_process+0x2b/0x2d
[    0.722761]  [<ffffffff844fec4d>] kernel_init+0x6d/0xda
[    0.722761]  [<ffffffff84505b2f>] ret_from_fork+0x3f/0x70
[    0.722761]  [<ffffffff844febe0>] ? rest_init+0x87/0x87
[    0.722761] Code: 86 84 be 12 00 00 00 e8 87 0d e8 ff 66 66 90 48 89 d8 48 c1
eb 03 4c 89 e7 83 e0 07 48 89 d9 be 08 00 00 00 31 d2 48 85 c9 74 0a <48> 89 17
48 01 f7 ff c9 75 f6 48 89 c1 85 c9 74 09 88 17 48 ff
[    0.722761] RIP  [<ffffffff83f4129e>] __clear_user+0x42/0x67
[    0.722761]  RSP <ffff8800bc28fcf8>
[    0.722761] ---[ end trace def703879b4ff090 ]---
[    0.722761] BUG: sleeping function called from invalid context at /mnt/host/source/src/third_party/kernel/v4.4/kernel/locking/rwsem.c:21
[    0.722761] in_atomic(): 0, irqs_disabled(): 1, pid: 1, name: init
[    0.722761] CPU: 1 PID: 1 Comm: init Tainted: G      D         4.4.96 #31
[    0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014
[    0.722761]  0000000000000086 dcb5d76098c89836 ffff8800bc28fa30 ffffffff83f34004
[    0.722761]  ffffffff84839dc2 0000000000000015 ffff8800bc28fa40 ffffffff83d57dc9
[    0.722761]  ffff8800bc28fa68 ffffffff83d57e6a ffffffff84a53640 0000000000000000
[    0.722761] Call Trace:
[    0.722761]  [<ffffffff83f34004>] dump_stack+0x4d/0x63
[    0.722761]  [<ffffffff83d57dc9>] ___might_sleep+0x13a/0x13c
[    0.722761]  [<ffffffff83d57e6a>] __might_sleep+0x9f/0xa6
[    0.722761]  [<ffffffff84502788>] down_read+0x20/0x31
[    0.722761]  [<ffffffff83cc5d9b>] __blocking_notifier_call_chain+0x35/0x63
[    0.722761]  [<ffffffff83cc5ddd>] blocking_notifier_call_chain+0x14/0x16
[    0.800374] usb 1-1: new full-speed USB device number 2 using uhci_hcd
[    0.722761]  [<ffffffff83cefe97>] profile_task_exit+0x1a/0x1c
[    0.802309]  [<ffffffff83cac84e>] do_exit+0x39/0xe7f
[    0.802309]  [<ffffffff83ce5938>] ? vprintk_default+0x1d/0x1f
[    0.802309]  [<ffffffff83d7bb95>] ? printk+0x57/0x73
[    0.802309]  [<ffffffff83c46e25>] oops_end+0x80/0x85
[    0.802309]  [<ffffffff83c7b747>] pgtable_bad+0x8a/0x95
[    0.802309]  [<ffffffff83ca7f4a>] __do_page_fault+0x8c/0x352
[    0.802309]  [<ffffffff83eefba5>] ? file_has_perm+0xc4/0xe5
[    0.802309]  [<ffffffff83ca821c>] do_page_fault+0xc/0xe
[    0.802309]  [<ffffffff84507682>] page_fault+0x22/0x30
[    0.802309]  [<ffffffff83f4129e>] ? __clear_user+0x42/0x67
[    0.802309]  [<ffffffff83f4127f>] ? __clear_user+0x23/0x67
[    0.802309]  [<ffffffff83f4120c>] clear_user+0x2e/0x30
[    0.802309]  [<ffffffff83e9e7a1>] load_elf_binary+0xa7f/0x18f7
[    0.802309]  [<ffffffff83de2088>] search_binary_handler+0x86/0x19c
[    0.802309]  [<ffffffff83de389e>] do_execveat_common.isra.26+0x909/0xf98
[    0.802309]  [<ffffffff844febe0>] ? rest_init+0x87/0x87
[    0.802309]  [<ffffffff83de40be>] do_execve+0x23/0x25
[    0.802309]  [<ffffffff83c002e3>] run_init_process+0x2b/0x2d
[    0.802309]  [<ffffffff844fec4d>] kernel_init+0x6d/0xda
[    0.802309]  [<ffffffff84505b2f>] ret_from_fork+0x3f/0x70
[    0.802309]  [<ffffffff844febe0>] ? rest_init+0x87/0x87
[    0.830559] Kernel panic - not syncing: Attempted to kill init!  exitcode=0x00000009
[    0.830559]
[    0.831305] Kernel Offset: 0x2c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[    0.831305] ---[ end Kernel panic - not syncing: Attempted to kill init!  exitcode=0x00000009

The crash part of this problem may be solved with the following patch
(thanks to Hugh for the hint). There is still another problem, though -
with this patch applied, the qemu session aborts with "VCPU Shutdown
request", whatever that means.

Cc: lepton <ytht.net@gmail.com>
Signed-off-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/kaiser.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index bf6a915f60e7..b0b3a69f1c7f 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -414,7 +414,8 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 			 * get out to userspace running on the kernel CR3,
 			 * userspace will crash instead of running.
 			 */
-			pgd.pgd |= _PAGE_NX;
+			if (__supported_pte_mask & _PAGE_NX)
+				pgd.pgd |= _PAGE_NX;
 		}
 	} else if (!pgd.pgd) {
 		/*

From b3e3db15b45027e3b77ec7f722e2b7210b1bf726 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 5 Jan 2018 15:44:27 +0100
Subject: [PATCH 44/44] Linux 4.4.110

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 5d67056e24dd..b028c106535b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 4
-SUBLEVEL = 109
+SUBLEVEL = 110
 EXTRAVERSION =
 NAME = Blurry Fish Butt