From 27248d2fa77f028f639f2cbac621447cc94dbbb5 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 15 Dec 2015 15:41:37 -0800
Subject: [PATCH 01/28] netlink: add a start callback for starting a netlink
 dump

commit fc9e50f5a5a4e1fa9ba2756f745a13e693cf6a06 upstream.

The start callback allows the caller to set up a context for the
dump callbacks. Presumably, the context can then be destroyed in
the done callback.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/netlink.h  |  2 ++
 include/net/genetlink.h  |  2 ++
 net/netlink/af_netlink.c |  4 ++++
 net/netlink/genetlink.c  | 16 ++++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 639e9b8b0e4d..0b41959aab9f 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -131,6 +131,7 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 struct netlink_callback {
 	struct sk_buff		*skb;
 	const struct nlmsghdr	*nlh;
+	int			(*start)(struct netlink_callback *);
 	int			(*dump)(struct sk_buff * skb,
 					struct netlink_callback *cb);
 	int			(*done)(struct netlink_callback *cb);
@@ -153,6 +154,7 @@ struct nlmsghdr *
 __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags);
 
 struct netlink_dump_control {
+	int (*start)(struct netlink_callback *);
 	int (*dump)(struct sk_buff *skb, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
 	void *data;
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 1b6b6dcb018d..43c0e771f417 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -114,6 +114,7 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net)
  * @flags: flags
  * @policy: attribute validation policy
  * @doit: standard command callback
+ * @start: start callback for dumps
  * @dumpit: callback for dumpers
  * @done: completion callback for dumps
  * @ops_list: operations list
@@ -122,6 +123,7 @@ struct genl_ops {
 	const struct nla_policy	*policy;
 	int		       (*doit)(struct sk_buff *skb,
 				       struct genl_info *info);
+	int		       (*start)(struct netlink_callback *cb);
 	int		       (*dumpit)(struct sk_buff *skb,
 					 struct netlink_callback *cb);
 	int		       (*done)(struct netlink_callback *cb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 9ecdd61c6463..a87afc4f3c91 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2203,6 +2203,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 
 	cb = &nlk->cb;
 	memset(cb, 0, sizeof(*cb));
+	cb->start = control->start;
 	cb->dump = control->dump;
 	cb->done = control->done;
 	cb->nlh = nlh;
@@ -2216,6 +2217,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 
 	mutex_unlock(nlk->cb_mutex);
 
+	if (cb->start)
+		cb->start(cb);
+
 	ret = netlink_dump(sk);
 	sock_put(sk);
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index bc0e504f33a6..8e63662c6fb0 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -513,6 +513,20 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
 }
 EXPORT_SYMBOL(genlmsg_put);
 
+static int genl_lock_start(struct netlink_callback *cb)
+{
+	/* our ops are always const - netlink API doesn't propagate that */
+	const struct genl_ops *ops = cb->data;
+	int rc = 0;
+
+	if (ops->start) {
+		genl_lock();
+		rc = ops->start(cb);
+		genl_unlock();
+	}
+	return rc;
+}
+
 static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	/* our ops are always const - netlink API doesn't propagate that */
@@ -577,6 +591,7 @@ static int genl_family_rcv_msg(struct genl_family *family,
 				.module = family->module,
 				/* we have const, but the netlink API doesn't */
 				.data = (void *)ops,
+				.start = genl_lock_start,
 				.dump = genl_lock_dumpit,
 				.done = genl_lock_done,
 			};
@@ -588,6 +603,7 @@ static int genl_family_rcv_msg(struct genl_family *family,
 		} else {
 			struct netlink_dump_control c = {
 				.module = family->module,
+				.start = ops->start,
 				.dump = ops->dumpit,
 				.done = ops->done,
 			};

From b377c453b3631531679cbe594b7d28c5ecd82cea Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 19 Oct 2017 20:51:10 +0800
Subject: [PATCH 02/28] ipsec: Fix aborted xfrm policy dump crash

commit 1137b5e2529a8f5ca8ee709288ecba3e68044df2 upstream.

An independent security researcher, Mohamed Ghannam, has reported
this vulnerability to Beyond Security's SecuriTeam Secure Disclosure
program.

The xfrm_dump_policy_done function expects xfrm_dump_policy to
have been called at least once or it will crash.  This can be
triggered if a dump fails because the target socket's receive
buffer is full.

This patch fixes it by using the cb->start mechanism to ensure that
the initialisation is always done regardless of the buffer situation.

Fixes: 12a169e7d8f4 ("ipsec: Put dumpers on the dump list")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/xfrm/xfrm_user.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 7a5a64e70b4d..76944a4839a5 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1652,32 +1652,34 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 
 static int xfrm_dump_policy_done(struct netlink_callback *cb)
 {
-	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
 	struct net *net = sock_net(cb->skb->sk);
 
 	xfrm_policy_walk_done(walk, net);
 	return 0;
 }
 
+static int xfrm_dump_policy_start(struct netlink_callback *cb)
+{
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
+
+	BUILD_BUG_ON(sizeof(*walk) > sizeof(cb->args));
+
+	xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
+	return 0;
+}
+
 static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
 	struct xfrm_dump_info info;
 
-	BUILD_BUG_ON(sizeof(struct xfrm_policy_walk) >
-		     sizeof(cb->args) - sizeof(cb->args[0]));
-
 	info.in_skb = cb->skb;
 	info.out_skb = skb;
 	info.nlmsg_seq = cb->nlh->nlmsg_seq;
 	info.nlmsg_flags = NLM_F_MULTI;
 
-	if (!cb->args[0]) {
-		cb->args[0] = 1;
-		xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
-	}
-
 	(void) xfrm_policy_walk(net, walk, dump_one_policy, &info);
 
 	return skb->len;
@@ -2415,6 +2417,7 @@ static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
 
 static const struct xfrm_link {
 	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
+	int (*start)(struct netlink_callback *);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
 	const struct nla_policy *nla_pol;
@@ -2428,6 +2431,7 @@ static const struct xfrm_link {
 	[XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
 	[XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy    },
 	[XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy,
+						   .start = xfrm_dump_policy_start,
 						   .dump = xfrm_dump_policy,
 						   .done = xfrm_dump_policy_done },
 	[XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi },
@@ -2479,6 +2483,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 		{
 			struct netlink_dump_control c = {
+				.start = link->start,
 				.dump = link->dump,
 				.done = link->done,
 			};

From 87e2bd898d3a79a8c609f183180adac47879a2a4 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Fri, 27 Nov 2015 21:09:31 +0000
Subject: [PATCH 03/28] x86/mm/pat: Ensure cpa->pfn only contains page frame
 numbers

commit edc3b9129cecd0f0857112136f5b8b1bc1d45918 upstream.

The x86 pageattr code is confused about the data that is stored
in cpa->pfn, sometimes it's treated as a page frame number,
sometimes it's treated as an unshifted physical address, and in
one place it's treated as a pte.

The result of this is that the mapping functions do not map the
intended physical address.

This isn't a problem in practice because most of the addresses
we're mapping in the EFI code paths are already mapped in
'trampoline_pgd' and so the pageattr mapping functions don't
actually do anything in this case. But when we move to using a
separate page table for the EFI runtime this will be an issue.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1448658575-17029-3-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "Ghannam, Yazen" <Yazen.Ghannam@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/pageattr.c         | 17 ++++++-----------
 arch/x86/platform/efi/efi_64.c | 16 ++++++++++------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b599a780a5a9..a0fe62e3f4a3 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -911,15 +911,10 @@ static void populate_pte(struct cpa_data *cpa,
 	pte = pte_offset_kernel(pmd, start);
 
 	while (num_pages-- && start < end) {
-
-		/* deal with the NX bit */
-		if (!(pgprot_val(pgprot) & _PAGE_NX))
-			cpa->pfn &= ~_PAGE_NX;
-
-		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
 
 		start	 += PAGE_SIZE;
-		cpa->pfn += PAGE_SIZE;
+		cpa->pfn++;
 		pte++;
 	}
 }
@@ -975,11 +970,11 @@ static int populate_pmd(struct cpa_data *cpa,
 
 		pmd = pmd_offset(pud, start);
 
-		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+		set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
 				   massage_pgprot(pmd_pgprot)));
 
 		start	  += PMD_SIZE;
-		cpa->pfn  += PMD_SIZE;
+		cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
 		cur_pages += PMD_SIZE >> PAGE_SHIFT;
 	}
 
@@ -1048,11 +1043,11 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 	 * Map everything starting from the Gb boundary, possibly with 1G pages
 	 */
 	while (end - start >= PUD_SIZE) {
-		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+		set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
 				   massage_pgprot(pud_pgprot)));
 
 		start	  += PUD_SIZE;
-		cpa->pfn  += PUD_SIZE;
+		cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
 		cur_pages += PUD_SIZE >> PAGE_SHIFT;
 		pud++;
 	}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index a0ac0f9c307f..5aa186db59e3 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -143,7 +143,7 @@ void efi_sync_low_kernel_mappings(void)
 
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 {
-	unsigned long text;
+	unsigned long pfn, text;
 	struct page *page;
 	unsigned npages;
 	pgd_t *pgd;
@@ -160,7 +160,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	 * and ident-map those pages containing the map before calling
 	 * phys_efi_set_virtual_address_map().
 	 */
-	if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
+	pfn = pa_memmap >> PAGE_SHIFT;
+	if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX)) {
 		pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
 		return 1;
 	}
@@ -185,8 +186,9 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 
 	npages = (_end - _text) >> PAGE_SHIFT;
 	text = __pa(_text);
+	pfn = text >> PAGE_SHIFT;
 
-	if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) {
+	if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, 0)) {
 		pr_err("Failed to map kernel text 1:1\n");
 		return 1;
 	}
@@ -204,12 +206,14 @@ void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 static void __init __map_region(efi_memory_desc_t *md, u64 va)
 {
 	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
-	unsigned long pf = 0;
+	unsigned long flags = 0;
+	unsigned long pfn;
 
 	if (!(md->attribute & EFI_MEMORY_WB))
-		pf |= _PAGE_PCD;
+		flags |= _PAGE_PCD;
 
-	if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
+	pfn = md->phys_addr >> PAGE_SHIFT;
+	if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
 		pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
 			   md->phys_addr, va);
 }

From b73adb60852034d84092d123b323196ca42529cd Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Fri, 27 Nov 2015 21:09:33 +0000
Subject: [PATCH 04/28] x86/efi: Hoist page table switching code into
 efi_call_virt()

commit c9f2a9a65e4855b74d92cdad688f6ee4a1a323ff upstream.

This change is a prerequisite for pending patches that switch to
a dedicated EFI page table, instead of using 'trampoline_pgd'
which shares PGD entries with 'swapper_pg_dir'. The pending
patches make it impossible to dereference the runtime service
function pointer without first switching %cr3.

It's true that we now have duplicated switching code in
efi_call_virt() and efi_call_phys_{prolog,epilog}() but we are
sacrificing code duplication for a little more clarity and the
ease of writing the page table switching code in C instead of
asm.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1448658575-17029-5-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "Ghannam, Yazen" <Yazen.Ghannam@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/efi.h          | 25 +++++++++++++++++
 arch/x86/platform/efi/efi_64.c      | 24 ++++++++--------
 arch/x86/platform/efi/efi_stub_64.S | 43 -----------------------------
 3 files changed, 36 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 0010c78c4998..347eeacb06a8 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,7 @@
 
 #include <asm/fpu/api.h>
 #include <asm/pgtable.h>
+#include <asm/tlb.h>
 
 /*
  * We map the EFI regions needed for runtime services non-contiguously,
@@ -64,6 +65,17 @@ extern u64 asmlinkage efi_call(void *fp, ...);
 
 #define efi_call_phys(f, args...)		efi_call((f), args)
 
+/*
+ * Scratch space used for switching the pagetable in the EFI stub
+ */
+struct efi_scratch {
+	u64	r15;
+	u64	prev_cr3;
+	pgd_t	*efi_pgt;
+	bool	use_pgd;
+	u64	phys_stack;
+} __packed;
+
 #define efi_call_virt(f, ...)						\
 ({									\
 	efi_status_t __s;						\
@@ -71,7 +83,20 @@ extern u64 asmlinkage efi_call(void *fp, ...);
 	efi_sync_low_kernel_mappings();					\
 	preempt_disable();						\
 	__kernel_fpu_begin();						\
+									\
+	if (efi_scratch.use_pgd) {					\
+		efi_scratch.prev_cr3 = read_cr3();			\
+		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
+		__flush_tlb_all();					\
+	}								\
+									\
 	__s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__);	\
+									\
+	if (efi_scratch.use_pgd) {					\
+		write_cr3(efi_scratch.prev_cr3);			\
+		__flush_tlb_all();					\
+	}								\
+									\
 	__kernel_fpu_end();						\
 	preempt_enable();						\
 	__s;								\
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 5aa186db59e3..be8a32416e12 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -47,16 +47,7 @@
  */
 static u64 efi_va = EFI_VA_START;
 
-/*
- * Scratch space used for switching the pagetable in the EFI stub
- */
-struct efi_scratch {
-	u64 r15;
-	u64 prev_cr3;
-	pgd_t *efi_pgt;
-	bool use_pgd;
-	u64 phys_stack;
-} __packed;
+struct efi_scratch efi_scratch;
 
 static void __init early_code_mapping_set_exec(int executable)
 {
@@ -83,8 +74,11 @@ pgd_t * __init efi_call_phys_prolog(void)
 	int pgd;
 	int n_pgds;
 
-	if (!efi_enabled(EFI_OLD_MEMMAP))
-		return NULL;
+	if (!efi_enabled(EFI_OLD_MEMMAP)) {
+		save_pgd = (pgd_t *)read_cr3();
+		write_cr3((unsigned long)efi_scratch.efi_pgt);
+		goto out;
+	}
 
 	early_code_mapping_set_exec(1);
 
@@ -96,6 +90,7 @@ pgd_t * __init efi_call_phys_prolog(void)
 		vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
 		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
 	}
+out:
 	__flush_tlb_all();
 
 	return save_pgd;
@@ -109,8 +104,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
 	int pgd_idx;
 	int nr_pgds;
 
-	if (!save_pgd)
+	if (!efi_enabled(EFI_OLD_MEMMAP)) {
+		write_cr3((unsigned long)save_pgd);
+		__flush_tlb_all();
 		return;
+	}
 
 	nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
 
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 86d0f9e08dd9..32020cb8bb08 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -38,41 +38,6 @@
 	mov %rsi, %cr0;			\
 	mov (%rsp), %rsp
 
-	/* stolen from gcc */
-	.macro FLUSH_TLB_ALL
-	movq %r15, efi_scratch(%rip)
-	movq %r14, efi_scratch+8(%rip)
-	movq %cr4, %r15
-	movq %r15, %r14
-	andb $0x7f, %r14b
-	movq %r14, %cr4
-	movq %r15, %cr4
-	movq efi_scratch+8(%rip), %r14
-	movq efi_scratch(%rip), %r15
-	.endm
-
-	.macro SWITCH_PGT
-	cmpb $0, efi_scratch+24(%rip)
-	je 1f
-	movq %r15, efi_scratch(%rip)		# r15
-	# save previous CR3
-	movq %cr3, %r15
-	movq %r15, efi_scratch+8(%rip)		# prev_cr3
-	movq efi_scratch+16(%rip), %r15		# EFI pgt
-	movq %r15, %cr3
-	1:
-	.endm
-
-	.macro RESTORE_PGT
-	cmpb $0, efi_scratch+24(%rip)
-	je 2f
-	movq efi_scratch+8(%rip), %r15
-	movq %r15, %cr3
-	movq efi_scratch(%rip), %r15
-	FLUSH_TLB_ALL
-	2:
-	.endm
-
 ENTRY(efi_call)
 	SAVE_XMM
 	mov (%rsp), %rax
@@ -83,16 +48,8 @@ ENTRY(efi_call)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
-	SWITCH_PGT
 	call *%rdi
-	RESTORE_PGT
 	addq $48, %rsp
 	RESTORE_XMM
 	ret
 ENDPROC(efi_call)
-
-	.data
-ENTRY(efi_scratch)
-	.fill 3,8,0
-	.byte 0
-	.quad 0

From 36e0f05afd4e1d09fd47936761a502aedbc50649 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Fri, 27 Nov 2015 21:09:34 +0000
Subject: [PATCH 05/28] x86/efi: Build our own page table structures

commit 67a9108ed4313b85a9c53406d80dc1ae3f8c3e36 upstream.

With commit e1a58320a38d ("x86/mm: Warn on W^X mappings") all
users booting on 64-bit UEFI machines see the following warning,

  ------------[ cut here ]------------
  WARNING: CPU: 7 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x5dc/0x780()
  x86/mm: Found insecure W+X mapping at address ffff88000005f000/0xffff88000005f000
  ...
  x86/mm: Checked W+X mappings: FAILED, 165660 W+X pages found.
  ...

This is caused by mapping EFI regions with RWX permissions.
There isn't much we can do to restrict the permissions for these
regions due to the way the firmware toolchains mix code and
data, but we can at least isolate these mappings so that they do
not appear in the regular kernel page tables.

In commit d2f7cbe7b26a ("x86/efi: Runtime services virtual
mapping") we started using 'trampoline_pgd' to map the EFI
regions because there was an existing identity mapping there
which we use during the SetVirtualAddressMap() call and for
broken firmware that accesses those addresses.

But 'trampoline_pgd' shares some PGD entries with
'swapper_pg_dir' and does not provide the isolation we require.
Notably the virtual address for __START_KERNEL_map and
MODULES_START are mapped by the same PGD entry so we need to be
more careful when copying changes over in
efi_sync_low_kernel_mappings().

This patch doesn't go the full mile, we still want to share some
PGD entries with 'swapper_pg_dir'. Having completely separate
page tables brings its own issues such as synchronising new
mappings after memory hotplug and module loading. Sharing also
keeps memory usage down.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1448658575-17029-6-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "Ghannam, Yazen" <Yazen.Ghannam@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/efi.h     |  1 +
 arch/x86/platform/efi/efi.c    | 39 +++++---------
 arch/x86/platform/efi/efi_32.c |  5 ++
 arch/x86/platform/efi/efi_64.c | 97 ++++++++++++++++++++++++++++------
 4 files changed, 102 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 347eeacb06a8..8fd9e637629a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -136,6 +136,7 @@ extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
 extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
 extern void efi_sync_low_kernel_mappings(void);
+extern int __init efi_alloc_page_tables(void);
 extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
 extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
 extern void __init old_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index ad285404ea7f..3c1f3cd7b2ba 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -869,7 +869,7 @@ static void __init kexec_enter_virtual_mode(void)
  * This function will switch the EFI runtime services to virtual mode.
  * Essentially, we look through the EFI memmap and map every region that
  * has the runtime attribute bit set in its memory descriptor into the
- * ->trampoline_pgd page table using a top-down VA allocation scheme.
+ * efi_pgd page table.
  *
  * The old method which used to update that memory descriptor with the
  * virtual address obtained from ioremap() is still supported when the
@@ -879,8 +879,8 @@ static void __init kexec_enter_virtual_mode(void)
  *
  * The new method does a pagetable switch in a preemption-safe manner
  * so that we're in a different address space when calling a runtime
- * function. For function arguments passing we do copy the PGDs of the
- * kernel page table into ->trampoline_pgd prior to each call.
+ * function. For function arguments passing we do copy the PUDs of the
+ * kernel page table into efi_pgd prior to each call.
  *
  * Specially for kexec boot, efi runtime maps in previous kernel should
  * be passed in via setup_data. In that case runtime ranges will be mapped
@@ -895,6 +895,12 @@ static void __init __efi_enter_virtual_mode(void)
 
 	efi.systab = NULL;
 
+	if (efi_alloc_page_tables()) {
+		pr_err("Failed to allocate EFI page tables\n");
+		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+		return;
+	}
+
 	efi_merge_regions();
 	new_memmap = efi_map_regions(&count, &pg_shift);
 	if (!new_memmap) {
@@ -954,28 +960,11 @@ static void __init __efi_enter_virtual_mode(void)
 	efi_runtime_mkexec();
 
 	/*
-	 * We mapped the descriptor array into the EFI pagetable above but we're
-	 * not unmapping it here. Here's why:
-	 *
-	 * We're copying select PGDs from the kernel page table to the EFI page
-	 * table and when we do so and make changes to those PGDs like unmapping
-	 * stuff from them, those changes appear in the kernel page table and we
-	 * go boom.
-	 *
-	 * From setup_real_mode():
-	 *
-	 * ...
-	 * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
-	 *
-	 * In this particular case, our allocation is in PGD 0 of the EFI page
-	 * table but we've copied that PGD from PGD[272] of the EFI page table:
-	 *
-	 *	pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272
-	 *
-	 * where the direct memory mapping in kernel space is.
-	 *
-	 * new_memmap's VA comes from that direct mapping and thus clearing it,
-	 * it would get cleared in the kernel page table too.
+	 * We mapped the descriptor array into the EFI pagetable above
+	 * but we're not unmapping it here because if we're running in
+	 * EFI mixed mode we need all of memory to be accessible when
+	 * we pass parameters to the EFI runtime services in the
+	 * thunking code.
 	 *
 	 * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
 	 */
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index ed5b67338294..58d669bc8250 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -38,6 +38,11 @@
  * say 0 - 3G.
  */
 
+int __init efi_alloc_page_tables(void)
+{
+	return 0;
+}
+
 void efi_sync_low_kernel_mappings(void) {}
 void __init efi_dump_pagetable(void) {}
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index be8a32416e12..18dfaad71c99 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -40,6 +40,7 @@
 #include <asm/fixmap.h>
 #include <asm/realmode.h>
 #include <asm/time.h>
+#include <asm/pgalloc.h>
 
 /*
  * We allocate runtime services regions bottom-up, starting from -4G, i.e.
@@ -121,22 +122,92 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
 	early_code_mapping_set_exec(0);
 }
 
+static pgd_t *efi_pgd;
+
+/*
+ * We need our own copy of the higher levels of the page tables
+ * because we want to avoid inserting EFI region mappings (EFI_VA_END
+ * to EFI_VA_START) into the standard kernel page tables. Everything
+ * else can be shared, see efi_sync_low_kernel_mappings().
+ */
+int __init efi_alloc_page_tables(void)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	gfp_t gfp_mask;
+
+	if (efi_enabled(EFI_OLD_MEMMAP))
+		return 0;
+
+	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO;
+	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+	if (!efi_pgd)
+		return -ENOMEM;
+
+	pgd = efi_pgd + pgd_index(EFI_VA_END);
+
+	pud = pud_alloc_one(NULL, 0);
+	if (!pud) {
+		free_page((unsigned long)efi_pgd);
+		return -ENOMEM;
+	}
+
+	pgd_populate(NULL, pgd, pud);
+
+	return 0;
+}
+
 /*
  * Add low kernel mappings for passing arguments to EFI functions.
  */
 void efi_sync_low_kernel_mappings(void)
 {
-	unsigned num_pgds;
-	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+	unsigned num_entries;
+	pgd_t *pgd_k, *pgd_efi;
+	pud_t *pud_k, *pud_efi;
 
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return;
 
-	num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
+	/*
+	 * We can share all PGD entries apart from the one entry that
+	 * covers the EFI runtime mapping space.
+	 *
+	 * Make sure the EFI runtime region mappings are guaranteed to
+	 * only span a single PGD entry and that the entry also maps
+	 * other important kernel regions.
+	 */
+	BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
+	BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
+			(EFI_VA_END & PGDIR_MASK));
 
-	memcpy(pgd + pgd_index(PAGE_OFFSET),
-		init_mm.pgd + pgd_index(PAGE_OFFSET),
-		sizeof(pgd_t) * num_pgds);
+	pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
+	pgd_k = pgd_offset_k(PAGE_OFFSET);
+
+	num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
+	memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
+
+	/*
+	 * We share all the PUD entries apart from those that map the
+	 * EFI regions. Copy around them.
+	 */
+	BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
+	BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
+
+	pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
+	pud_efi = pud_offset(pgd_efi, 0);
+
+	pgd_k = pgd_offset_k(EFI_VA_END);
+	pud_k = pud_offset(pgd_k, 0);
+
+	num_entries = pud_index(EFI_VA_END);
+	memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
+
+	pud_efi = pud_offset(pgd_efi, EFI_VA_START);
+	pud_k = pud_offset(pgd_k, EFI_VA_START);
+
+	num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
+	memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
 }
 
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
@@ -149,8 +220,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
-	efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
-	pgd = __va(efi_scratch.efi_pgt);
+	efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+	pgd = efi_pgd;
 
 	/*
 	 * It can happen that the physical address of new_memmap lands in memory
@@ -196,16 +267,14 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 
 void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 {
-	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
-
-	kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
+	kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages);
 }
 
 static void __init __map_region(efi_memory_desc_t *md, u64 va)
 {
-	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
 	unsigned long flags = 0;
 	unsigned long pfn;
+	pgd_t *pgd = efi_pgd;
 
 	if (!(md->attribute & EFI_MEMORY_WB))
 		flags |= _PAGE_PCD;
@@ -314,9 +383,7 @@ void __init efi_runtime_mkexec(void)
 void __init efi_dump_pagetable(void)
 {
 #ifdef CONFIG_EFI_PGT_DUMP
-	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
-
-	ptdump_walk_pgd_level(NULL, pgd);
+	ptdump_walk_pgd_level(NULL, efi_pgd);
 #endif
 }
 

From 44ff3af83ee1ebca7b1fc35c8471880628d31dbb Mon Sep 17 00:00:00 2001
From: Adam Ford <aford173@gmail.com>
Date: Thu, 17 Aug 2017 06:01:28 -0500
Subject: [PATCH 06/28] ARM: dts: omap3: logicpd-torpedo-37xx-devkit: Fix MMC1
 cd-gpio

commit b7ace5ed8867ca54503727988adec6b20af54eeb upstream.

Fixes commit 687c27676151 ("ARM: dts: Add minimal support for LogicPD
Torpedo DM3730 devkit")

This patch corrects an issue where the cd-gpios was improperly setup
using IRQ_TYPE_LEVEL_LOW instead of GPIO_ACTIVE_LOW.

Signed-off-by: Adam Ford <aford173@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm/boot/dts/logicpd-torpedo-37xx-devkit.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/logicpd-torpedo-37xx-devkit.dts b/arch/arm/boot/dts/logicpd-torpedo-37xx-devkit.dts
index 5b0430041ec6..fec92cd36ae3 100644
--- a/arch/arm/boot/dts/logicpd-torpedo-37xx-devkit.dts
+++ b/arch/arm/boot/dts/logicpd-torpedo-37xx-devkit.dts
@@ -88,7 +88,7 @@
 	interrupts-extended = <&intc 83 &omap3_pmx_core 0x11a>;
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc1_pins &mmc1_cd>;
-	cd-gpios = <&gpio4 31 IRQ_TYPE_LEVEL_LOW>;		/* gpio127 */
+	cd-gpios = <&gpio4 31 GPIO_ACTIVE_LOW>;		/* gpio127 */
 	vmmc-supply = <&vmmc1>;
 	bus-width = <4>;
 	cap-power-off-card;

From e85c6907b2b46b516950a801ea0648fec8f59ba9 Mon Sep 17 00:00:00 2001
From: Sai Praneeth <sai.praneeth.prakhya@intel.com>
Date: Wed, 9 Dec 2015 15:41:08 -0800
Subject: [PATCH 07/28] x86/efi-bgrt: Fix kernel panic when mapping BGRT data

commit 50a0cb565246f20d59cdb161778531e4b19d35ac upstream.

Starting with this commit 35eb8b81edd4 ("x86/efi: Build our own page
table structures") efi regions have a separate page directory called
"efi_pgd". In order to access any efi region we have to first shift %cr3
to this page table. In the bgrt code we are trying to copy bgrt_header
and image, but these regions fall under "EFI_BOOT_SERVICES_DATA"
and to access these regions we have to shift %cr3 to efi_pgd and not
doing so will cause page fault as shown below.

[    0.251599] Last level dTLB entries: 4KB 64, 2MB 0, 4MB 0, 1GB 4
[    0.259126] Freeing SMP alternatives memory: 32K (ffffffff8230e000 - ffffffff82316000)
[    0.271803] BUG: unable to handle kernel paging request at fffffffefce35002
[    0.279740] IP: [<ffffffff821bca49>] efi_bgrt_init+0x144/0x1fd
[    0.286383] PGD 300f067 PUD 0
[    0.289879] Oops: 0000 [#1] SMP
[    0.293566] Modules linked in:
[    0.297039] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-rc1-eywa-eywa-built-in-47041+ #2
[    0.306619] Hardware name: Intel Corporation Skylake Client platform/Skylake Y LPDDR3 RVP3, BIOS SKLSE2R1.R00.B104.B01.1511110114 11/11/2015
[    0.320925] task: ffffffff820134c0 ti: ffffffff82000000 task.ti: ffffffff82000000
[    0.329420] RIP: 0010:[<ffffffff821bca49>]  [<ffffffff821bca49>] efi_bgrt_init+0x144/0x1fd
[    0.338821] RSP: 0000:ffffffff82003f18  EFLAGS: 00010246
[    0.344852] RAX: fffffffefce35000 RBX: fffffffefce35000 RCX: fffffffefce2b000
[    0.352952] RDX: 000000008a82b000 RSI: ffffffff8235bb80 RDI: 000000008a835000
[    0.361050] RBP: ffffffff82003f30 R08: 000000008a865000 R09: ffffffffff202850
[    0.369149] R10: ffffffff811ad62f R11: 0000000000000000 R12: 0000000000000000
[    0.377248] R13: ffff88016dbaea40 R14: ffffffff822622c0 R15: ffffffff82003fb0
[    0.385348] FS:  0000000000000000(0000) GS:ffff88016d800000(0000) knlGS:0000000000000000
[    0.394533] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    0.401054] CR2: fffffffefce35002 CR3: 000000000300c000 CR4: 00000000003406f0
[    0.409153] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    0.417252] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    0.425350] Stack:
[    0.427638]  ffffffffffffffff ffffffff82256900 ffff88016dbaea40 ffffffff82003f40
[    0.436086]  ffffffff821bbce0 ffffffff82003f88 ffffffff8219c0c2 0000000000000000
[    0.444533]  ffffffff8219ba4a ffffffff822622c0 0000000000083000 00000000ffffffff
[    0.452978] Call Trace:
[    0.455763]  [<ffffffff821bbce0>] efi_late_init+0x9/0xb
[    0.461697]  [<ffffffff8219c0c2>] start_kernel+0x463/0x47f
[    0.467928]  [<ffffffff8219ba4a>] ? set_init_arg+0x55/0x55
[    0.474159]  [<ffffffff8219b120>] ? early_idt_handler_array+0x120/0x120
[    0.481669]  [<ffffffff8219b5ee>] x86_64_start_reservations+0x2a/0x2c
[    0.488982]  [<ffffffff8219b72d>] x86_64_start_kernel+0x13d/0x14c
[    0.495897] Code: 00 41 b4 01 48 8b 78 28 e8 09 36 01 00 48 85 c0 48 89 c3 75 13 48 c7 c7 f8 ac d3 81 31 c0 e8 d7 3b fb fe e9 b5 00 00 00 45 84 e4 <44> 8b 6b 02 74 0d be 06 00 00 00 48 89 df e8 ae 34 0$
[    0.518151] RIP  [<ffffffff821bca49>] efi_bgrt_init+0x144/0x1fd
[    0.524888]  RSP <ffffffff82003f18>
[    0.528851] CR2: fffffffefce35002
[    0.532615] ---[ end trace 7b06521e6ebf2aea ]---
[    0.537852] Kernel panic - not syncing: Attempted to kill the idle task!

As said above one way to fix this bug is to shift %cr3 to efi_pgd but we
are not doing that way because it leaks inner details of how we switch
to EFI page tables into a new call site and it also adds duplicate code.
Instead, we remove the call to efi_lookup_mapped_addr() and always
perform early_mem*() instead of early_io*() because we want to remap RAM
regions and not I/O regions. We also delete efi_lookup_mapped_addr()
because we are no longer using it.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Reported-by: Wendy Wang <wendy.wang@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Ricardo Neri <ricardo.neri@intel.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: "Ghannam, Yazen" <Yazen.Ghannam@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/platform/efi/efi-bgrt.c | 39 ++++++++++++--------------------
 drivers/firmware/efi/efi.c       | 32 --------------------------
 2 files changed, 14 insertions(+), 57 deletions(-)

diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index ea48449b2e63..81c5bcd06b26 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -28,8 +28,7 @@ struct bmp_header {
 void __init efi_bgrt_init(void)
 {
 	acpi_status status;
-	void __iomem *image;
-	bool ioremapped = false;
+	void *image;
 	struct bmp_header bmp_header;
 
 	if (acpi_disabled)
@@ -70,20 +69,14 @@ void __init efi_bgrt_init(void)
 		return;
 	}
 
-	image = efi_lookup_mapped_addr(bgrt_tab->image_address);
+	image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header));
 	if (!image) {
-		image = early_ioremap(bgrt_tab->image_address,
-				       sizeof(bmp_header));
-		ioremapped = true;
-		if (!image) {
-			pr_err("Ignoring BGRT: failed to map image header memory\n");
-			return;
-		}
+		pr_err("Ignoring BGRT: failed to map image header memory\n");
+		return;
 	}
 
-	memcpy_fromio(&bmp_header, image, sizeof(bmp_header));
-	if (ioremapped)
-		early_iounmap(image, sizeof(bmp_header));
+	memcpy(&bmp_header, image, sizeof(bmp_header));
+	early_memunmap(image, sizeof(bmp_header));
 	bgrt_image_size = bmp_header.size;
 
 	bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
@@ -93,18 +86,14 @@ void __init efi_bgrt_init(void)
 		return;
 	}
 
-	if (ioremapped) {
-		image = early_ioremap(bgrt_tab->image_address,
-				       bmp_header.size);
-		if (!image) {
-			pr_err("Ignoring BGRT: failed to map image memory\n");
-			kfree(bgrt_image);
-			bgrt_image = NULL;
-			return;
-		}
+	image = early_memremap(bgrt_tab->image_address, bmp_header.size);
+	if (!image) {
+		pr_err("Ignoring BGRT: failed to map image memory\n");
+		kfree(bgrt_image);
+		bgrt_image = NULL;
+		return;
 	}
 
-	memcpy_fromio(bgrt_image, image, bgrt_image_size);
-	if (ioremapped)
-		early_iounmap(image, bmp_header.size);
+	memcpy(bgrt_image, image, bgrt_image_size);
+	early_memunmap(image, bmp_header.size);
 }
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 3b52677f459a..0cd8f039602e 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -325,38 +325,6 @@ u64 __init efi_mem_desc_end(efi_memory_desc_t *md)
 	return end;
 }
 
-/*
- * We can't ioremap data in EFI boot services RAM, because we've already mapped
- * it as RAM.  So, look it up in the existing EFI memory map instead.  Only
- * callable after efi_enter_virtual_mode and before efi_free_boot_services.
- */
-void __iomem *efi_lookup_mapped_addr(u64 phys_addr)
-{
-	struct efi_memory_map *map;
-	void *p;
-	map = efi.memmap;
-	if (!map)
-		return NULL;
-	if (WARN_ON(!map->map))
-		return NULL;
-	for (p = map->map; p < map->map_end; p += map->desc_size) {
-		efi_memory_desc_t *md = p;
-		u64 size = md->num_pages << EFI_PAGE_SHIFT;
-		u64 end = md->phys_addr + size;
-		if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
-		    md->type != EFI_BOOT_SERVICES_CODE &&
-		    md->type != EFI_BOOT_SERVICES_DATA)
-			continue;
-		if (!md->virt_addr)
-			continue;
-		if (phys_addr >= md->phys_addr && phys_addr < end) {
-			phys_addr += md->virt_addr - md->phys_addr;
-			return (__force void __iomem *)(unsigned long)phys_addr;
-		}
-	}
-	return NULL;
-}
-
 static __initdata efi_config_table_type_t common_tables[] = {
 	{ACPI_20_TABLE_GUID, "ACPI 2.0", &efi.acpi20},
 	{ACPI_TABLE_GUID, "ACPI", &efi.acpi},

From f97fc9ab1ce23a8666cfe460264a6a45b7e0b335 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Mon, 21 Dec 2015 14:12:52 +0000
Subject: [PATCH 08/28] x86/efi-bgrt: Replace early_memremap() with memremap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit e2c90dd7e11e3025b46719a79fb4bb1e7a5cef9f upstream.

Môshe reported the following warning triggered on his machine since
commit 50a0cb565246 ("x86/efi-bgrt: Fix kernel panic when mapping BGRT
data"),

  [    0.026936] ------------[ cut here ]------------
  [    0.026941] WARNING: CPU: 0 PID: 0 at mm/early_ioremap.c:137 __early_ioremap+0x102/0x1bb()
  [    0.026941] Modules linked in:
  [    0.026944] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-rc1 #2
  [    0.026945] Hardware name: Dell Inc. XPS 13 9343/09K8G1, BIOS A05 07/14/2015
  [    0.026946]  0000000000000000 900f03d5a116524d ffffffff81c03e60 ffffffff813a3fff
  [    0.026948]  0000000000000000 ffffffff81c03e98 ffffffff810a0852 00000000d7b76000
  [    0.026949]  0000000000000000 0000000000000001 0000000000000001 000000000000017c
  [    0.026951] Call Trace:
  [    0.026955]  [<ffffffff813a3fff>] dump_stack+0x44/0x55
  [    0.026958]  [<ffffffff810a0852>] warn_slowpath_common+0x82/0xc0
  [    0.026959]  [<ffffffff810a099a>] warn_slowpath_null+0x1a/0x20
  [    0.026961]  [<ffffffff81d8c395>] __early_ioremap+0x102/0x1bb
  [    0.026962]  [<ffffffff81d8c602>] early_memremap+0x13/0x15
  [    0.026964]  [<ffffffff81d78361>] efi_bgrt_init+0x162/0x1ad
  [    0.026966]  [<ffffffff81d778ec>] efi_late_init+0x9/0xb
  [    0.026968]  [<ffffffff81d58ff5>] start_kernel+0x46f/0x49f
  [    0.026970]  [<ffffffff81d58120>] ? early_idt_handler_array+0x120/0x120
  [    0.026972]  [<ffffffff81d58339>] x86_64_start_reservations+0x2a/0x2c
  [    0.026974]  [<ffffffff81d58485>] x86_64_start_kernel+0x14a/0x16d
  [    0.026977] ---[ end trace f9b3812eb8e24c58 ]---
  [    0.026978] efi_bgrt: Ignoring BGRT: failed to map image memory

early_memremap() has an upper limit on the size of mapping it can
handle which is ~200KB. Clearly the BGRT image on Môshe's machine is
much larger than that.

There's actually no reason to restrict ourselves to using the early_*
version of memremap() - the ACPI BGRT driver is invoked late enough in
boot that we can use the standard version, with the benefit that the
late version allows mappings of arbitrary size.

Reported-by: Môshe van der Sterre <me@moshe.nl>
Tested-by: Môshe van der Sterre <me@moshe.nl>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1450707172-12561-1-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "Ghannam, Yazen" <Yazen.Ghannam@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/platform/efi/efi-bgrt.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 81c5bcd06b26..64fbc7e33226 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -69,14 +69,14 @@ void __init efi_bgrt_init(void)
 		return;
 	}
 
-	image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header));
+	image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB);
 	if (!image) {
 		pr_err("Ignoring BGRT: failed to map image header memory\n");
 		return;
 	}
 
 	memcpy(&bmp_header, image, sizeof(bmp_header));
-	early_memunmap(image, sizeof(bmp_header));
+	memunmap(image);
 	bgrt_image_size = bmp_header.size;
 
 	bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
@@ -86,7 +86,7 @@ void __init efi_bgrt_init(void)
 		return;
 	}
 
-	image = early_memremap(bgrt_tab->image_address, bmp_header.size);
+	image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
 	if (!image) {
 		pr_err("Ignoring BGRT: failed to map image memory\n");
 		kfree(bgrt_image);
@@ -95,5 +95,5 @@ void __init efi_bgrt_init(void)
 	}
 
 	memcpy(bgrt_image, image, bgrt_image_size);
-	early_memunmap(image, bmp_header.size);
+	memunmap(image);
 }

From 2b7ef6bdd28610f2907cd766362cf95d0d023801 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 27 Nov 2017 06:21:25 +0300
Subject: [PATCH 09/28] mm, thp: Do not make page table dirty unconditionally
 in touch_p[mu]d()

commit a8f97366452ed491d13cf1e44241bc0b5740b1f0 upstream.

Currently, we unconditionally make page table dirty in touch_pmd().
It may result in false-positive can_follow_write_pmd().

We may avoid the situation, if we would only make the page table entry
dirty if caller asks for write access -- FOLL_WRITE.

The patch also changes touch_pud() in the same way.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[Salvatore Bonaccorso: backport for 3.16:
 - Adjust context
 - Drop specific part for PUD-sized transparent hugepages. Support
   for PUD-sized transparent hugepages was added in v4.11-rc1
]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/huge_memory.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6c6f5ccfcda1..8f3769ec8575 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1304,17 +1304,11 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	if (flags & FOLL_TOUCH) {
 		pmd_t _pmd;
-		/*
-		 * We should set the dirty bit only for FOLL_WRITE but
-		 * for now the dirty bit in the pmd is meaningless.
-		 * And if the dirty bit will become meaningful and
-		 * we'll only set it with FOLL_WRITE, an atomic
-		 * set_bit will be required on the pmd to set the
-		 * young bit, instead of the current set_pmd_at.
-		 */
-		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+		_pmd = pmd_mkyoung(*pmd);
+		if (flags & FOLL_WRITE)
+			_pmd = pmd_mkdirty(_pmd);
 		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-					  pmd, _pmd,  1))
+					  pmd, _pmd, flags & FOLL_WRITE))
 			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {

From 0d05a5593f6309cb0df0df474a807a3220855703 Mon Sep 17 00:00:00 2001
From: chenjie <chenjie6@huawei.com>
Date: Wed, 29 Nov 2017 16:10:54 -0800
Subject: [PATCH 10/28] mm/madvise.c: fix madvise() infinite loop under special
 circumstances

commit 6ea8d958a2c95a1d514015d4e29ba21a8c0a1a91 upstream.

MADVISE_WILLNEED has always been a noop for DAX (formerly XIP) mappings.
Unfortunately madvise_willneed() doesn't communicate this information
properly to the generic madvise syscall implementation.  The calling
convention is quite subtle there.  madvise_vma() is supposed to either
return an error or update &prev otherwise the main loop will never
advance to the next vma and it will keep looping for ever without a way
to get out of the kernel.

It seems this has been broken since introduction.  Nobody has noticed
because nobody seems to be using MADVISE_WILLNEED on these DAX mappings.

[mhocko@suse.com: rewrite changelog]
Link: http://lkml.kernel.org/r/20171127115318.911-1-guoxuenan@huawei.com
Fixes: fe77ba6f4f97 ("[PATCH] xip: madvice/fadvice: execute in place")
Signed-off-by: chenjie <chenjie6@huawei.com>
Signed-off-by: guoxuenan <guoxuenan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: zhangyi (F) <yi.zhang@huawei.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/madvise.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcbb530e..2a0f9a4504f1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -223,15 +223,14 @@ static long madvise_willneed(struct vm_area_struct *vma,
 {
 	struct file *file = vma->vm_file;
 
+	*prev = vma;
 #ifdef CONFIG_SWAP
 	if (!file) {
-		*prev = vma;
 		force_swapin_readahead(vma, start, end);
 		return 0;
 	}
 
 	if (shmem_mapping(file->f_mapping)) {
-		*prev = vma;
 		force_shm_swapin_readahead(vma, start, end,
 					file->f_mapping);
 		return 0;
@@ -246,7 +245,6 @@ static long madvise_willneed(struct vm_area_struct *vma,
 		return 0;
 	}
 
-	*prev = vma;
 	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 	if (end > vma->vm_end)
 		end = vma->vm_end;

From 591ddc92d756b49a95060b0203fe329a31f18c3e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 17 Nov 2017 14:50:46 -0500
Subject: [PATCH 11/28] btrfs: clear space cache inode generation always

commit 8e138e0d92c6c9d3d481674fb14e3439b495be37 upstream.

We discovered a box that had double allocations, and suspected the space
cache may be to blame.  While auditing the write out path I noticed that
if we've already setup the space cache we will just carry on.  This
means that any error we hit after cache_save_setup before we go to
actually write the cache out we won't reset the inode generation, so
whatever was already written will be considered correct, except it'll be
stale.  Fix this by _always_ resetting the generation on the block group
inode, this way we only ever have valid or invalid cache.

With this patch I was no longer able to reproduce cache corruption with
dm-log-writes and my bpf error injection tool.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c36a03fa7678..260f94b019c9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3361,13 +3361,6 @@ again:
 		goto again;
 	}
 
-	/* We've already setup this transaction, go ahead and exit */
-	if (block_group->cache_generation == trans->transid &&
-	    i_size_read(inode)) {
-		dcs = BTRFS_DC_SETUP;
-		goto out_put;
-	}
-
 	/*
 	 * We want to set the generation to 0, that way if anything goes wrong
 	 * from here on out we know not to trust this cache when we load up next
@@ -3391,6 +3384,13 @@ again:
 	}
 	WARN_ON(ret);
 
+	/* We've already setup this transaction, go ahead and exit */
+	if (block_group->cache_generation == trans->transid &&
+	    i_size_read(inode)) {
+		dcs = BTRFS_DC_SETUP;
+		goto out_put;
+	}
+
 	if (i_size_read(inode) > 0) {
 		ret = btrfs_check_trunc_cache_free_space(root,
 					&root->fs_info->global_block_rsv);

From ab29b6b818aa1cc8c36fc510464c732d392f5df2 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 5 Nov 2017 16:11:30 +0200
Subject: [PATCH 12/28] KVM: x86: pvclock: Handle first-time write to
 pvclock-page contains random junk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 51c4b8bba674cfd2260d173602c4dac08e4c3a99 upstream.

When guest passes KVM it's pvclock-page GPA via WRMSR to
MSR_KVM_SYSTEM_TIME / MSR_KVM_SYSTEM_TIME_NEW, KVM don't initialize
pvclock-page to some start-values. It just requests a clock-update which
will happen before entering to guest.

The clock-update logic will call kvm_setup_pvclock_page() to update the
pvclock-page with info. However, kvm_setup_pvclock_page() *wrongly*
assumes that the version-field is initialized to an even number. This is
wrong because at first-time write, field could be any-value.

Fix simply makes sure that if first-time version-field is odd, increment
it once more to make it even and only then start standard logic.
This follows same logic as done in other pvclock shared-pages (See
kvm_write_wall_clock() and record_steal_time()).

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3ffd5900da5b..6a624989b13e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1812,6 +1812,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 */
 	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
 
+	if (guest_hv_clock.version & 1)
+		++guest_hv_clock.version;  /* first time write, random junk */
+
 	vcpu->hv_clock.version = guest_hv_clock.version + 1;
 	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 				&vcpu->hv_clock,

From 1e9e6bdccb80d9dca2a9dfd6cf2d34ce8f226d47 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 5 Nov 2017 16:56:32 +0200
Subject: [PATCH 13/28] KVM: x86: Exit to user-mode on #UD intercept when
 emulator requires
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 61cb57c9ed631c95b54f8e9090c89d18b3695b3c upstream.

Instruction emulation after trapping a #UD exception can result in an
MMIO access, for example when emulating a MOVBE on a processor that
doesn't support the instruction.  In this case, the #UD vmexit handler
must exit to user mode, but there wasn't any code to do so.  Add it for
both VMX and SVM.

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kvm/svm.c | 2 ++
 arch/x86/kvm/vmx.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4e1b254c3695..4b1152e57340 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1696,6 +1696,8 @@ static int ud_interception(struct vcpu_svm *svm)
 	int er;
 
 	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
+	if (er == EMULATE_USER_EXIT)
+		return 0;
 	if (er != EMULATE_DONE)
 		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 	return 1;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 67ba0d8f87c7..253a8c8207bb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5267,6 +5267,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 			return 1;
 		}
 		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
+		if (er == EMULATE_USER_EXIT)
+			return 0;
 		if (er != EMULATE_DONE)
 			kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;

From a6493ad6fc893431705637d7efa3035235d9e135 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 10 Nov 2017 10:49:38 +0100
Subject: [PATCH 14/28] KVM: x86: inject exceptions produced by x86_decode_insn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 6ea6e84309ca7e0e850b3083e6b09344ee15c290 upstream.

Sometimes, a processor might execute an instruction while another
processor is updating the page tables for that instruction's code page,
but before the TLB shootdown completes.  The interesting case happens
if the page is in the TLB.

In general, the processor will succeed in executing the instruction and
nothing bad happens.  However, what if the instruction is an MMIO access?
If *that* happens, KVM invokes the emulator, and the emulator gets the
updated page tables.  If the update side had marked the code page as non
present, the page table walk then will fail and so will x86_decode_insn.

Unfortunately, even though kvm_fetch_guest_virt is correctly returning
X86EMUL_PROPAGATE_FAULT, x86_decode_insn's caller treats the failure as
a fatal error if the instruction cannot simply be reexecuted (as is the
case for MMIO).  And this in fact happened sometimes when rebooting
Windows 2012r2 guests.  Just checking ctxt->have_exception and injecting
the exception if true is enough to fix the case.

Thanks to Eduardo Habkost for helping in the debugging of this issue.

Reported-by: Yanan Fu <yfu@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6a624989b13e..df81717a92f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5429,6 +5429,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
 						emulation_type))
 				return EMULATE_DONE;
+			if (ctxt->have_exception && inject_emulated_exception(vcpu))
+				return EMULATE_DONE;
 			if (emulation_type & EMULTYPE_SKIP)
 				return EMULATE_FAIL;
 			return handle_emulation_failure(vcpu);

From 5c65b739389fbc353fb42d379e9b7379cfe6d3f6 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 21 Nov 2017 15:42:29 +0200
Subject: [PATCH 15/28] mmc: core: Do not leave the block driver in a suspended
 state

commit ebe7dd45cf49e3b49cacbaace17f9f878f21fbea upstream.

The block driver must be resumed if the mmc bus fails to suspend the card.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mmc/core/bus.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 972ff844cf5a..cf7c7bc1e940 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -155,6 +155,9 @@ static int mmc_bus_suspend(struct device *dev)
 		return ret;
 
 	ret = host->bus_ops->suspend(host);
+	if (ret)
+		pm_generic_resume(dev);
+
 	return ret;
 }
 

From 6b4901e0e3495f1ffc846f55f704bd19049579e2 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 24 Nov 2017 07:47:50 +0100
Subject: [PATCH 16/28] eeprom: at24: check at24_read/write arguments

commit d9bcd462daf34aebb8de9ad7f76de0198bb5a0f0 upstream.

So far we completely rely on the caller to provide valid arguments.
To be on the safe side perform an own sanity check.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/eeprom/at24.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index 5d7c0900fa1b..f112c5bc082a 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c
@@ -257,6 +257,9 @@ static ssize_t at24_read(struct at24_data *at24,
 	if (unlikely(!count))
 		return count;
 
+	if (off + count > at24->chip.byte_len)
+		return -EINVAL;
+
 	/*
 	 * Read data from chip, protecting against concurrent updates
 	 * from this host, but not from other I2C masters.
@@ -311,6 +314,9 @@ static ssize_t at24_eeprom_write(struct at24_data *at24, const char *buf,
 	unsigned long timeout, write_time;
 	unsigned next_page;
 
+	if (offset + count > at24->chip.byte_len)
+		return -EINVAL;
+
 	/* Get corresponding I2C address and adjust offset */
 	client = at24_translate_offset(at24, &offset);
 

From 5a7391b6d8983a2d36b0df15349caef10624e241 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhc@lemote.com>
Date: Fri, 24 Nov 2017 15:14:25 -0800
Subject: [PATCH 17/28] bcache: Fix building error on MIPS

commit cf33c1ee5254c6a430bc1538232b49c3ea13e613 upstream.

This patch try to fix the building error on MIPS. The reason is MIPS
has already defined the PTR macro, which conflicts with the PTR macro
in include/uapi/linux/bcache.h.

[fixed by mlyle: corrected a line-length issue]

Signed-off-by: Huacai Chen <chenhc@lemote.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/md/bcache/alloc.c   | 2 +-
 drivers/md/bcache/extents.c | 2 +-
 drivers/md/bcache/journal.c | 2 +-
 include/uapi/linux/bcache.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ea47980949ef..4d46f2ce606f 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -479,7 +479,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
 		if (b == -1)
 			goto err;
 
-		k->ptr[i] = PTR(ca->buckets[b].gen,
+		k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
 				bucket_to_sector(c, b),
 				ca->sb.nr_this_dev);
 
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 243de0bf15cd..4bf15182c4da 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -584,7 +584,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey
 		return false;
 
 	for (i = 0; i < KEY_PTRS(l); i++)
-		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+		if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
 		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
 			return false;
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 29eba7219b01..6ed066a0e7c0 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -508,7 +508,7 @@ static void journal_reclaim(struct cache_set *c)
 			continue;
 
 		ja->cur_idx = next;
-		k->ptr[n++] = PTR(0,
+		k->ptr[n++] = MAKE_PTR(0,
 				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
 				  ca->sb.nr_this_dev);
 	}
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 22b6ad31c706..8562b1cb776b 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -90,7 +90,7 @@ PTR_FIELD(PTR_GEN,			0,  8)
 
 #define PTR_CHECK_DEV			((1 << PTR_DEV_BITS) - 1)
 
-#define PTR(gen, offset, dev)						\
+#define MAKE_PTR(gen, offset, dev)					\
 	((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
 
 /* Bkey utility code */

From 4d027a8bcc7febabe908d4ca6049a8af21a8fd7c Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 14 Nov 2017 17:19:29 -0500
Subject: [PATCH 18/28] Revert "drm/radeon: dont switch vt on suspend"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 18c437caa5b18a235dd65cec224eab54bebcee65 upstream.

Fixes distorted colors on some cards on resume from suspend.

This reverts commit b9729b17a414f99c61f4db9ac9f9ed987fa0cbfe.

Bug: https://bugs.freedesktop.org/show_bug.cgi?id=98832
Bug: https://bugs.freedesktop.org/show_bug.cgi?id=99163
Bug: https://bugzilla.kernel.org/show_bug.cgi?id=107001
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/radeon/radeon_fb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c
index 26da2f4d7b4f..a2937a693591 100644
--- a/drivers/gpu/drm/radeon/radeon_fb.c
+++ b/drivers/gpu/drm/radeon/radeon_fb.c
@@ -226,7 +226,6 @@ static int radeonfb_create(struct drm_fb_helper *helper,
 	}
 
 	info->par = rfbdev;
-	info->skip_vt_switch = true;
 
 	ret = radeon_framebuffer_init(rdev->ddev, &rfbdev->rfb, &mode_cmd, gobj);
 	if (ret) {

From c7716f65721a851cd925ee911f791e1ae92b398d Mon Sep 17 00:00:00 2001
From: Roman Kapl <rka@sysgo.com>
Date: Mon, 30 Oct 2017 11:56:13 +0100
Subject: [PATCH 19/28] drm/radeon: fix atombios on big endian

commit 4f626a4ac8f57ddabf06d03870adab91e463217f upstream.

The function for byteswapping the data send to/from atombios was buggy for
num_bytes not divisible by four. The function must be aware of the fact
that after byte-swapping the u32 units, valid bytes might end up after the
num_bytes boundary.

This patch was tested on kernel 3.12 and allowed us to sucesfully use
DisplayPort on and Radeon SI card. Namely it fixed the link training and
EDID readout.

The function is patched both in radeon and amd drivers, since the functions
and the fixes are identical.

Signed-off-by: Roman Kapl <rka@sysgo.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 38 ++++++++++----------
 drivers/gpu/drm/radeon/atombios_dp.c         | 38 ++++++++++----------
 2 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index f4cae5357e40..3e90ddcbb24a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
@@ -1575,34 +1575,32 @@ void amdgpu_atombios_scratch_regs_restore(struct amdgpu_device *adev)
 		WREG32(mmBIOS_SCRATCH_0 + i, adev->bios_scratch[i]);
 }
 
-/* Atom needs data in little endian format
- * so swap as appropriate when copying data to
- * or from atom. Note that atom operates on
- * dw units.
+/* Atom needs data in little endian format so swap as appropriate when copying
+ * data to or from atom. Note that atom operates on dw units.
+ *
+ * Use to_le=true when sending data to atom and provide at least
+ * ALIGN(num_bytes,4) bytes in the dst buffer.
+ *
+ * Use to_le=false when receiving data from atom and provide ALIGN(num_bytes,4)
+ * byes in the src buffer.
  */
 void amdgpu_atombios_copy_swap(u8 *dst, u8 *src, u8 num_bytes, bool to_le)
 {
 #ifdef __BIG_ENDIAN
-	u8 src_tmp[20], dst_tmp[20]; /* used for byteswapping */
-	u32 *dst32, *src32;
+	u32 src_tmp[5], dst_tmp[5];
 	int i;
+	u8 align_num_bytes = ALIGN(num_bytes, 4);
 
-	memcpy(src_tmp, src, num_bytes);
-	src32 = (u32 *)src_tmp;
-	dst32 = (u32 *)dst_tmp;
 	if (to_le) {
-		for (i = 0; i < ((num_bytes + 3) / 4); i++)
-			dst32[i] = cpu_to_le32(src32[i]);
-		memcpy(dst, dst_tmp, num_bytes);
+		memcpy(src_tmp, src, num_bytes);
+		for (i = 0; i < align_num_bytes / 4; i++)
+			dst_tmp[i] = cpu_to_le32(src_tmp[i]);
+		memcpy(dst, dst_tmp, align_num_bytes);
 	} else {
-		u8 dws = num_bytes & ~3;
-		for (i = 0; i < ((num_bytes + 3) / 4); i++)
-			dst32[i] = le32_to_cpu(src32[i]);
-		memcpy(dst, dst_tmp, dws);
-		if (num_bytes % 4) {
-			for (i = 0; i < (num_bytes % 4); i++)
-				dst[dws+i] = dst_tmp[dws+i];
-		}
+		memcpy(src_tmp, src, align_num_bytes);
+		for (i = 0; i < align_num_bytes / 4; i++)
+			dst_tmp[i] = le32_to_cpu(src_tmp[i]);
+		memcpy(dst, dst_tmp, num_bytes);
 	}
 #else
 	memcpy(dst, src, num_bytes);
diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c
index b5760851195c..0c6216a6ee9e 100644
--- a/drivers/gpu/drm/radeon/atombios_dp.c
+++ b/drivers/gpu/drm/radeon/atombios_dp.c
@@ -45,34 +45,32 @@ static char *pre_emph_names[] = {
 
 /***** radeon AUX functions *****/
 
-/* Atom needs data in little endian format
- * so swap as appropriate when copying data to
- * or from atom. Note that atom operates on
- * dw units.
+/* Atom needs data in little endian format so swap as appropriate when copying
+ * data to or from atom. Note that atom operates on dw units.
+ *
+ * Use to_le=true when sending data to atom and provide at least
+ * ALIGN(num_bytes,4) bytes in the dst buffer.
+ *
+ * Use to_le=false when receiving data from atom and provide ALIGN(num_bytes,4)
+ * byes in the src buffer.
  */
 void radeon_atom_copy_swap(u8 *dst, u8 *src, u8 num_bytes, bool to_le)
 {
 #ifdef __BIG_ENDIAN
-	u8 src_tmp[20], dst_tmp[20]; /* used for byteswapping */
-	u32 *dst32, *src32;
+	u32 src_tmp[5], dst_tmp[5];
 	int i;
+	u8 align_num_bytes = ALIGN(num_bytes, 4);
 
-	memcpy(src_tmp, src, num_bytes);
-	src32 = (u32 *)src_tmp;
-	dst32 = (u32 *)dst_tmp;
 	if (to_le) {
-		for (i = 0; i < ((num_bytes + 3) / 4); i++)
-			dst32[i] = cpu_to_le32(src32[i]);
-		memcpy(dst, dst_tmp, num_bytes);
+		memcpy(src_tmp, src, num_bytes);
+		for (i = 0; i < align_num_bytes / 4; i++)
+			dst_tmp[i] = cpu_to_le32(src_tmp[i]);
+		memcpy(dst, dst_tmp, align_num_bytes);
 	} else {
-		u8 dws = num_bytes & ~3;
-		for (i = 0; i < ((num_bytes + 3) / 4); i++)
-			dst32[i] = le32_to_cpu(src32[i]);
-		memcpy(dst, dst_tmp, dws);
-		if (num_bytes % 4) {
-			for (i = 0; i < (num_bytes % 4); i++)
-				dst[dws+i] = dst_tmp[dws+i];
-		}
+		memcpy(src_tmp, src, align_num_bytes);
+		for (i = 0; i < align_num_bytes / 4; i++)
+			dst_tmp[i] = le32_to_cpu(src_tmp[i]);
+		memcpy(dst, dst_tmp, num_bytes);
 	}
 #else
 	memcpy(dst, src, num_bytes);

From a155a9568d9abf50fe4913c4bf7b46a8281d3dc3 Mon Sep 17 00:00:00 2001
From: Jonathan Liu <net147@gmail.com>
Date: Mon, 7 Aug 2017 21:55:45 +1000
Subject: [PATCH 20/28] drm/panel: simple: Add missing panel_simple_unprepare()
 calls

commit f3621a8eb59a913612c8e6e37d81f16b649f8b6c upstream.

During panel removal or system shutdown panel_simple_disable() is called
which disables the panel backlight but the panel is still powered due to
missing calls to panel_simple_unprepare().

Fixes: d02fd93e2cd8 ("drm/panel: simple - Disable panel on shutdown")
Signed-off-by: Jonathan Liu <net147@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170807115545.27747-1-net147@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/panel/panel-simple.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c
index f97b73ec4713..f418c002d323 100644
--- a/drivers/gpu/drm/panel/panel-simple.c
+++ b/drivers/gpu/drm/panel/panel-simple.c
@@ -352,6 +352,7 @@ static int panel_simple_remove(struct device *dev)
 	drm_panel_remove(&panel->base);
 
 	panel_simple_disable(&panel->base);
+	panel_simple_unprepare(&panel->base);
 
 	if (panel->ddc)
 		put_device(&panel->ddc->dev);
@@ -367,6 +368,7 @@ static void panel_simple_shutdown(struct device *dev)
 	struct panel_simple *panel = dev_get_drvdata(dev);
 
 	panel_simple_disable(&panel->base);
+	panel_simple_unprepare(&panel->base);
 }
 
 static const struct drm_display_mode ampire_am800480r3tmqwa1h_mode = {

From c8ea49b690eef5c180369a8d132c5e50a74b3acf Mon Sep 17 00:00:00 2001
From: Brent Taylor <motobud@gmail.com>
Date: Mon, 30 Oct 2017 22:32:45 -0500
Subject: [PATCH 21/28] mtd: nand: Fix writing mtdoops to nand flash.

commit 30863e38ebeb500a31cecee8096fb5002677dd9b upstream.

When mtdoops calls mtd_panic_write(), it eventually calls
panic_nand_write() in nand_base.c. In order to properly wait for the
nand chip to be ready in panic_nand_wait(), the chip must first be
selected.

When using the atmel nand flash controller, a panic would occur due to
a NULL pointer exception.

Fixes: 2af7c6539931 ("mtd: Add panic_write for NAND flashes")
Cc: <stable@vger.kernel.org>
Signed-off-by: Brent Taylor <motobud@gmail.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mtd/nand/nand_base.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 54ab48827258..7ba109e8cf88 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2663,15 +2663,18 @@ static int panic_nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 			    size_t *retlen, const uint8_t *buf)
 {
 	struct nand_chip *chip = mtd->priv;
+	int chipnr = (int)(to >> chip->chip_shift);
 	struct mtd_oob_ops ops;
 	int ret;
 
-	/* Wait for the device to get ready */
-	panic_nand_wait(mtd, chip, 400);
-
 	/* Grab the device */
 	panic_nand_get_device(chip, mtd, FL_WRITING);
 
+	chip->select_chip(mtd, chipnr);
+
+	/* Wait for the device to get ready */
+	panic_nand_wait(mtd, chip, 400);
+
 	memset(&ops, 0, sizeof(ops));
 	ops.len = len;
 	ops.datbuf = (uint8_t *)buf;

From a61474fa8749f67ef18aa2104b48b0b56ea654b7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 25 Aug 2017 17:34:41 +1000
Subject: [PATCH 22/28] NFS: revalidate "." etc correctly on "open".

commit b688741cb06695312f18b730653d6611e1bad28d upstream.

For correct close-to-open semantics, NFS must validate
the change attribute of a directory (or file) on open.

Since commit ecf3d1f1aa74 ("vfs: kill FS_REVAL_DOT by adding a
d_weak_revalidate dentry op"), open() of "." or a path ending ".." is
not revalidated reliably (except when that direct is a mount point).

Prior to that commit, "." was revalidated using nfs_lookup_revalidate()
which checks the LOOKUP_OPEN flag and forces revalidation if the flag is
set.
Since that commit, nfs_weak_revalidate() is used for NFSv3 (which
ignores the flags) and nothing is used for NFSv4.

This is fixed by using nfs_lookup_verify_inode() in
nfs_weak_revalidate().  This does the revalidation exactly when needed.
Also, add a definition of .d_weak_revalidate for NFSv4.

The incorrect behavior is easily demonstrated by running "echo *" in
some non-mountpoint NFS directory while watching network traffic.
Without this patch, "echo *" sometimes doesn't produce any traffic.
With the patch it always does.

Fixes: ecf3d1f1aa74 ("vfs: kill FS_REVAL_DOT by adding a d_weak_revalidate dentry op")
cc: stable@vger.kernel.org (3.9+)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/dir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 348e0a05bd18..44e09483d2cd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1260,7 +1260,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
 		return 0;
 	}
 
-	error = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	error = nfs_lookup_verify_inode(inode, flags);
 	dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
 			__func__, inode->i_ino, error ? "invalid" : "valid");
 	return !error;
@@ -1420,6 +1420,7 @@ static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
 
 const struct dentry_operations nfs4_dentry_operations = {
 	.d_revalidate	= nfs4_lookup_revalidate,
+	.d_weak_revalidate	= nfs_weak_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
 	.d_automount	= nfs_d_automount,

From e6bcff7b6aa1c0fb30234864821e8169eeb8054d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
Date: Thu, 23 Nov 2017 21:41:56 +0200
Subject: [PATCH 23/28] drm/i915: Don't try indexed reads to alternate slave
 addresses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit ae5c631e605a452a5a0e73205a92810c01ed954b upstream.

We can only specify the one slave address to indexed reads/writes.
Make sure the messages we check are destined to the same slave
address before deciding to do an indexed transfer.

Cc: Daniel Kurtz <djkurtz@chromium.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Sean Paul <seanpaul@chromium.org>
Fixes: 56f9eac05489 ("drm/i915/intel_i2c: use INDEX cycles for i2c read transactions")
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171123194157.25367-2-ville.syrjala@linux.intel.com
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
(cherry picked from commit c4deb62d7821672265b87952bcd1c808f3bf3e8f)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/i915/intel_i2c.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c
index f3bee54c414f..ce625496ce01 100644
--- a/drivers/gpu/drm/i915/intel_i2c.c
+++ b/drivers/gpu/drm/i915/intel_i2c.c
@@ -440,6 +440,7 @@ static bool
 gmbus_is_index_read(struct i2c_msg *msgs, int i, int num)
 {
 	return (i + 1 < num &&
+		msgs[i].addr == msgs[i + 1].addr &&
 		!(msgs[i].flags & I2C_M_RD) && msgs[i].len <= 2 &&
 		(msgs[i + 1].flags & I2C_M_RD));
 }

From 68e6cd9a0547d40d921baff918c111db96d96d76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
Date: Thu, 23 Nov 2017 21:41:57 +0200
Subject: [PATCH 24/28] drm/i915: Prevent zero length "index" write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 56350fb8978bbf4aafe08f21234e161dd128b417 upstream.

The hardware always writes one or two bytes in the index portion of
an indexed transfer. Make sure the message we send as the index
doesn't have a zero length.

Cc: Daniel Kurtz <djkurtz@chromium.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Sean Paul <seanpaul@chromium.org>
Fixes: 56f9eac05489 ("drm/i915/intel_i2c: use INDEX cycles for i2c read transactions")
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171123194157.25367-3-ville.syrjala@linux.intel.com
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
(cherry picked from commit bb9e0d4bca50f429152e74a459160b41f3d60fb2)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/i915/intel_i2c.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c
index ce625496ce01..cb4313c68f71 100644
--- a/drivers/gpu/drm/i915/intel_i2c.c
+++ b/drivers/gpu/drm/i915/intel_i2c.c
@@ -441,7 +441,8 @@ gmbus_is_index_read(struct i2c_msg *msgs, int i, int num)
 {
 	return (i + 1 < num &&
 		msgs[i].addr == msgs[i + 1].addr &&
-		!(msgs[i].flags & I2C_M_RD) && msgs[i].len <= 2 &&
+		!(msgs[i].flags & I2C_M_RD) &&
+		(msgs[i].len == 1 || msgs[i].len == 2) &&
 		(msgs[i + 1].flags & I2C_M_RD));
 }
 

From d7f5f10f5159fd37dff9d566c0e6a9a61f2c8a66 Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Tue, 14 Jun 2016 23:28:06 -0400
Subject: [PATCH 25/28] nfsd: Make init_open_stateid() a bit more whole

commit 8c7245abda877d4689b3371db8ae2a4400d7d9ce upstream.

Move the state selection logic inside from the caller,
always making it return correct stp to use.

Signed-off-by: J . Bruce Fields <bfields@fieldses.org>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs4state.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 421935f3d909..76caee97d2cd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3420,13 +3420,14 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
 }
 
 static struct nfs4_ol_stateid *
-init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
-		struct nfsd4_open *open)
+init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
 {
 
 	struct nfs4_openowner *oo = open->op_openowner;
 	struct nfs4_ol_stateid *retstp = NULL;
+	struct nfs4_ol_stateid *stp;
 
+	stp = open->op_stp;
 	/* We are moving these outside of the spinlocks to avoid the warnings */
 	mutex_init(&stp->st_mutex);
 	mutex_lock(&stp->st_mutex);
@@ -3437,6 +3438,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 	retstp = nfsd4_find_existing_open(fp, open);
 	if (retstp)
 		goto out_unlock;
+
+	open->op_stp = NULL;
 	atomic_inc(&stp->st_stid.sc_count);
 	stp->st_stid.sc_type = NFS4_OPEN_STID;
 	INIT_LIST_HEAD(&stp->st_locks);
@@ -3454,10 +3457,11 @@ out_unlock:
 	spin_unlock(&oo->oo_owner.so_client->cl_lock);
 	if (retstp) {
 		mutex_lock(&retstp->st_mutex);
-		/* Not that we need to, just for neatness */
+		/* To keep mutex tracking happy */
 		mutex_unlock(&stp->st_mutex);
+		stp = retstp;
 	}
-	return retstp;
+	return stp;
 }
 
 /*
@@ -4260,7 +4264,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
 	struct nfs4_file *fp = NULL;
 	struct nfs4_ol_stateid *stp = NULL;
-	struct nfs4_ol_stateid *swapstp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	__be32 status;
 
@@ -4297,16 +4300,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 			goto out;
 		}
 	} else {
-		stp = open->op_stp;
-		open->op_stp = NULL;
-		/*
-		 * init_open_stateid() either returns a locked stateid
-		 * it found, or initializes and locks the new one we passed in
-		 */
-		swapstp = init_open_stateid(stp, fp, open);
-		if (swapstp) {
-			nfs4_put_stid(&stp->st_stid);
-			stp = swapstp;
+		/* stp is returned locked. */
+		stp = init_open_stateid(fp, open);
+		/* See if we lost the race to some other thread */
+		if (stp->st_access_bmap != 0) {
 			status = nfs4_upgrade_open(rqstp, fp, current_fh,
 						stp, open);
 			if (status) {

From 54298082dbacba2591b2aa461320f31946cdbffb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 3 Nov 2017 08:00:10 -0400
Subject: [PATCH 26/28] nfsd: Fix stateid races between OPEN and CLOSE

commit 15ca08d3299682dc49bad73251677b2c5017ef08 upstream.

Open file stateids can linger on the nfs4_file list of stateids even
after they have been closed. In order to avoid reusing such a
stateid, and confusing the client, we need to recheck the
nfs4_stid's type after taking the mutex.
Otherwise, we risk reusing an old stateid that was already closed,
which will confuse clients that expect new stateids to conform to
RFC7530 Sections 9.1.4.2 and 16.2.5 or RFC5661 Sections 8.2.2 and 18.2.4.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs4state.c | 67 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 76caee97d2cd..3bf086aca5c0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3379,7 +3379,9 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
 		/* ignore lock owners */
 		if (local->st_stateowner->so_is_open_owner == 0)
 			continue;
-		if (local->st_stateowner == &oo->oo_owner) {
+		if (local->st_stateowner != &oo->oo_owner)
+			continue;
+		if (local->st_stid.sc_type == NFS4_OPEN_STID) {
 			ret = local;
 			atomic_inc(&ret->st_stid.sc_count);
 			break;
@@ -3388,6 +3390,52 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
 	return ret;
 }
 
+static __be32
+nfsd4_verify_open_stid(struct nfs4_stid *s)
+{
+	__be32 ret = nfs_ok;
+
+	switch (s->sc_type) {
+	default:
+		break;
+	case NFS4_CLOSED_STID:
+	case NFS4_CLOSED_DELEG_STID:
+		ret = nfserr_bad_stateid;
+		break;
+	case NFS4_REVOKED_DELEG_STID:
+		ret = nfserr_deleg_revoked;
+	}
+	return ret;
+}
+
+/* Lock the stateid st_mutex, and deal with races with CLOSE */
+static __be32
+nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+	__be32 ret;
+
+	mutex_lock(&stp->st_mutex);
+	ret = nfsd4_verify_open_stid(&stp->st_stid);
+	if (ret != nfs_ok)
+		mutex_unlock(&stp->st_mutex);
+	return ret;
+}
+
+static struct nfs4_ol_stateid *
+nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+	struct nfs4_ol_stateid *stp;
+	for (;;) {
+		spin_lock(&fp->fi_lock);
+		stp = nfsd4_find_existing_open(fp, open);
+		spin_unlock(&fp->fi_lock);
+		if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok)
+			break;
+		nfs4_put_stid(&stp->st_stid);
+	}
+	return stp;
+}
+
 static struct nfs4_openowner *
 alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
 			   struct nfsd4_compound_state *cstate)
@@ -3432,6 +3480,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
 	mutex_init(&stp->st_mutex);
 	mutex_lock(&stp->st_mutex);
 
+retry:
 	spin_lock(&oo->oo_owner.so_client->cl_lock);
 	spin_lock(&fp->fi_lock);
 
@@ -3456,7 +3505,11 @@ out_unlock:
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&oo->oo_owner.so_client->cl_lock);
 	if (retstp) {
-		mutex_lock(&retstp->st_mutex);
+		/* Handle races with CLOSE */
+		if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
+			nfs4_put_stid(&retstp->st_stid);
+			goto retry;
+		}
 		/* To keep mutex tracking happy */
 		mutex_unlock(&stp->st_mutex);
 		stp = retstp;
@@ -4277,9 +4330,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
 			goto out;
-		spin_lock(&fp->fi_lock);
-		stp = nfsd4_find_existing_open(fp, open);
-		spin_unlock(&fp->fi_lock);
+		stp = nfsd4_find_and_lock_existing_open(fp, open);
 	} else {
 		open->op_file = NULL;
 		status = nfserr_bad_stateid;
@@ -4293,7 +4344,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
-		mutex_lock(&stp->st_mutex);
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status) {
 			mutex_unlock(&stp->st_mutex);
@@ -5150,7 +5200,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 	bool unhashed;
 	LIST_HEAD(reaplist);
 
-	s->st_stid.sc_type = NFS4_CLOSED_STID;
 	spin_lock(&clp->cl_lock);
 	unhashed = unhash_open_stateid(s, &reaplist);
 
@@ -5189,10 +5238,12 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_bump_seqid(cstate, status);
 	if (status)
 		goto out; 
+
+	stp->st_stid.sc_type = NFS4_CLOSED_STID;
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
-	mutex_unlock(&stp->st_mutex);
 
 	nfsd4_close_open_stateid(stp);
+	mutex_unlock(&stp->st_mutex);
 
 	/* put reference from nfs4_preprocess_seqid_op */
 	nfs4_put_stid(&stp->st_stid);

From bd249dd078272e8cef53b98a73c1cdd60c532251 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 3 Nov 2017 08:00:11 -0400
Subject: [PATCH 27/28] nfsd: Fix another OPEN stateid race

commit d8a1a000555ecd1b824ac1ed6df8fe364dfbbbb0 upstream.

If nfsd4_process_open2() is initialising a new stateid, and yet the
call to nfs4_get_vfs_file() fails for some reason, then we must
declare the stateid closed, and unhash it before dropping the mutex.

Right now, we unhash the stateid after dropping the mutex, and without
changing the stateid type, meaning that another OPEN could theoretically
look it up and attempt to use it.

Reported-by: Andrew W Elble <aweits@rit.edu>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs4state.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3bf086aca5c0..11c67e8b939d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4319,6 +4319,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	struct nfs4_ol_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	__be32 status;
+	bool new_stp = false;
 
 	/*
 	 * Lookup file; if found, lookup stateid and check open request,
@@ -4338,11 +4339,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 			goto out;
 	}
 
+	if (!stp) {
+		stp = init_open_stateid(fp, open);
+		if (!open->op_stp)
+			new_stp = true;
+	}
+
 	/*
 	 * OPEN the file, or upgrade an existing OPEN.
 	 * If truncate fails, the OPEN fails.
+	 *
+	 * stp is already locked.
 	 */
-	if (stp) {
+	if (!new_stp) {
 		/* Stateid was found, this is an OPEN upgrade */
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status) {
@@ -4350,22 +4359,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 			goto out;
 		}
 	} else {
-		/* stp is returned locked. */
-		stp = init_open_stateid(fp, open);
-		/* See if we lost the race to some other thread */
-		if (stp->st_access_bmap != 0) {
-			status = nfs4_upgrade_open(rqstp, fp, current_fh,
-						stp, open);
-			if (status) {
-				mutex_unlock(&stp->st_mutex);
-				goto out;
-			}
-			goto upgrade_out;
-		}
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
 		if (status) {
-			mutex_unlock(&stp->st_mutex);
+			stp->st_stid.sc_type = NFS4_CLOSED_STID;
 			release_open_stateid(stp);
+			mutex_unlock(&stp->st_mutex);
 			goto out;
 		}
 
@@ -4374,7 +4372,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		if (stp->st_clnt_odstate == open->op_odstate)
 			open->op_odstate = NULL;
 	}
-upgrade_out:
+
 	nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
 	mutex_unlock(&stp->st_mutex);
 

From bd379939182247ace70e425ab60b3b1352db601b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 5 Dec 2017 11:22:52 +0100
Subject: [PATCH 28/28] Linux 4.4.104

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f5a51cd7ca49..55500e023f61 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 4
-SUBLEVEL = 103
+SUBLEVEL = 104
 EXTRAVERSION =
 NAME = Blurry Fish Butt