From 8357b00fc7ddb84306a711547ea7f6cf91f8ef58 Mon Sep 17 00:00:00 2001
From: James Morse
Date: Thu, 19 Jul 2018 10:49:13 +0530
Subject: [PATCH 1/2] arm64: vmlinux.ld: Add mmuoff data sections and move
 mmuoff text into idmap

Resume from hibernate needs to clean any text executed by the kernel
with the MMU off to the PoC. Collect these functions together into the
.idmap.text section, as all this code is tightly coupled and also needs
the same cleaning after resume.

Data is more complicated: secondary_holding_pen_release is written with
the MMU on, cleaned and invalidated, then read with the MMU off. In
contrast, __boot_cpu_mode is written with the MMU off and the
corresponding cache line is invalidated, so when we read it with the
MMU on we don't get stale data. These cache maintenance operations
conflict with each other if the values are within a Cache Writeback
Granule (CWG) of each other.

Collect the data into two sections, .mmuoff.data.read and
.mmuoff.data.write; the linker script ensures the .mmuoff.data.write
section is aligned to the architectural maximum CWG of 2KB.

Change-Id: I3f5add863896e0acaa54dd11929fc1d553d402f4
Signed-off-by: James Morse
Cc: Ard Biesheuvel
Cc: Mark Rutland
Reviewed-by: Catalin Marinas
Signed-off-by: Will Deacon
Git-Commit: b61130381120398876b86282082ad9f24976dfcf
Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Signed-off-by: Arun KS
---
 arch/arm64/include/asm/sections.h  |  1 +
 arch/arm64/kernel/head.S           | 19 +++++++++++++++----
 arch/arm64/kernel/sleep.S          |  2 +-
 arch/arm64/kernel/smp_spin_table.c |  3 ++-
 arch/arm64/kernel/vmlinux.lds.S    | 19 +++++++++++++++++++
 arch/arm64/mm/proc.S               |  4 ++++
 6 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
index 2686c431f8d6..b865e83e57f5 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -23,5 +23,6 @@ extern char __exception_text_start[], __exception_text_end[];
 extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __irqentry_text_start[], __irqentry_text_end[];
+extern char __mmuoff_data_start[], __mmuoff_data_end[];
 
 #endif /* __ASM_SECTIONS_H */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index b8121a51404f..d5d4afbf0518 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -472,7 +472,7 @@ ENDPROC(__primary_switched)
  * end early head section, begin head code that is also used for
  * hotplug and needs to have the same protections as the text region
  */
-	.section ".text","ax"
+	.section ".idmap.text","ax"
 
 ENTRY(kimage_vaddr)
 	.quad		_text - TEXT_OFFSET
@@ -594,6 +594,13 @@ set_cpu_boot_mode_flag:
 	ret
 ENDPROC(set_cpu_boot_mode_flag)
+/*
+ * These values are written with the MMU off, but read with the MMU on.
+ * Writers will invalidate the corresponding address, discarding up to a
+ * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures
+ * sufficient alignment that the CWG doesn't overlap another section.
+ */
+	.pushsection ".mmuoff.data.write", "aw"
 /*
  * We need to find out the CPU boot mode long after boot, so we need to
  * store it in a writable variable.
  *
@@ -601,11 +608,16 @@ ENDPROC(set_cpu_boot_mode_flag)
  * This is not in .bss, because we set it sufficiently early that the boot-time
  * zeroing of .bss would clobber it.
  */
-	.pushsection	.data..cacheline_aligned
-	.align	L1_CACHE_SHIFT
 ENTRY(__boot_cpu_mode)
 	.long	BOOT_CPU_MODE_EL2
 	.long	BOOT_CPU_MODE_EL1
+/*
+ * The booting CPU updates the failed status @__early_cpu_boot_status
+ * with the MMU turned off.
+ */
+ENTRY(__early_cpu_boot_status)
+	.long 0
+
 	.popsection
 
 /*
@@ -679,7 +691,6 @@ ENDPROC(__secondary_switched)
  * Checks if the selected granule size is supported by the CPU.
  * If it isn't, park the CPU
  */
-	.section	".idmap.text", "ax"
 ENTRY(__enable_mmu)
 	mrs	x22, sctlr_el1			// preserve old SCTLR_EL1 value
 	mrs	x1, ID_AA64MMFR0_EL1
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index dd9745262e1a..100f92f13113 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -97,6 +97,7 @@ ENTRY(__cpu_suspend_enter)
 ENDPROC(__cpu_suspend_enter)
 	.ltorg
 
+	.pushsection ".idmap.text", "ax"
 ENTRY(cpu_resume)
 	bl	el2_setup		// if in EL2 drop to EL1 cleanly
 	/* enable the MMU early - so we can access sleep_save_stash by va */
@@ -107,7 +108,6 @@ ENTRY(cpu_resume)
 	b	__cpu_setup
 ENDPROC(cpu_resume)
 
-	.pushsection ".idmap.text", "ax"
 _resume_switched:
 	ldr	x8, =_cpu_resume
 	br	x8
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 2ccb883353d9..303d571702ea 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -30,7 +30,8 @@
 #include <asm/smp_plat.h>
 
 extern void secondary_holding_pen(void);
-volatile unsigned long secondary_holding_pen_release = INVALID_HWID;
+volatile unsigned long __section(".mmuoff.data.read")
+secondary_holding_pen_release = INVALID_HWID;
 
 static phys_addr_t cpu_release_addr[NR_CPUS];
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 42a6c4f35af2..71c8076bbc60 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -197,6 +197,25 @@ SECTIONS
 	_data = .;
 	_sdata = .;
 	RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+
+	/*
+	 * Data written with the MMU off but read with the MMU on requires
+	 * cache lines to be invalidated, discarding up to a Cache Writeback
+	 * Granule (CWG) of data from the cache. Keep the section that
+	 * requires this type of maintenance to be in its own Cache Writeback
+	 * Granule (CWG) area so the cache maintenance operations don't
+	 * interfere with adjacent data.
+	 */
+	.mmuoff.data.write : ALIGN(SZ_2K) {
+		__mmuoff_data_start = .;
+		*(.mmuoff.data.write)
+	}
+	. = ALIGN(SZ_2K);
+	.mmuoff.data.read : {
+		*(.mmuoff.data.read)
+		__mmuoff_data_end = .;
+	}
+
 	PECOFF_EDATA_PADDING
 	_edata = .;
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 7a7cf5c2215d..b78688806652 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -132,6 +132,7 @@ ENDPROC(cpu_do_suspend)
  *
  * x0: Address of context pointer
  */
+	.pushsection ".idmap.text", "ax"
 ENTRY(cpu_do_resume)
 	ldp	x2, x3, [x0]
 	ldp	x4, x5, [x0, #16]
@@ -163,6 +164,7 @@ ENTRY(cpu_do_resume)
 	isb
 	ret
 ENDPROC(cpu_do_resume)
+	.popsection
 #endif
 
 /*
@@ -220,6 +222,7 @@ ENDPROC(idmap_cpu_replace_ttbr1)
  * Initialise the processor for turning the MMU on. Return in x0 the
  * value of the SCTLR_EL1 register.
  */
+	.pushsection ".idmap.text", "ax"
 ENTRY(__cpu_setup)
 	tlbi	vmalle1				// Invalidate local TLB
 	dsb	nsh
@@ -321,3 +324,4 @@ crval:
 #endif
 	.word	0xfcffffff			// clear
 	.word	0x34d5d91d | CR_IBIT | CR_CBIT	// set
+	.popsection

From 66859cc29f5259cf934dcfe1ec9ba8015d591ea9 Mon Sep 17 00:00:00 2001
From: James Morse
Date: Thu, 19 Jul 2018 10:55:14 +0530
Subject: [PATCH 2/2] arm64: hibernate: Support DEBUG_PAGEALLOC

DEBUG_PAGEALLOC removes the valid bit of page table entries to prevent
any access to unallocated memory. Hibernate uses this as a hint that
those pages don't need to be saved/restored. This patch adds the
kernel_page_present() function it uses.

hibernate.c copies the resume kernel's linear map for use during
restore. Add _copy_pte() to fill in the holes made by DEBUG_PAGEALLOC
in the resume kernel, so we can restore data the original kernel had
at these addresses.

Finally, DEBUG_PAGEALLOC means the linear-map alias of KERNEL_START to
KERNEL_END may have holes in it, so we can't lazily clean this whole
area to the PoC. Only clean the new mmuoff region, and the kernel/kvm
idmaps.

This reverts commit da24eb1f3f9e2c7b75c5f8c40d8e48e2c4789596.

Change-Id: I862226802c9c726590c89e3d9e8062ed680309f3
Reported-by: Will Deacon
Signed-off-by: James Morse
Git-Commit: 5ebe3a44cc744d11cb60d8438106a9322b7c04dc
Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Signed-off-by: Arun KS
---
 arch/arm64/include/asm/pgtable.h | 10 ++++++++
 arch/arm64/kernel/hibernate.c    | 41 +++++++++++++++++++++++++-------
 arch/arm64/mm/pageattr.c         | 41 +++++++++++++++++++++++++++++++-
 3 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b190367fe6be..13d6b496de92 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -232,6 +232,16 @@ static inline pte_t pte_mknoncont(pte_t pte)
 	return clear_pte_bit(pte, __pgprot(PTE_CONT));
 }
 
+static inline pte_t pte_clear_rdonly(pte_t pte)
+{
+	return clear_pte_bit(pte, __pgprot(PTE_RDONLY));
+}
+
+static inline pte_t pte_mkpresent(pte_t pte)
+{
+	return set_pte_bit(pte, __pgprot(PTE_VALID));
+}
+
 static inline pmd_t pmd_mkcont(pmd_t pmd)
 {
 	return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 55a872b89fbb..8761eb95ed27 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -234,6 +234,7 @@ out:
 	return rc;
 }
 
+#define dcache_clean_range(start, end)	__flush_dcache_area(start, (end - start))
 
 int swsusp_arch_suspend(void)
 {
@@ -246,8 +247,9 @@ int swsusp_arch_suspend(void)
 	if (__cpu_suspend_enter(&state)) {
 		ret = swsusp_save();
 	} else {
-		/* Clean kernel to PoC for secondary core startup */
-		__flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START);
+		/* Clean kernel core startup/idle code to PoC */
+		dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end);
+		dcache_clean_range(__idmap_text_start, __idmap_text_end);
 
 		/*
 		 * Tell the hibernation core that we've just restored
@@ -263,6 +265,33 @@ int swsusp_arch_suspend(void)
 	return ret;
 }
 
+static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr)
+{
+	pte_t pte = *src_pte;
+
+	if (pte_valid(pte)) {
+		/*
+		 * Resume will overwrite areas that may be marked
+		 * read only (code, rodata). Clear the RDONLY bit from
+		 * the temporary mappings we use during restore.
+		 */
+		set_pte(dst_pte, pte_clear_rdonly(pte));
+	} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
+		/*
+		 * debug_pagealloc will have removed the PTE_VALID bit if
+		 * the page isn't in use by the resume kernel. It may have
+		 * been in use by the original kernel, in which case we need
+		 * to put it back in our copy to do the restore.
+		 *
+		 * Before marking this entry valid, check that the pfn
+		 * is mapped.
+		 */
+		BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+		set_pte(dst_pte, pte_mkpresent(pte_clear_rdonly(pte)));
+	}
+}
+
 static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start,
 		    unsigned long end)
 {
@@ -278,13 +307,7 @@ static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start,
 	src_pte = pte_offset_kernel(src_pmd, start);
 
 	do {
-		if (!pte_none(*src_pte))
-			/*
-			 * Resume will overwrite areas that may be marked
-			 * read only (code, rodata). Clear the RDONLY bit from
-			 * the temporary mappings we use during restore.
-			 */
-			set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY));
+		_copy_pte(dst_pte, src_pte, addr);
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
 	return 0;
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 4754762bde49..6ea71387ee12 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -146,4 +146,43 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
 			__pgprot(0),
 			__pgprot(PTE_VALID));
 }
-#endif
+#ifdef CONFIG_HIBERNATION
+/*
+ * When built with CONFIG_DEBUG_PAGEALLOC and CONFIG_HIBERNATION, this function
+ * is used to determine if a linear map page has been marked as not-valid by
+ * CONFIG_DEBUG_PAGEALLOC. Walk the page table and check the PTE_VALID bit.
+ * This is based on kern_addr_valid(), which almost does what we need.
+ *
+ * Because this is only called on the kernel linear map, p?d_sect() implies
+ * p?d_present(). When debug_pagealloc is enabled, section mappings are
+ * disabled.
+ */
+bool kernel_page_present(struct page *page)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long addr = (unsigned long)page_address(page);
+
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return false;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return false;
+	if (pud_sect(*pud))
+		return true;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return false;
+	if (pmd_sect(*pmd))
+		return true;
+
+	pte = pte_offset_kernel(pmd, addr);
+	return pte_valid(*pte);
+}
+#endif /* CONFIG_HIBERNATION */
+#endif /* CONFIG_DEBUG_PAGEALLOC */
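
Two illustrative notes on patch 1. First, the maintenance conflict the
commit message describes, sketched as C. The two variables below are
hypothetical, not part of the patch; the real occupants of the sections
are __boot_cpu_mode/__early_cpu_boot_status and
secondary_holding_pen_release:

	/*
	 * Stored with the MMU off, then invalidated to the PoC with
	 * "dc ivac", which discards up to a whole Cache Writeback
	 * Granule around the address. Anything else dirtied within that
	 * CWG would be lost, so this section gets a CWG to itself.
	 */
	static u32 __section(".mmuoff.data.write") status_written_mmu_off;

	/*
	 * Stored with the MMU on, then cleaned+invalidated to the PoC
	 * with "dc civac" so an MMU-off reader sees the update. It must
	 * not share a CWG with the variable above, or the writer's
	 * "dc ivac" could discard the cleaned value before it is read.
	 */
	static u32 __section(".mmuoff.data.read") release_written_mmu_on;

The ALIGN(SZ_2K) on .mmuoff.data.write and the second ALIGN(SZ_2K)
before .mmuoff.data.read in vmlinux.lds.S are what keep the two
sections a full architectural CWG apart.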
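
Second, where the "architectural maximum CWG of 2KB" figure comes from:
CTR_EL0.CWG (bits [27:24]) encodes log2 of the writeback granule in
4-byte words, so the largest meaningful encoding, 9, gives 4 << 9 = 2048
bytes. A minimal decode as a sketch; the helper name here is made up,
though the kernel carries a similar cache_type_cwg() helper in
asm/cache.h:

	/* Decode CTR_EL0.CWG into bytes (illustrative helper). */
	static inline unsigned int cwg_bytes(unsigned long ctr_el0)
	{
		unsigned int cwg = (ctr_el0 >> 24) & 0xf;	/* CTR_EL0.CWG */

		/*
		 * A value of 0 means "granule not provided": software must
		 * assume the 2KB architectural maximum. The linker script
		 * cannot read the register at all, so it too must assume
		 * the maximum, hence ALIGN(SZ_2K).
		 */
		return cwg ? 4U << cwg : 2048U;
	}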
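
On patch 2: the new kernel_page_present() hook is consumed by the
hibernate core while it builds the snapshot. A simplified paraphrase of
safe_copy_page() from kernel/power/snapshot.c (not a verbatim copy)
shows the interaction: a page that DEBUG_PAGEALLOC has unmapped is
briefly mapped back so its contents can still be saved:

	static void safe_copy_page(void *dst, struct page *s_page)
	{
		if (kernel_page_present(s_page)) {
			do_copy_page(dst, page_address(s_page));
		} else {
			/*
			 * DEBUG_PAGEALLOC cleared PTE_VALID for this
			 * page: map it, copy it, then unmap it again.
			 */
			kernel_map_pages(s_page, 1, 1);
			do_copy_page(dst, page_address(s_page));
			kernel_map_pages(s_page, 1, 0);
		}
	}

This is why the arm64 implementation only has to report whether the page
is mapped; putting the mapping back is the core code's job, via the
existing __kernel_map_pages() earlier in pageattr.c.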