From 7cbb87d67e38cfc55680290a706fd7517f10050d Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Tue, 28 Oct 2014 19:36:45 +0000
Subject: [PATCH 1/5] arm64: KVM: fix unmapping with 48-bit VAs

Currently if using a 48-bit VA, tearing down the hyp page tables (which
can happen in the absence of a GICH or GICV resource) results in the
rather nasty splat below, evidently because we access a table that
doesn't actually exist.

Commit 38f791a4e499792e ("arm64: KVM: Implement 48 VA support for KVM
EL2 and Stage-2") added a pgd_none check to __create_hyp_mappings to
account for the additional level of tables, but didn't add a
corresponding check to unmap_range, and this seems to be the source of
the problem.

This patch adds the missing pgd_none check, ensuring we don't try to
access tables that don't exist.
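
For reference, the fixed top-level walk looks as follows (an annotated
sketch of the code this patch produces; kvm_pgd_addr_end and unmap_puds
are the existing helpers in arch/arm/kvm/mmu.c):

	/*
	 * Top-level loop of unmap_range() after this patch: with the
	 * extra level of tables, the pgd entry covering a region that
	 * was never mapped is legitimately empty, so only descend
	 * into unmap_puds() when a next-level table actually exists.
	 */
	pgd = pgdp + pgd_index(addr);
	do {
		next = kvm_pgd_addr_end(addr, end);
		if (!pgd_none(*pgd))
			unmap_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);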

Original splat below:

kvm [1]: Using HYP init bounce page @83fe94a000
kvm [1]: Cannot obtain GICH resource
Unable to handle kernel paging request at virtual address ffff7f7fff000000
pgd = ffff800000770000
[ffff7f7fff000000] *pgd=0000000000000000
Internal error: Oops: 96000004 [#1] PREEMPT SMP
Modules linked in:
CPU: 1 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc2+ #89
task: ffff8003eb500000 ti: ffff8003eb45c000 task.ti: ffff8003eb45c000
PC is at unmap_range+0x120/0x580
LR is at free_hyp_pgds+0xac/0xe4
pc : [] lr : [] pstate: 80000045
sp : ffff8003eb45fbf0
x29: ffff8003eb45fbf0 x28: ffff800000736000
x27: ffff800000735000 x26: ffff7f7fff000000
x25: 0000000040000000 x24: ffff8000006f5000
x23: 0000000000000000 x22: 0000007fffffffff
x21: 0000800000000000 x20: 0000008000000000
x19: 0000000000000000 x18: ffff800000648000
x17: ffff800000537228 x16: 0000000000000000
x15: 000000000000001f x14: 0000000000000000
x13: 0000000000000001 x12: 0000000000000020
x11: 0000000000000062 x10: 0000000000000006
x9 : 0000000000000000 x8 : 0000000000000063
x7 : 0000000000000018 x6 : 00000003ff000000
x5 : ffff800000744188 x4 : 0000000000000001
x3 : 0000000040000000 x2 : ffff800000000000
x1 : 0000007fffffffff x0 : 000000003fffffff

Process swapper/0 (pid: 1, stack limit = 0xffff8003eb45c058)
Stack: (0xffff8003eb45fbf0 to 0xffff8003eb460000)
fbe0:                                     eb45fcb0 ffff8003 0009cad8 ffff8000
fc00: 00000000 00000080 00736140 ffff8000 00736000 ffff8000 00000000 00007c80
fc20: 00000000 00000080 006f5000 ffff8000 00000000 00000080 00743000 ffff8000
fc40: 00735000 ffff8000 006d3030 ffff8000 006fe7b8 ffff8000 00000000 00000080
fc60: ffffffff 0000007f fdac1000 ffff8003 fd94b000 ffff8003 fda47000 ffff8003
fc80: 00502b40 ffff8000 ff000000 ffff7f7f fdec6000 00008003 fdac1630 ffff8003
fca0: eb45fcb0 ffff8003 ffffffff 0000007f eb45fd00 ffff8003 0009b378 ffff8000
fcc0: ffffffea 00000000 006fe000 ffff8000 00736728 ffff8000 00736120 ffff8000
fce0: 00000040 00000000 00743000 ffff8000 006fe7b8 ffff8000 0050cd48 00000000
fd00: eb45fd60 ffff8003 00096070 ffff8000 006f06e0 ffff8000 006f06e0 ffff8000
fd20: fd948b40 ffff8003 0009a320 ffff8000 00000000 00000000 00000000 00000000
fd40: 00000ae0 00000000 006aa25c ffff8000 eb45fd60 ffff8003 0017ca44 00000002
fd60: eb45fdc0 ffff8003 0009a33c ffff8000 006f06e0 ffff8000 006f06e0 ffff8000
fd80: fd948b40 ffff8003 0009a320 ffff8000 00000000 00000000 00735000 ffff8000
fda0: 006d3090 ffff8000 006aa25c ffff8000 00735000 ffff8000 006d3030 ffff8000
fdc0: eb45fdd0 ffff8003 000814c0 ffff8000 eb45fe50 ffff8003 006aaac4 ffff8000
fde0: 006ddd90 ffff8000 00000006 00000000 006d3000 ffff8000 00000095 00000000
fe00: 006a1e90 ffff8000 00735000 ffff8000 006d3000 ffff8000 006aa25c ffff8000
fe20: 00735000 ffff8000 006d3030 ffff8000 eb45fe50 ffff8003 006fac68 ffff8000
fe40: 00000006 00000006 fe293ee6 ffff8003 eb45feb0 ffff8003 004f8ee8 ffff8000
fe60: 004f8ed4 ffff8000 00735000 ffff8000 00000000 00000000 00000000 00000000
fe80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
fea0: 00000000 00000000 00000000 00000000 00000000 00000000 000843d0 ffff8000
fec0: 004f8ed4 ffff8000 00000000 00000000 00000000 00000000 00000000 00000000
fee0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ff00: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ff20: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ff40: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ff60: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ff80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ffa0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
ffc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000005 00000000
ffe0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
Call trace:
[] unmap_range+0x120/0x580
[] free_hyp_pgds+0xa8/0xe4
[] kvm_arch_init+0x268/0x44c
[] kvm_init+0x24/0x260
[] arm_init+0x18/0x24
[] do_one_initcall+0x88/0x1a0
[] kernel_init_freeable+0x148/0x1e8
[] kernel_init+0x10/0xd4
Code: 8b000263 92628479 d1000720 eb01001f (f9400340)
---[ end trace 3bc230562e926fa4 ]---
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b

Signed-off-by: Mark Rutland
Cc: Catalin Marinas
Cc: Jungseok Lee
Acked-by: Marc Zyngier
Acked-by: Christoffer Dall
Signed-off-by: Marc Zyngier
Signed-off-by: Paolo Bonzini
---
 arch/arm/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 57a403a5c22b..79d3fbfb5b0e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -197,7 +197,8 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
 	pgd = pgdp + pgd_index(addr);
 	do {
 		next = kvm_pgd_addr_end(addr, end);
-		unmap_puds(kvm, pgd, addr, next);
+		if (!pgd_none(*pgd))
+			unmap_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
 

From db7dedd0de714a59d8fa2a1be3b6b00543ad415a Mon Sep 17 00:00:00 2001
From: Christoffer Dall
Date: Wed, 19 Nov 2014 11:23:54 +0000
Subject: [PATCH 2/5] arm64: KVM: Handle traps of ICC_SRE_EL1 as RAZ/WI

When running on a system with a GICv3, we currently don't allow the
guest to access the system register interface of the GICv3. We do this
by clearing ICC_SRE_EL2.Enable, which causes all guest accesses to
ICC_SRE_EL1 to trap to EL2, and all guest accesses to the other ICC_
registers to cause an undefined exception in the guest.

However, we currently don't handle the trap of guest accesses to
ICC_SRE_EL1 and will spill out a warning. The trap just needs to handle
the access as RAZ/WI: a guest that tries to prod this register and set
ICC_SRE_EL1.SRE=1 must read back the value (which Linux already does)
to see if it succeeded, and will thus observe that ICC_SRE_EL1.SRE was
not set.

Add the simple trap handler in the sorted table of the system
registers.
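
To illustrate why RAZ/WI is sufficient, the guest-side probe sequence
is roughly the following (a sketch only; the accessor names are
illustrative and not part of this patch, though ICC_SRE_EL1_SRE is the
usual name for the SRE bit):

	/*
	 * Guest probe (sketch): try to enable the system register
	 * interface, then read back. With ICC_SRE_EL1 trapped as
	 * RAZ/WI, the write is ignored and the read returns 0, so
	 * the guest sees SRE stuck at 0 and falls back to the
	 * memory-mapped GIC interface.
	 */
	u64 sre = read_icc_sre_el1();
	write_icc_sre_el1(sre | ICC_SRE_EL1_SRE);
	isb();
	if (!(read_icc_sre_el1() & ICC_SRE_EL1_SRE))
		gic_use_mmio_interface();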

Signed-off-by: Christoffer Dall
[ardb: added cp15 handling]
Signed-off-by: Ard Biesheuvel
Signed-off-by: Marc Zyngier
Signed-off-by: Paolo Bonzini
---
 arch/arm64/kvm/sys_regs.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 4cc3b719208e..3d7c2df89946 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -424,6 +424,11 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	/* VBAR_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
 	  NULL, reset_val, VBAR_EL1, 0 },
+
+	/* ICC_SRE_EL1 */
+	{ Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101),
+	  trap_raz_wi },
+
 	/* CONTEXTIDR_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001),
 	  access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
@@ -690,6 +695,10 @@ static const struct sys_reg_desc cp15_regs[] = {
 	{ Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
 	{ Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 },
 	{ Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 },
+
+	/* ICC_SRE */
+	{ Op1( 0), CRn(12), CRm(12), Op2( 5), trap_raz_wi },
+
 	{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
 };
 

From 6b50f54064a02b77a7b990032b80234fee59bcd6 Mon Sep 17 00:00:00 2001
From: Christoffer Dall
Date: Thu, 6 Nov 2014 11:47:39 +0000
Subject: [PATCH 3/5] arm/arm64: KVM: vgic: Fix error code in kvm_vgic_create()

If we detect another vCPU is running we just exit and return 0 as if we
successfully created the VGIC, but the VGIC wouldn't actually be
created. This shouldn't break in-kernel behavior because the kernel
will not observe the failed attempt to create the VGIC, but userspace
could be rightfully confused.

Cc: Andre Przywara
Signed-off-by: Christoffer Dall
Signed-off-by: Marc Zyngier
Signed-off-by: Paolo Bonzini
---
 virt/kvm/arm/vgic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 3aaca49de325..aacdb59f30de 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1933,7 +1933,7 @@ out:
 
 int kvm_vgic_create(struct kvm *kvm)
 {
-	int i, vcpu_lock_idx = -1, ret = 0;
+	int i, vcpu_lock_idx = -1, ret;
 	struct kvm_vcpu *vcpu;
 
 	mutex_lock(&kvm->lock);
@@ -1948,6 +1948,7 @@ int kvm_vgic_create(struct kvm *kvm)
 	 * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
 	 * that no other VCPUs are run while we create the vgic.
 	 */
+	ret = -EBUSY;
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		if (!mutex_trylock(&vcpu->mutex))
 			goto out_unlock;
@@ -1955,11 +1956,10 @@ int kvm_vgic_create(struct kvm *kvm)
 	}
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (vcpu->arch.has_run_once) {
-			ret = -EBUSY;
+		if (vcpu->arch.has_run_once)
 			goto out_unlock;
-		}
 	}
+	ret = 0;
 
 	spin_lock_init(&kvm->arch.vgic.lock);
 	kvm->arch.vgic.in_kernel = true;

From bb55e9b131d70ab9e30d73ab1342ad4907f9e0de Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 10 Nov 2014 09:33:55 +0100
Subject: [PATCH 4/5] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()

Instead of using kvm_is_mmio_pfn() to decide whether a host region
should be stage 2 mapped with device attributes, add a new static
function kvm_is_device_pfn() that disregards RAM pages with the
reserved bit set, as those should usually not be mapped as device
memory.
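
Annotated for illustration, the distinction the new helper draws is the
following (the comment is added here, not in the patch):

	/*
	 * A pfn with no struct page behind it (!pfn_valid) is genuine
	 * MMIO and should get device attributes at stage 2. A valid
	 * RAM pfn whose page merely has the reserved bit set (kernel
	 * text, boot memory, ...) is still RAM and should not.
	 */
	static bool kvm_is_device_pfn(unsigned long pfn)
	{
		return !pfn_valid(pfn);
	}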

Signed-off-by: Ard Biesheuvel
Signed-off-by: Paolo Bonzini
---
 arch/arm/kvm/mmu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 79d3fbfb5b0e..8664ff17cbbe 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -835,6 +835,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_dabt_iswrite(vcpu);
 }
 
+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+	return !pfn_valid(pfn);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -905,7 +910,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_mmio_pfn(pfn))
+	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;
 
 	spin_lock(&kvm->mmu_lock);

From d3fccc7ef831d1d829b4da5eaa081db55b1e38f3 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 10 Nov 2014 09:33:56 +0100
Subject: [PATCH 5/5] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()

This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.

The problem being addressed by the patch above was that some ARM code
based the memory mapping attributes of a pfn on the return value of
kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should be
mapped as device memory.

However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
and the existing non-ARM users were already using it in a way which
suggests that its name should probably have been 'kvm_is_reserved_pfn'
from the beginning, e.g., whether or not to call get_page/put_page on
it etc. This means that returning false for the zero page is a mistake
and the patch above should be reverted.
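
As an illustration of the get_page/put_page use the new name reflects,
kvm_release_pfn_clean() after this patch reads as follows (annotated
copy; the comment is added here, not in the patch):

	void kvm_release_pfn_clean(pfn_t pfn)
	{
		/*
		 * Only drop a reference for pfns backed by a
		 * refcounted struct page. The zero page is
		 * PageReserved, so with the is_zero_pfn() special
		 * case reverted it counts as reserved again and
		 * put_page() is not called on it.
		 */
		if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
			put_page(pfn_to_page(pfn));
	}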

Signed-off-by: Ard Biesheuvel
Signed-off-by: Paolo Bonzini
---
 arch/ia64/kvm/kvm-ia64.c |  2 +-
 arch/x86/kvm/mmu.c       |  6 +++---
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 16 ++++++++--------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index ec6b9acb6bea..dbe46f43884d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 	for (i = 0; i < npages; i++) {
 		pfn = gfn_to_pfn(kvm, base_gfn + i);
-		if (!kvm_is_mmio_pfn(pfn)) {
+		if (!kvm_is_reserved_pfn(pfn)) {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
 					pfn << PAGE_SHIFT,
 				_PAGE_AR_RWX | _PAGE_MA_WB);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac1c4de3a484..978f402006ee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	 * kvm mmu, before reclaiming the page, we should
 	 * unmap it from mmu first.
 	 */
-	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
 
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
@@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+			kvm_is_reserved_pfn(pfn));
 
 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
@@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea53b04993f2..a6059bdf7b03 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
-bool kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_reserved_pfn(pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25ffac9e947d..3cee7b167052 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
-bool kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_reserved_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
-		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+		return PageReserved(pfn_to_page(pfn));
 
 	return true;
 }
@@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	else if ((vma->vm_flags & VM_PFNMAP)) {
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
-		BUG_ON(!kvm_is_mmio_pfn(pfn));
+		BUG_ON(!kvm_is_reserved_pfn(pfn));
 	} else {
 		if (async && vma_is_valid(vma, write_fault))
 			*async = true;
@@ -1427,7 +1427,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
-	if (kvm_is_mmio_pfn(pfn)) {
+	if (kvm_is_reserved_pfn(pfn)) {
 		WARN_ON(1);
 		return KVM_ERR_PTR_BAD_PAGE;
 	}
@@ -1456,7 +1456,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -1477,7 +1477,7 @@ static void kvm_release_pfn_dirty(pfn_t pfn)
 
 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn)) {
+	if (!kvm_is_reserved_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
@@ -1487,14 +1487,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!kvm_is_reserved_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
 void kvm_get_pfn(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!kvm_is_reserved_pfn(pfn))
 		get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);