edac: Snapshot arm cortex-a cpu edac driver

Snapshot cortex-a arm cpu edac driver from msm-3.18@
0922caf50f22e751a05e6d613bb1d0c1dc940d7d
("Merge "usb: dwc3-msm: Fix incorrect roles with
multiple instances")
Also fix up ESR macro usage to follow upstream norms.

Change-Id: Ic5498a1ebc9c9d402baf4b839ed8f427e9510083
Signed-off-by: Abhimanyu Kapur <abhimany@codeaurora.org>
This commit is contained in:
Abhimanyu Kapur 2016-02-22 17:17:26 -08:00 committed by David Keitel
parent 8469f207b4
commit a6aa6045d7
3 changed files with 941 additions and 0 deletions

View file

@ -390,4 +390,44 @@ config EDAC_XGENE
Support for error detection and correction on the
APM X-Gene family of SOCs.
config EDAC_CORTEX_ARM64
depends on EDAC_MM_EDAC && ARM64
bool "ARM Cortex A CPUs L1/L2 Caches"
help
Support for error detection and correction on the
ARM Cortex A53 and A57 CPUs. For debugging issues having to do with
stability and overall system health, you should probably say 'Y'
here.
config EDAC_CORTEX_ARM64_PANIC_ON_CE
depends on EDAC_CORTEX_ARM64
bool "Panic on correctable errors"
help
Forcibly cause a kernel panic if a correctable error (CE) is
detected, even though the error is (by definition) correctable and
would otherwise result in no adverse system effects. This can reduce
debugging times on hardware which may be operating at voltages or
frequencies outside normal specification.
For production builds, you should definitely say 'N' here.
config EDAC_CORTEX_ARM64_DBE_IRQ_ONLY
depends on EDAC_CORTEX_ARM64
bool "Only check for parity errors when an irq is generated"
help
In ARM64, parity errors will cause an interrupt
to be triggered but may also cause a data abort to
occur. Only check for EDAC errors for the interrupt.
If unsure, say no.
config EDAC_CORTEX_ARM64_PANIC_ON_UE
depends on EDAC_CORTEX_ARM64
bool "Panic on uncorrectable errors"
help
Forcibly cause a kernel panic if an uncorrectable error (UE) is
detected. This can reduce debugging times on hardware which may be
operating at voltages or frequencies outside normal specification.
For production builds, you should probably say 'N' here.
endif # EDAC

View file

@ -70,3 +70,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o
obj-$(CONFIG_EDAC_ALTERA_MC) += altera_edac.o
obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o
obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o
obj-$(CONFIG_EDAC_CORTEX_ARM64) += cortex_arm64_edac.o

View file

@ -0,0 +1,900 @@
/* Copyright (c) 2014-2015, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/edac.h>
#include <linux/interrupt.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/perf_event.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/of_irq.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/msm_rtb.h>
#include <asm/cputype.h>
#include <asm/esr.h>
#include "edac_core.h"
/*
 * Field extractors for the Cortex-A53/A57 implementation-defined memory
 * error syndrome registers (CPUMERRSR_EL1 / L2MERRSR_EL1), per the
 * respective TRMs.
 *
 * Fix: all single-bit masks now use 1ULL.  (1 << 31) is undefined
 * behaviour (shift into the sign bit of int) and, in practice, sign
 * extends to 0xFFFFFFFF80000000 when converted to the u64 operand, so
 * the old VALID tests also matched bits 32-63 -- e.g. a register with
 * only the FATAL bit (63) set would read back as "valid".  (1LL << 63)
 * was likewise UB and is now 1ULL << 63.
 */
#define A53_CPUMERRSR_FATAL(a)	((a) & (1ULL << 63))
#define A53_CPUMERRSR_OTHER(a)	(((a) >> 40) & 0xff)
#define A53_CPUMERRSR_REPT(a)	(((a) >> 32) & 0xff)
#define A53_CPUMERRSR_VALID(a)	((a) & (1ULL << 31))
#define A53_CPUMERRSR_RAMID(a)	(((a) >> 24) & 0x7f)
#define A53_CPUMERRSR_CPUID(a)	(((a) >> 18) & 0x07)
#define A53_CPUMERRSR_ADDR(a)	((a) & 0xfff)

#define A53_L2MERRSR_FATAL(a)	((a) & (1ULL << 63))
#define A53_L2MERRSR_OTHER(a)	(((a) >> 40) & 0xff)
#define A53_L2MERRSR_REPT(a)	(((a) >> 32) & 0xff)
#define A53_L2MERRSR_VALID(a)	((a) & (1ULL << 31))
#define A53_L2MERRSR_RAMID(a)	(((a) >> 24) & 0x7f)
#define A53_L2MERRSR_CPUID(a)	(((a) >> 18) & 0x0f)
#define A53_L2MERRSR_INDEX(a)	(((a) >> 3) & 0x3fff)

#define A57_CPUMERRSR_FATAL(a)	((a) & (1ULL << 63))
#define A57_CPUMERRSR_OTHER(a)	(((a) >> 40) & 0xff)
#define A57_CPUMERRSR_REPT(a)	(((a) >> 32) & 0xff)
#define A57_CPUMERRSR_VALID(a)	((a) & (1ULL << 31))
#define A57_CPUMERRSR_RAMID(a)	(((a) >> 24) & 0x7f)
#define A57_CPUMERRSR_BANK(a)	(((a) >> 18) & 0x1f)
#define A57_CPUMERRSR_INDEX(a)	((a) & 0x1ffff)

#define A57_L2MERRSR_FATAL(a)	((a) & (1ULL << 63))
#define A57_L2MERRSR_OTHER(a)	(((a) >> 40) & 0xff)
#define A57_L2MERRSR_REPT(a)	(((a) >> 32) & 0xff)
#define A57_L2MERRSR_VALID(a)	((a) & (1ULL << 31))
#define A57_L2MERRSR_RAMID(a)	(((a) >> 24) & 0x7f)
#define A57_L2MERRSR_CPUID(a)	(((a) >> 18) & 0x0f)
#define A57_L2MERRSR_INDEX(a)	((a) & 0x1ffff)

/* L2ECTLR_EL1 status bits: latched internal / external asynchronous error */
#define L2ECTLR_INT_ERR		(1 << 30)
#define L2ECTLR_EXT_ERR		(1 << 29)

/* Decode an SError syndrome that indicates an L2 double-bit error */
#define ESR_SERROR(a)	((a) >> ESR_ELx_EC_SHIFT == ESR_ELx_EC_SERROR)
#define ESR_VALID(a)	((a) & BIT(24))
#define ESR_L2_DBE(a)	(ESR_SERROR(a) && ESR_VALID(a) && \
			(((a) & 0x00C00003) == 0x1))

/* Offset of the CCI imprecise-error register within the CCI block */
#define CCI_IMPRECISEERROR_REG	0x10

/* EDAC block numbers within each device instance */
#define L1_CACHE	0
#define L2_CACHE	1
#define CCI		2

/* Indices into the errors[] dispatch table below */
#define A53_L1_CE	0
#define A53_L1_UE	1
#define A53_L2_CE	2
#define A53_L2_UE	3
#define A57_L1_CE	4
#define A57_L1_UE	5
#define A57_L2_CE	6
#define A57_L2_UE	7
#define L2_EXT_UE	8
#define CCI_UE		9
/* Compile-time policy: panic when an uncorrectable error is reported. */
#ifdef CONFIG_EDAC_CORTEX_ARM64_PANIC_ON_UE
#define ARM64_ERP_PANIC_ON_UE 1
#else
#define ARM64_ERP_PANIC_ON_UE 0
#endif

/*
 * Panic on correctable errors too.  The default comes from Kconfig but
 * can be overridden at module load time via the panic_on_ce parameter.
 */
#ifdef CONFIG_EDAC_CORTEX_ARM64_PANIC_ON_CE
static int panic_on_ce = 1;
#else
static int panic_on_ce;
#endif
module_param(panic_on_ce, int, 0);

/* Prefix used by edac_printk() for all messages from this driver. */
#define EDAC_CPU "arm64"
/* Severity of a detected cache error. */
enum error_type {
	SBE,	/* single-bit (correctable) error */
	DBE,	/* double-bit (uncorrectable) error */
};

/*
 * Printable names indexed by enum error_type.
 * Fix: made static (the table is file-local; a non-static definition
 * pollutes the kernel global namespace) and fully const.
 */
static const char * const err_name[] = {
	"Single-bit",
	"Double-bit",
};
/* Per-driver-instance state. */
struct erp_drvdata {
	struct edac_device_ctl_info *edev_ctl;	/* EDAC device errors are reported into */
	void __iomem *cci_base;			/* mapped CCI error regs; may be NULL */
	struct notifier_block nb_pm;		/* CPU PM (idle-exit) notifier */
	struct notifier_block nb_cpu;		/* CPU hotplug notifier */
	struct notifier_block nb_panic;		/* panic-time error scan */
	struct work_struct work;
	struct perf_event *memerr_counters[NR_CPUS];	/* per-CPU SBE overflow counters */
};

/* Set once probe succeeds; consumed by the panic-notifier scan path. */
static struct erp_drvdata *panic_handler_drvdata;

/* Bundle handed to the per-CPU handlers: driver state + severity seen. */
struct erp_local_data {
	struct erp_drvdata *drv;
	enum error_type err;
};
/* Raw PMU event number used to count memory errors on these cores. */
#define MEM_ERROR_EVENT 0x1A

/* Table entry tying a log message to the EDAC CE/UE reporting helper. */
struct errors_edac {
	const char * const msg;
	void (*func)(struct edac_device_ctl_info *edac_dev,
			int inst_nr, int block_nr, const char *msg);
};

/* Indexed by the A53/A57 CE/UE constants defined above. */
static const struct errors_edac errors[] = {
	{"A53 L1 Correctable Error", edac_device_handle_ce },
	{"A53 L1 Uncorrectable Error", edac_device_handle_ue },
	{"A53 L2 Correctable Error", edac_device_handle_ce },
	{"A53 L2 Uncorrectable Error", edac_device_handle_ue },
	{"A57 L1 Correctable Error", edac_device_handle_ce },
	{"A57 L1 Uncorrectable Error", edac_device_handle_ue },
	{"A57 L2 Correctable Error", edac_device_handle_ce },
	{"A57 L2 Uncorrectable Error", edac_device_handle_ue },
	{"L2 External Error", edac_device_handle_ue },
	{"CCI Error", edac_device_handle_ue },
};
/*
 * Accessors for the implementation-defined error syndrome registers.
 * These have no architectural mnemonic, so the raw encodings are used:
 * s3_1_c15_c2_3 = L2MERRSR_EL1, s3_1_c15_c2_2 = CPUMERRSR_EL1,
 * s3_1_c11_c0_3 = L2ECTLR_EL1 (assumed from the A53/A57 TRMs -- TODO
 * confirm against the TRM register summaries).
 */
#define read_l2merrsr_el1 ({ \
	u64 __val; \
	asm("mrs %0, s3_1_c15_c2_3" : "=r" (__val)); \
	__val; \
})

#define read_l2ectlr_el1 ({ \
	u32 __val; \
	asm("mrs %0, s3_1_c11_c0_3" : "=r" (__val)); \
	__val; \
})

#define read_cpumerrsr_el1 ({ \
	u64 __val; \
	asm("mrs %0, s3_1_c15_c2_2" : "=r" (__val)); \
	__val; \
})

#define read_esr_el1 ({ \
	u64 __val; \
	asm("mrs %0, esr_el1" : "=r" (__val)); \
	__val; \
})

#define write_l2merrsr_el1(val) ({ \
	asm("msr s3_1_c15_c2_3, %0" : : "r" (val)); \
})

#define write_l2ectlr_el1(val) ({ \
	asm("msr s3_1_c11_c0_3, %0" : : "r" (val)); \
})

#define write_cpumerrsr_el1(val) ({ \
	asm("msr s3_1_c15_c2_2, %0" : : "r" (val)); \
})
/*
 * Dump the raw error-state registers of the current CPU.  The values
 * are first pushed into the uncached RTB log (so they survive a reset)
 * and then printed at KERN_CRIT.  Must run on the affected CPU.
 */
static void ca53_ca57_print_error_state_regs(void)
{
	u64 l2merrsr;
	u64 cpumerrsr;
	u32 esr_el1;
	u32 l2ectlr;

	cpumerrsr = read_cpumerrsr_el1;
	l2merrsr = read_l2merrsr_el1;
	esr_el1 = read_esr_el1;
	l2ectlr = read_l2ectlr_el1;

	/* store data in uncached rtb logs */
	uncached_logk_pc(LOGK_READL, __builtin_return_address(0),
			(void *)cpumerrsr);
	uncached_logk_pc(LOGK_READL, __builtin_return_address(0),
			(void *)l2merrsr);
	uncached_logk_pc(LOGK_READL, __builtin_return_address(0),
			(void *)((u64)esr_el1));
	uncached_logk_pc(LOGK_READL, __builtin_return_address(0),
			(void *)((u64)l2ectlr));

	edac_printk(KERN_CRIT, EDAC_CPU, "CPUMERRSR value = %#llx\n",
			cpumerrsr);
	edac_printk(KERN_CRIT, EDAC_CPU, "L2MERRSR value = %#llx\n", l2merrsr);
	edac_printk(KERN_CRIT, EDAC_CPU, "ESR value = %#x\n", esr_el1);
	edac_printk(KERN_CRIT, EDAC_CPU, "L2ECTLR value = %#x\n", l2ectlr);

	/* SError syndrome pattern indicating a dirty-line double-bit error */
	if (ESR_L2_DBE(esr_el1))
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Double bit error on dirty L2 cacheline\n");
}
/*
 * Decode CPUMERRSR_EL1 on a Cortex-A53 and report any latched L1/TLB
 * RAM error through EDAC.  Must run on the affected CPU.  Promotes
 * ed->err to DBE when the register flags the error as fatal, and
 * clears the register on exit so the next error can be latched.
 */
static void ca53_parse_cpumerrsr(struct erp_local_data *ed)
{
	u64 cpumerrsr;
	int cpuid;

	cpumerrsr = read_cpumerrsr_el1;

	/* Nothing latched since the last clear */
	if (!A53_CPUMERRSR_VALID(cpumerrsr))
		return;

	if (A53_CPUMERRSR_FATAL(cpumerrsr))
		ed->err = DBE;

	edac_printk(KERN_CRIT, EDAC_CPU, "Cortex A53 CPU%d L1 %s Error detected\n",
			smp_processor_id(), err_name[ed->err]);
	ca53_ca57_print_error_state_regs();

	if (ed->err == DBE)
		edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n");

	/* RAMID selects the RAM; CPUID encodes way/bank (and cpu for data RAMs) */
	cpuid = A53_CPUMERRSR_CPUID(cpumerrsr);
	switch (A53_CPUMERRSR_RAMID(cpumerrsr)) {
	case 0x0:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Instruction tag RAM way is %d\n", cpuid);
		break;
	case 0x1:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Instruction data RAM bank is %d\n", cpuid);
		break;
	case 0x8:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Data tag RAM cpu %d way is %d\n",
			cpuid / 4, cpuid % 4);
		break;
	case 0x9:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Data data RAM cpu %d way is %d\n",
			cpuid / 4, cpuid % 4);
		break;
	case 0xA:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Data dirty RAM cpu %d way is %d\n",
			cpuid / 4, cpuid % 4);
		break;
	case 0x18:
		edac_printk(KERN_CRIT, EDAC_CPU, "TLB RAM way is %d\n", cpuid);
		break;
	default:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Error in unknown RAM ID: %d\n",
			(int) A53_CPUMERRSR_RAMID(cpumerrsr));
		break;
	}

	edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n",
			(int) A53_CPUMERRSR_REPT(cpumerrsr));
	edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n",
			(int) A53_CPUMERRSR_OTHER(cpumerrsr));

	/* Route to the EDAC CE or UE path according to final severity */
	if (ed->err == SBE)
		errors[A53_L1_CE].func(ed->drv->edev_ctl, smp_processor_id(),
					L1_CACHE, errors[A53_L1_CE].msg);
	else if (ed->err == DBE)
		errors[A53_L1_UE].func(ed->drv->edev_ctl, smp_processor_id(),
					L1_CACHE, errors[A53_L1_UE].msg);

	/* Write-to-clear so the next error can be captured */
	write_cpumerrsr_el1(0);
}
/*
 * Decode L2MERRSR_EL1 on a Cortex-A53 and report any latched L2/SCU
 * RAM error through EDAC.  Promotes ed->err to DBE when the register
 * flags the error as fatal, and clears the register on exit.
 *
 * Fix: the original also read L2ECTLR_EL1 into a local that was never
 * used (set-but-unused warning); that read is dropped.  L2ECTLR
 * acknowledgement is handled in arm64_erp_local_handler().
 */
static void ca53_parse_l2merrsr(struct erp_local_data *ed)
{
	u64 l2merrsr;
	int cpuid;

	l2merrsr = read_l2merrsr_el1;

	/* Nothing latched since the last clear */
	if (!A53_L2MERRSR_VALID(l2merrsr))
		return;

	if (A53_L2MERRSR_FATAL(l2merrsr))
		ed->err = DBE;

	edac_printk(KERN_CRIT, EDAC_CPU, "CortexA53 L2 %s Error detected\n",
			err_name[ed->err]);
	ca53_ca57_print_error_state_regs();

	if (ed->err == DBE)
		edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n");

	/* RAMID selects the RAM; CPUID encodes way/bank information */
	cpuid = A53_L2MERRSR_CPUID(l2merrsr);
	switch (A53_L2MERRSR_RAMID(l2merrsr)) {
	case 0x10:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 tag RAM way is %d\n", cpuid);
		break;
	case 0x11:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 data RAM bank is %d\n", cpuid);
		break;
	case 0x12:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"SCU snoop filter RAM cpu %d way is %d\n",
			cpuid / 4, cpuid % 4);
		break;
	default:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Error in unknown RAM ID: %d\n",
			(int) A53_L2MERRSR_RAMID(l2merrsr));
		break;
	}

	edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n",
			(int) A53_L2MERRSR_REPT(l2merrsr));
	edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n",
			(int) A53_L2MERRSR_OTHER(l2merrsr));

	/* Route to the EDAC CE or UE path according to final severity */
	if (ed->err == SBE)
		errors[A53_L2_CE].func(ed->drv->edev_ctl, smp_processor_id(),
					L2_CACHE, errors[A53_L2_CE].msg);
	else if (ed->err == DBE)
		errors[A53_L2_UE].func(ed->drv->edev_ctl, smp_processor_id(),
					L2_CACHE, errors[A53_L2_UE].msg);

	/* Write-to-clear so the next error can be captured */
	write_l2merrsr_el1(0);
}
/*
 * Decode CPUMERRSR_EL1 on a Cortex-A57 and report any latched L1/TLB
 * RAM error through EDAC.  Must run on the affected CPU.  Promotes
 * ed->err to DBE when the register flags the error as fatal, and
 * clears the register on exit.
 */
static void ca57_parse_cpumerrsr(struct erp_local_data *ed)
{
	u64 cpumerrsr;
	int bank;

	cpumerrsr = read_cpumerrsr_el1;

	/* Nothing latched since the last clear */
	if (!A57_CPUMERRSR_VALID(cpumerrsr))
		return;

	if (A57_CPUMERRSR_FATAL(cpumerrsr))
		ed->err = DBE;

	edac_printk(KERN_CRIT, EDAC_CPU, "Cortex A57 CPU%d L1 %s Error detected\n",
			smp_processor_id(), err_name[ed->err]);
	ca53_ca57_print_error_state_regs();

	if (ed->err == DBE)
		edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n");

	/* On A57 the syndrome reports a RAM bank rather than a way */
	bank = A57_CPUMERRSR_BANK(cpumerrsr);
	switch (A57_CPUMERRSR_RAMID(cpumerrsr)) {
	case 0x0:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Instruction tag RAM bank %d\n", bank);
		break;
	case 0x1:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Instruction data RAM bank %d\n", bank);
		break;
	case 0x8:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Data tag RAM bank %d\n", bank);
		break;
	case 0x9:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L1 Data data RAM bank %d\n", bank);
		break;
	case 0x18:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"TLB RAM bank %d\n", bank);
		break;
	default:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Error in unknown RAM ID: %d\n",
			(int) A57_CPUMERRSR_RAMID(cpumerrsr));
		break;
	}

	edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n",
			(int) A57_CPUMERRSR_REPT(cpumerrsr));
	edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n",
			(int) A57_CPUMERRSR_OTHER(cpumerrsr));

	/* Route to the EDAC CE or UE path according to final severity */
	if (ed->err == SBE)
		errors[A57_L1_CE].func(ed->drv->edev_ctl, smp_processor_id(),
					L1_CACHE, errors[A57_L1_CE].msg);
	else if (ed->err == DBE)
		errors[A57_L1_UE].func(ed->drv->edev_ctl, smp_processor_id(),
					L1_CACHE, errors[A57_L1_UE].msg);

	/* Write-to-clear so the next error can be captured */
	write_cpumerrsr_el1(0);
}
/*
 * Decode L2MERRSR_EL1 on a Cortex-A57 and report any latched L2/SCU
 * RAM error through EDAC.  Promotes ed->err to DBE when the register
 * flags the error as fatal, and clears the register on exit.
 *
 * Fix: the original also read L2ECTLR_EL1 into a local that was never
 * used (set-but-unused warning); that read is dropped.  L2ECTLR
 * acknowledgement is handled in arm64_erp_local_handler().
 */
static void ca57_parse_l2merrsr(struct erp_local_data *ed)
{
	u64 l2merrsr;
	int cpuid;

	l2merrsr = read_l2merrsr_el1;

	/* Nothing latched since the last clear */
	if (!A57_L2MERRSR_VALID(l2merrsr))
		return;

	if (A57_L2MERRSR_FATAL(l2merrsr))
		ed->err = DBE;

	edac_printk(KERN_CRIT, EDAC_CPU, "CortexA57 L2 %s Error detected\n",
			err_name[ed->err]);
	ca53_ca57_print_error_state_regs();

	if (ed->err == DBE)
		edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n");

	/* RAMID selects the RAM; CPUID encodes cpu/way/bank information */
	cpuid = A57_L2MERRSR_CPUID(l2merrsr);
	switch (A57_L2MERRSR_RAMID(l2merrsr)) {
	case 0x10:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 tag RAM cpu %d way is %d\n",
			cpuid / 2, cpuid % 2);
		break;
	case 0x11:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 data RAM cpu %d bank is %d\n",
			cpuid / 2, cpuid % 2);
		break;
	case 0x12:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"SCU snoop tag RAM bank is %d\n", cpuid);
		break;
	case 0x14:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 dirty RAM cpu %d bank is %d\n",
			cpuid / 2, cpuid % 2);
		break;
	case 0x18:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 inclusion PF RAM bank is %d\n", cpuid);
		break;
	default:
		edac_printk(KERN_CRIT, EDAC_CPU,
			"Error in unknown RAM ID: %d\n",
			(int) A57_L2MERRSR_RAMID(l2merrsr));
		break;
	}

	edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n",
			(int) A57_L2MERRSR_REPT(l2merrsr));
	edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n",
			(int) A57_L2MERRSR_OTHER(l2merrsr));

	/* Route to the EDAC CE or UE path according to final severity */
	if (ed->err == SBE) {
		errors[A57_L2_CE].func(ed->drv->edev_ctl, smp_processor_id(),
					L2_CACHE, errors[A57_L2_CE].msg);
	} else if (ed->err == DBE) {
		errors[A57_L2_UE].func(ed->drv->edev_ctl, smp_processor_id(),
					L2_CACHE, errors[A57_L2_UE].msg);
	}

	/* Write-to-clear so the next error can be captured */
	write_l2merrsr_el1(0);
}
/* Serialises the per-CPU parse/report paths so output is not interleaved. */
static DEFINE_SPINLOCK(local_handler_lock);
/* Protects read-modify-write of the L2ECTLR error-acknowledge bits. */
static DEFINE_SPINLOCK(l2ectlr_lock);
/*
 * Per-CPU error scan, run via on_each_cpu() from the DBE interrupt or
 * directly from the SBE overflow handler.  Parses the syndrome
 * registers appropriate to the local core type, then acknowledges any
 * internal error latched in L2ECTLR.
 *
 * Fix: removed the stray semicolon after the switch's closing brace
 * (a null statement; checkpatch error) and corrected the
 * "Acklowledge" comment typo.
 */
static void arm64_erp_local_handler(void *info)
{
	struct erp_local_data *errdata = info;
	unsigned int cpuid = read_cpuid_id();
	unsigned int partnum = read_cpuid_part_number();
	unsigned long flags, flags2;
	u32 l2ectlr;

	spin_lock_irqsave(&local_handler_lock, flags);
	edac_printk(KERN_CRIT, EDAC_CPU, "%s error information from CPU %d, MIDR=%#08x:\n",
		err_name[errdata->err], raw_smp_processor_id(), cpuid);

	switch (partnum) {
	case ARM_CPU_PART_CORTEX_A53:
		ca53_parse_cpumerrsr(errdata);
		ca53_parse_l2merrsr(errdata);
		break;
	case ARM_CPU_PART_CORTEX_A72:
	case ARM_CPU_PART_CORTEX_A57:
		ca57_parse_cpumerrsr(errdata);
		ca57_parse_l2merrsr(errdata);
		break;
	default:
		edac_printk(KERN_CRIT, EDAC_CPU, "Unknown CPU Part Number in MIDR: %#04x (%#08x)\n",
			partnum, cpuid);
	}

	/* Acknowledge internal error in L2ECTLR */
	spin_lock_irqsave(&l2ectlr_lock, flags2);
	l2ectlr = read_l2ectlr_el1;
	if (l2ectlr & L2ECTLR_INT_ERR) {
		l2ectlr &= ~L2ECTLR_INT_ERR;
		write_l2ectlr_el1(l2ectlr);
	}
	spin_unlock_irqrestore(&l2ectlr_lock, flags2);

	spin_unlock_irqrestore(&local_handler_lock, flags);
}
/*
 * Threaded IRQ handler for the double-bit-error interrupt.  Fans the
 * scan out to every online CPU so each core checks its own syndrome
 * registers.
 */
static irqreturn_t arm64_dbe_handler(int irq, void *drvdata)
{
	struct erp_local_data errdata;

	errdata.drv = drvdata;
	errdata.err = DBE;
	edac_printk(KERN_CRIT, EDAC_CPU, "ARM64 CPU ERP: Double-bit error interrupt received!\n");
	on_each_cpu(arm64_erp_local_handler, &errdata, 1);
	return IRQ_HANDLED;
}
/*
 * Per-CPU portion of external-error handling: if this core's L2ECTLR
 * has the external-error bit latched, report an uncorrectable error
 * and acknowledge the bit.
 */
static void arm64_ext_local_handler(void *info)
{
	struct erp_drvdata *drv = info;
	unsigned long flags, flags2;
	u32 l2ectlr;

	spin_lock_irqsave(&local_handler_lock, flags);

	/* TODO: Shared locking for L2ECTLR access */
	spin_lock_irqsave(&l2ectlr_lock, flags2);
	l2ectlr = read_l2ectlr_el1;
	if (l2ectlr & L2ECTLR_EXT_ERR) {
		edac_printk(KERN_CRIT, EDAC_CPU,
			"L2 external error detected by CPU%d\n",
			smp_processor_id());
		errors[L2_EXT_UE].func(drv->edev_ctl, smp_processor_id(),
			L2_CACHE, errors[L2_EXT_UE].msg);
		l2ectlr &= ~L2ECTLR_EXT_ERR;
		write_l2ectlr_el1(l2ectlr);
	}
	spin_unlock_irqrestore(&l2ectlr_lock, flags2);

	spin_unlock_irqrestore(&local_handler_lock, flags);
}
/*
 * Threaded IRQ handler for the external-error interrupt; fans the
 * L2ECTLR check out to every online CPU.
 */
static irqreturn_t arm64_ext_handler(int irq, void *drvdata)
{
	edac_printk(KERN_CRIT, EDAC_CPU, "External error interrupt received!\n");
	on_each_cpu(arm64_ext_local_handler, drvdata, 1);
	return IRQ_HANDLED;
}
/*
 * Threaded IRQ handler for CCI errors.  Dumps and clears the CCI
 * imprecise-error register when the CCI block was mapped at probe time,
 * then reports an uncorrectable error either way (best effort when the
 * registers are unavailable).
 */
static irqreturn_t arm64_cci_handler(int irq, void *drvdata)
{
	struct erp_drvdata *drv = drvdata;
	u32 cci_err_reg;

	edac_printk(KERN_CRIT, EDAC_CPU, "CCI error interrupt received!\n");

	if (drv->cci_base) {
		cci_err_reg = readl_relaxed(drv->cci_base +
						CCI_IMPRECISEERROR_REG);
		edac_printk(KERN_CRIT, EDAC_CPU, "CCI imprecise error register: %#08x.\n",
			cci_err_reg);
		/* This register has write-clear semantics */
		writel_relaxed(cci_err_reg, drv->cci_base +
						CCI_IMPRECISEERROR_REG);
		/* Ensure error bits cleared before exiting ISR */
		mb();
	} else {
		edac_printk(KERN_CRIT, EDAC_CPU, "CCI registers not available.\n");
	}

	errors[CCI_UE].func(drv->edev_ctl, 0, CCI, errors[CCI_UE].msg);
	return IRQ_HANDLED;
}
/*
 * perf overflow callback for the per-CPU memory-error counter: a
 * single-bit (correctable) error occurred on this core.  Runs the
 * syndrome scan directly on the local CPU (already the affected one).
 */
static void arm64_sbe_handler(struct perf_event *event,
			struct perf_sample_data *data,
			struct pt_regs *regs)
{
	struct erp_local_data errdata;
	int cpu = raw_smp_processor_id();

	errdata.drv = event->overflow_handler_context;
	errdata.err = SBE;
	edac_printk(KERN_CRIT, EDAC_CPU, "ARM64 CPU ERP: Single-bit error interrupt received on CPU %d!\n",
			cpu);
	/* Surface a backtrace when CE panics are expected but disabled */
	WARN_ON(!panic_on_ce);
	arm64_erp_local_handler(&errdata);
}
/*
 * Look up a named IRQ resource on the platform device and install a
 * threaded handler for it.  Failures are logged but the caller treats
 * them as non-fatal (it only counts them).
 */
static int request_erp_irq(struct platform_device *pdev, const char *propname,
			const char *desc, irq_handler_t handler,
			void *ed)
{
	struct resource *res;
	int ret;

	res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, propname);
	if (res == NULL) {
		pr_err("ARM64 CPU ERP: Could not find <%s> IRQ property. Proceeding anyway.\n",
			propname);
		return -EINVAL;
	}

	ret = devm_request_threaded_irq(&pdev->dev, res->start, NULL, handler,
					IRQF_ONESHOT | IRQF_TRIGGER_RISING,
					desc, ed);
	if (ret != 0) {
		pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n",
			(int) res->start, ret, propname, desc);
		return -EINVAL;
	}

	return 0;
}
/*
 * Scan the local core's syndrome registers for a latched single-bit
 * error.  Used from the PM-exit notifier and the panic/poll paths.
 *
 * Fix: removed the stray semicolon after the switch's closing brace
 * (a null statement; checkpatch error).
 */
static void check_sbe_event(struct erp_drvdata *drv)
{
	unsigned int partnum = read_cpuid_part_number();
	struct erp_local_data errdata;
	unsigned long flags;

	errdata.drv = drv;
	errdata.err = SBE;

	spin_lock_irqsave(&local_handler_lock, flags);
	switch (partnum) {
	case ARM_CPU_PART_CORTEX_A53:
		ca53_parse_cpumerrsr(&errdata);
		ca53_parse_l2merrsr(&errdata);
		break;
	case ARM_CPU_PART_CORTEX_A72:
	case ARM_CPU_PART_CORTEX_A57:
		ca57_parse_cpumerrsr(&errdata);
		ca57_parse_l2merrsr(&errdata);
		break;
	}
	spin_unlock_irqrestore(&local_handler_lock, flags);
}
/*
 * Create (or skip, in IRQ-only builds) a pinned per-CPU perf counter on
 * the MEM_ERROR_EVENT PMU event; its overflow handler is the SBE path.
 */
#ifdef CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY
static void create_sbe_counter(int cpu, void *info)
{ }
#else
static void create_sbe_counter(int cpu, void *info)
{
	struct erp_drvdata *drv = info;
	struct perf_event *event = drv->memerr_counters[cpu];
	struct perf_event_attr attr = {
		.pinned = 1,
		.disabled = 0, /* 0 will enable the counter upon creation */
		.sample_period = 1, /* 1 will set the counter to max int */
		.type = PERF_TYPE_RAW,
		.config = MEM_ERROR_EVENT,
		.size = sizeof(struct perf_event_attr),
	};

	/* Counter already exists (e.g. CPU went offline and back online) */
	if (event)
		return;

	/* Fails if cpu is not online */
	event = perf_event_create_kernel_counter(&attr, cpu, NULL,
						arm64_sbe_handler,
						drv);
	if (IS_ERR(event)) {
		pr_err("PERF Event creation failed on cpu %d ptr_err %ld\n",
			cpu, PTR_ERR(event));
		return;
	}

	drv->memerr_counters[cpu] = event;
}
#endif
/*
 * CPU PM notifier: when a core comes back from a low-power state,
 * re-scan its syndrome registers for errors latched while counters
 * were unavailable.
 */
static int arm64_pmu_cpu_pm_notify(struct notifier_block *self,
				unsigned long action, void *v)
{
	struct erp_drvdata *drv = container_of(self, struct erp_drvdata, nb_pm);

	if (action == CPU_PM_EXIT)
		check_sbe_event(drv);

	return NOTIFY_OK;
}
/*
 * CPU hotplug notifier: (re)create the per-CPU SBE perf counter when a
 * CPU comes online.
 *
 * Fix: removed the stray semicolon after the switch's closing brace
 * (a null statement; checkpatch error).
 */
static int arm64_edac_pmu_cpu_notify(struct notifier_block *self,
				unsigned long action, void *hcpu)
{
	struct erp_drvdata *drv = container_of(self, struct erp_drvdata,
						nb_cpu);
	unsigned long cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		create_sbe_counter(cpu, drv);
		break;
	}
	return NOTIFY_OK;
}
/*
 * Panic/poll-time entry point: scan the local core's ECC state if the
 * driver has finished probing.  Compiled out in IRQ-only builds.
 */
#ifndef CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY
void arm64_check_cache_ecc(void *info)
{
	if (panic_handler_drvdata)
		check_sbe_event(panic_handler_drvdata);
}
#else
static inline void arm64_check_cache_ecc(void *info) {}
#endif
/* Panic notifier: capture any pending cache ECC state before the dump. */
static int arm64_erp_panic_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	arm64_check_cache_ecc(NULL);

	return NOTIFY_OK;
}
/*
 * EDAC polling callback: trigger an ECC scan on one CPU per cluster
 * (clusters are identified by distinct core-sibling masks), fire and
 * forget (wait == 0).
 *
 * NOTE(review): struct cpumask on the stack can be large with big
 * NR_CPUS configs -- confirm this is acceptable for this target.
 */
static void arm64_monitor_cache_errors(struct edac_device_ctl_info *edev)
{
	struct cpumask cluster_mask, old_mask;
	int cpu;

	cpumask_clear(&cluster_mask);
	cpumask_clear(&old_mask);

	for_each_possible_cpu(cpu) {
		cpumask_copy(&cluster_mask, topology_core_cpumask(cpu));
		/* Same sibling mask as last time => same cluster, skip */
		if (cpumask_equal(&cluster_mask, &old_mask))
			continue;
		cpumask_copy(&old_mask, &cluster_mask);
		smp_call_function_any(&cluster_mask,
				arm64_check_cache_ecc, NULL, 0);
	}
}
static int arm64_cpu_erp_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct erp_drvdata *drv;
struct resource *r;
int cpu;
u32 poll_msec;
int rc, fail = 0;
drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
if (!drv)
return -ENOMEM;
drv->edev_ctl = edac_device_alloc_ctl_info(0, "cpu",
num_possible_cpus(), "L", 3, 1, NULL, 0,
edac_device_alloc_index());
if (!drv->edev_ctl)
return -ENOMEM;
rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-ms",
&poll_msec);
if (!rc && !IS_ENABLED(CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY)) {
drv->edev_ctl->edac_check = arm64_monitor_cache_errors;
drv->edev_ctl->poll_msec = poll_msec;
drv->edev_ctl->defer_work = 1;
}
drv->edev_ctl->dev = dev;
drv->edev_ctl->mod_name = dev_name(dev);
drv->edev_ctl->dev_name = dev_name(dev);
drv->edev_ctl->ctl_name = "cache";
drv->edev_ctl->panic_on_ce = panic_on_ce;
drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE;
rc = edac_device_add_device(drv->edev_ctl);
if (rc)
goto out_mem;
r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cci");
if (r)
drv->cci_base = devm_ioremap_resource(dev, r);
if (request_erp_irq(pdev, "pri-dbe-irq", "ARM64 primary DBE IRQ",
arm64_dbe_handler, drv))
fail++;
if (request_erp_irq(pdev, "sec-dbe-irq", "ARM64 secondary DBE IRQ",
arm64_dbe_handler, drv))
fail++;
if (request_erp_irq(pdev, "pri-ext-irq", "ARM64 primary ext IRQ",
arm64_ext_handler, drv))
fail++;
if (request_erp_irq(pdev, "sec-ext-irq", "ARM64 secondary ext IRQ",
arm64_ext_handler, drv))
fail++;
/*
* We still try to register a handler for CCI errors even if we don't
* have access to cci_base, but error reporting becomes best-effort in
* that case.
*/
if (request_erp_irq(pdev, "cci-irq", "CCI error IRQ",
arm64_cci_handler, drv))
fail++;
if (IS_ENABLED(CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY)) {
pr_err("ARM64 CPU ERP: SBE detection is disabled.\n");
goto out_irq;
}
drv->nb_pm.notifier_call = arm64_pmu_cpu_pm_notify;
cpu_pm_register_notifier(&(drv->nb_pm));
drv->nb_panic.notifier_call = arm64_erp_panic_notify;
atomic_notifier_chain_register(&panic_notifier_list,
&drv->nb_panic);
drv->nb_cpu.notifier_call = arm64_edac_pmu_cpu_notify;
register_cpu_notifier(&drv->nb_cpu);
get_online_cpus();
for_each_online_cpu(cpu)
create_sbe_counter(cpu, drv);
put_online_cpus();
out_irq:
if (fail == of_irq_count(dev->of_node)) {
pr_err("ARM64 CPU ERP: Could not request any IRQs. Giving up.\n");
rc = -ENODEV;
goto out_dev;
}
panic_handler_drvdata = drv;
return 0;
out_dev:
edac_device_del_device(dev);
out_mem:
edac_device_free_ctl_info(drv->edev_ctl);
return rc;
}
/* Device-tree match: binds to "arm,arm64-cpu-erp" nodes. */
static const struct of_device_id arm64_cpu_erp_match_table[] = {
	{ .compatible = "arm,arm64-cpu-erp" },
	{ }
};
/* Platform driver glue for the CPU cache error-reporting device. */
static struct platform_driver arm64_cpu_erp_driver = {
	.probe = arm64_cpu_erp_probe,
	.driver = {
		.name = "arm64_cpu_cache_erp",
		.owner = THIS_MODULE,
		.of_match_table = of_match_ptr(arm64_cpu_erp_match_table),
	},
};
/* Register the driver late in boot (after dependencies have probed). */
static int __init arm64_cpu_erp_init(void)
{
	return platform_driver_register(&arm64_cpu_erp_driver);
}
device_initcall_sync(arm64_cpu_erp_init);