drm/msm: Add support for the QTI GPU snapshot format

When a fault happens on the Adreno GPU we want to collect
a considerable amount of information to diagnose the problem
including registers, caches, and GPU memory structures (ringbuffers,
etc).

The snapshot collects all of this information following a GPU fault
and encodes it into a binary file format that can be pulled from
debugfs or extracted from a memory dump.

This may seem like a duplication of other debug methods (the ->show
functions for example) and while that is true for small numbers
of registers, the snapshot goes much further - it collects hundreds
(thousands) of registers in addition to memory and other structures
that would be impractical to dump as ASCII. The binary format allows
the snapshot to be easily shared and post-processed in different
ways to extract patterns.

Add the basic snapshot infrastructure and enable ringbuffer, register
and shader bank collection for A5XX targets.

Change-Id: Ic0dedbadcf0513096d05870f522ac73da74ceb31
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
This commit is contained in:
Jordan Crouse 2017-02-21 14:50:45 -07:00
parent 869486c969
commit 20e281de48
14 changed files with 1371 additions and 18 deletions

View file

@ -55,7 +55,8 @@ msm_drm-y += adreno/adreno_device.o \
adreno/a4xx_gpu.o \
adreno/a5xx_gpu.o \
adreno/a5xx_power.o \
adreno/a5xx_preempt.o
adreno/a5xx_preempt.o \
adreno/a5xx_snapshot.o
endif
msm_drm-$(CONFIG_DRM_MSM_MDP4) += mdp/mdp4/mdp4_crtc.o \
@ -131,6 +132,7 @@ msm_drm-$(CONFIG_DRM_MSM) += \
msm_perf.o \
msm_rd.o \
msm_ringbuffer.o \
msm_prop.o
msm_prop.o \
msm_snapshot.o
obj-$(CONFIG_DRM_MSM) += msm_drm.o

View file

@ -155,6 +155,114 @@ enum a5xx_depth_format {
DEPTH5_32 = 4,
};
enum a5xx_debugbus {
A5XX_RBBM_DBGBUS_CP = 1,
A5XX_RBBM_DBGBUS_RBBM = 2,
A5XX_RBBM_DBGBUS_VBIF = 3,
A5XX_RBBM_DBGBUS_HLSQ = 4,
A5XX_RBBM_DBGBUS_UCHE = 5,
A5XX_RBBM_DBGBUS_DPM = 6,
A5XX_RBBM_DBGBUS_TESS = 7,
A5XX_RBBM_DBGBUS_PC = 8,
A5XX_RBBM_DBGBUS_VFDP = 9,
A5XX_RBBM_DBGBUS_VPC = 10,
A5XX_RBBM_DBGBUS_TSE = 11,
A5XX_RBBM_DBGBUS_RAS = 12,
A5XX_RBBM_DBGBUS_VSC = 13,
A5XX_RBBM_DBGBUS_COM = 14,
A5XX_RBBM_DBGBUS_DCOM = 15,
A5XX_RBBM_DBGBUS_LRZ = 16,
A5XX_RBBM_DBGBUS_A2D_DSP = 17,
A5XX_RBBM_DBGBUS_CCUFCHE = 18,
A5XX_RBBM_DBGBUS_GPMU = 19,
A5XX_RBBM_DBGBUS_RBP = 20,
A5XX_RBBM_DBGBUS_HM = 21,
A5XX_RBBM_DBGBUS_RBBM_CFG = 22,
A5XX_RBBM_DBGBUS_VBIF_CX = 23,
A5XX_RBBM_DBGBUS_GPC = 29,
A5XX_RBBM_DBGBUS_LARC = 30,
A5XX_RBBM_DBGBUS_HLSQ_SPTP = 31,
A5XX_RBBM_DBGBUS_RB_0 = 32,
A5XX_RBBM_DBGBUS_RB_1 = 33,
A5XX_RBBM_DBGBUS_RB_2 = 34,
A5XX_RBBM_DBGBUS_RB_3 = 35,
A5XX_RBBM_DBGBUS_CCU_0 = 40,
A5XX_RBBM_DBGBUS_CCU_1 = 41,
A5XX_RBBM_DBGBUS_CCU_2 = 42,
A5XX_RBBM_DBGBUS_CCU_3 = 43,
A5XX_RBBM_DBGBUS_A2D_RAS_0 = 48,
A5XX_RBBM_DBGBUS_A2D_RAS_1 = 49,
A5XX_RBBM_DBGBUS_A2D_RAS_2 = 50,
A5XX_RBBM_DBGBUS_A2D_RAS_3 = 51,
A5XX_RBBM_DBGBUS_VFD_0 = 56,
A5XX_RBBM_DBGBUS_VFD_1 = 57,
A5XX_RBBM_DBGBUS_VFD_2 = 58,
A5XX_RBBM_DBGBUS_VFD_3 = 59,
A5XX_RBBM_DBGBUS_SP_0 = 64,
A5XX_RBBM_DBGBUS_SP_1 = 65,
A5XX_RBBM_DBGBUS_SP_2 = 66,
A5XX_RBBM_DBGBUS_SP_3 = 67,
A5XX_RBBM_DBGBUS_TPL1_0 = 72,
A5XX_RBBM_DBGBUS_TPL1_1 = 73,
A5XX_RBBM_DBGBUS_TPL1_2 = 74,
A5XX_RBBM_DBGBUS_TPL1_3 = 75,
};
enum a5xx_shader_blocks {
A5XX_TP_W_MEMOBJ = 1,
A5XX_TP_W_SAMPLER = 2,
A5XX_TP_W_MIPMAP_BASE = 3,
A5XX_TP_W_MEMOBJ_TAG = 4,
A5XX_TP_W_SAMPLER_TAG = 5,
A5XX_TP_S_3D_MEMOBJ = 6,
A5XX_TP_S_3D_SAMPLER = 7,
A5XX_TP_S_3D_MEMOBJ_TAG = 8,
A5XX_TP_S_3D_SAMPLER_TAG = 9,
A5XX_TP_S_CS_MEMOBJ = 10,
A5XX_TP_S_CS_SAMPLER = 11,
A5XX_TP_S_CS_MEMOBJ_TAG = 12,
A5XX_TP_S_CS_SAMPLER_TAG = 13,
A5XX_SP_W_INSTR = 14,
A5XX_SP_W_CONST = 15,
A5XX_SP_W_UAV_SIZE = 16,
A5XX_SP_W_CB_SIZE = 17,
A5XX_SP_W_UAV_BASE = 18,
A5XX_SP_W_CB_BASE = 19,
A5XX_SP_W_INST_TAG = 20,
A5XX_SP_W_STATE = 21,
A5XX_SP_S_3D_INSTR = 22,
A5XX_SP_S_3D_CONST = 23,
A5XX_SP_S_3D_CB_BASE = 24,
A5XX_SP_S_3D_CB_SIZE = 25,
A5XX_SP_S_3D_UAV_BASE = 26,
A5XX_SP_S_3D_UAV_SIZE = 27,
A5XX_SP_S_CS_INSTR = 28,
A5XX_SP_S_CS_CONST = 29,
A5XX_SP_S_CS_CB_BASE = 30,
A5XX_SP_S_CS_CB_SIZE = 31,
A5XX_SP_S_CS_UAV_BASE = 32,
A5XX_SP_S_CS_UAV_SIZE = 33,
A5XX_SP_S_3D_INSTR_DIRTY = 34,
A5XX_SP_S_3D_CONST_DIRTY = 35,
A5XX_SP_S_3D_CB_BASE_DIRTY = 36,
A5XX_SP_S_3D_CB_SIZE_DIRTY = 37,
A5XX_SP_S_3D_UAV_BASE_DIRTY = 38,
A5XX_SP_S_3D_UAV_SIZE_DIRTY = 39,
A5XX_SP_S_CS_INSTR_DIRTY = 40,
A5XX_SP_S_CS_CONST_DIRTY = 41,
A5XX_SP_S_CS_CB_BASE_DIRTY = 42,
A5XX_SP_S_CS_CB_SIZE_DIRTY = 43,
A5XX_SP_S_CS_UAV_BASE_DIRTY = 44,
A5XX_SP_S_CS_UAV_SIZE_DIRTY = 45,
A5XX_HLSQ_ICB = 46,
A5XX_HLSQ_ICB_DIRTY = 47,
A5XX_HLSQ_ICB_CB_BASE_DIRTY = 48,
A5XX_SP_POWER_RESTORE_RAM = 64,
A5XX_SP_POWER_RESTORE_RAM_TAG = 65,
A5XX_TP_POWER_RESTORE_RAM = 66,
A5XX_TP_POWER_RESTORE_RAM_TAG = 67,
};
enum a5xx_tex_filter {
A5XX_TEX_NEAREST = 0,
A5XX_TEX_LINEAR = 1,
@ -396,6 +504,18 @@ static inline uint32_t A5XX_CP_PROTECT_REG_MASK_LEN(uint32_t val)
#define REG_A5XX_CP_POWERCTR_CP_SEL_3 0x00000bbd
#define REG_A5XX_RBBM_CFG_DBGBUS_SEL_A 0x00000004
#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__MASK 0x000000ff
#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__SHIFT 0
static inline uint32_t A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(uint32_t val)
{
return ((val) << A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__SHIFT) & A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__MASK;
}
#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__MASK 0x0000ff00
#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__SHIFT 8
static inline uint32_t A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(uint32_t val)
{
return ((val) << A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__SHIFT) & A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__MASK;
}
#define REG_A5XX_RBBM_CFG_DBGBUS_SEL_B 0x00000005
@ -406,6 +526,12 @@ static inline uint32_t A5XX_CP_PROTECT_REG_MASK_LEN(uint32_t val)
#define REG_A5XX_RBBM_CFG_DBGBUS_CNTLT 0x00000008
#define REG_A5XX_RBBM_CFG_DBGBUS_CNTLM 0x00000009
#define A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__MASK 0x0f000000
#define A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__SHIFT 24
static inline uint32_t A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(uint32_t val)
{
return ((val) << A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__SHIFT) & A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__MASK;
}
#define REG_A5XX_RBBM_CFG_DEBBUS_CTLTM_ENABLE_SHIFT 0x00000018
@ -1413,6 +1539,12 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
#define REG_A5XX_HLSQ_SPTP_RDSEL 0x00000f08
#define REG_A5XX_HLSQ_DBG_READ_SEL 0x0000bc00
#define A5XX_HLSQ_DBG_READ_SEL_STATETYPE__MASK 0x0000ff00
#define A5XX_HLSQ_DBG_READ_SEL_STATETYPE__SHIFT 8
static inline uint32_t A5XX_HLSQ_DBG_READ_SEL_STATETYPE(uint32_t val)
{
return ((val) << A5XX_HLSQ_DBG_READ_SEL_STATETYPE__SHIFT) & A5XX_HLSQ_DBG_READ_SEL_STATETYPE__MASK;
}
#define REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE 0x0000a000
@ -1583,6 +1715,8 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
#define REG_A5XX_VBIF_VERSION 0x00003000
#define REG_A5XX_VBIF_CLKON 0x00003001
#define A5XX_VBIF_CLKON_FORCE_ON 0x00000001
#define A5XX_VBIF_CLKON_FORCE_ON_TESTBUS 0x00000002
#define REG_A5XX_VBIF_ABIT_SORT 0x00003028
@ -1601,14 +1735,27 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
#define REG_A5XX_VBIF_XIN_HALT_CTRL1 0x00003081
#define REG_A5XX_VBIF_TEST_BUS_OUT_CTRL 0x00003084
#define A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN 0x00000001
#define REG_A5XX_VBIF_TEST_BUS1_CTRL0 0x00003085
#define REG_A5XX_VBIF_TEST_BUS1_CTRL1 0x00003086
#define A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__MASK 0x0000000f
#define A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__SHIFT 0
static inline uint32_t A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(uint32_t val)
{
return ((val) << A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__SHIFT) & A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__MASK;
}
#define REG_A5XX_VBIF_TEST_BUS2_CTRL0 0x00003087
#define REG_A5XX_VBIF_TEST_BUS2_CTRL1 0x00003088
#define A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__MASK 0x0000001f
#define A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__SHIFT 0
static inline uint32_t A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(uint32_t val)
{
return ((val) << A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__SHIFT) & A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__MASK;
}
#define REG_A5XX_VBIF_TEST_BUS_OUT 0x0000308c

View file

@ -15,9 +15,6 @@
#include "msm_iommu.h"
#include "a5xx_gpu.h"
extern bool hang_debug;
static void a5xx_dump(struct msm_gpu *gpu);
static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@ -800,8 +797,7 @@ static void a5xx_recover(struct msm_gpu *gpu)
{
adreno_dump_info(gpu);
if (hang_debug)
a5xx_dump(gpu);
msm_gpu_snapshot(gpu, gpu->snapshot);
/* Reset the GPU so it can work again */
gpu_write(gpu, REG_A5XX_RBBM_SW_RESET_CMD, 1);
@ -1112,13 +1108,6 @@ static const u32 a5xx_registers[] = {
~0
};
static void a5xx_dump(struct msm_gpu *gpu)
{
dev_info(gpu->dev->dev, "status: %08x\n",
gpu_read(gpu, REG_A5XX_RBBM_STATUS));
adreno_dump(gpu);
}
static int a5xx_pm_resume(struct msm_gpu *gpu)
{
int ret;
@ -1225,6 +1214,7 @@ static const struct adreno_gpu_funcs funcs = {
#ifdef CONFIG_DEBUG_FS
.show = a5xx_show,
#endif
.snapshot = a5xx_snapshot,
},
.get_timestamp = a5xx_get_timestamp,
};

View file

@ -176,6 +176,8 @@ void a5xx_preempt_trigger(struct msm_gpu *gpu);
void a5xx_preempt_irq(struct msm_gpu *gpu);
void a5xx_preempt_fini(struct msm_gpu *gpu);
int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
/* Return true if we are in a preempt state */
static inline bool a5xx_in_preempt(struct a5xx_gpu *a5xx_gpu)
{

View file

@ -46,10 +46,6 @@ static void *alloc_kernel_bo(struct drm_device *drm, struct msm_gpu *gpu,
if (iova)
*iova = _iova;
pr_err("[%ps] buffer size %x, iova [%llx : %llx]\n",
__builtin_return_address(0), size,
_iova, _iova+size-1);
return ptr;
out:
drm_gem_object_unreference_unlocked(_bo);

View file

@ -0,0 +1,796 @@
/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include "msm_gpu.h"
#include "msm_gem.h"
#include "a5xx_gpu.h"
#include "msm_snapshot_api.h"
#define A5XX_NR_SHADER_BANKS 4
/*
* These are a list of the registers that need to be read through the HLSQ
* aperture through the crashdumper. These are not nominally accessible from
* the CPU on a secure platform.
*/
static const struct {
u32 type;		/* statetype programmed into HLSQ_DBG_READ_SEL */
u32 regoffset;	/* dword offset used to label the group in the snapshot */
u32 count;		/* number of dwords to read through the aperture */
} a5xx_hlsq_aperture_regs[] = {
{ 0x35, 0xE00, 0x32 }, /* HLSQ non-context */
{ 0x31, 0x2080, 0x1 }, /* HLSQ 2D context 0 */
{ 0x33, 0x2480, 0x1 }, /* HLSQ 2D context 1 */
{ 0x32, 0xE780, 0x62 }, /* HLSQ 3D context 0 */
{ 0x34, 0xEF80, 0x62 }, /* HLSQ 3D context 1 */
{ 0x3f, 0x0EC0, 0x40 }, /* SP non-context */
{ 0x3d, 0x2040, 0x1 }, /* SP 2D context 0 */
{ 0x3b, 0x2440, 0x1 }, /* SP 2D context 1 */
{ 0x3e, 0xE580, 0x180 }, /* SP 3D context 0 */
{ 0x3c, 0xED80, 0x180 }, /* SP 3D context 1 */
{ 0x3a, 0x0F00, 0x1c }, /* TP non-context */
{ 0x38, 0x2000, 0xa }, /* TP 2D context 0 */
{ 0x36, 0x2400, 0xa }, /* TP 2D context 1 */
{ 0x39, 0xE700, 0x80 }, /* TP 3D context 0 */
{ 0x37, 0xEF00, 0x80 }, /* TP 3D context 1 */
};
/*
* The debugbus registers contain device state that presumably makes
* sense to the hardware designers. 'count' is the number of indexes to read,
* each index value is 64 bits
*/
static const struct {
enum a5xx_debugbus id;
u32 count;
} a5xx_debugbus_blocks[] = {
{ A5XX_RBBM_DBGBUS_CP, 0x100, },
{ A5XX_RBBM_DBGBUS_RBBM, 0x100, },
{ A5XX_RBBM_DBGBUS_HLSQ, 0x100, },
{ A5XX_RBBM_DBGBUS_UCHE, 0x100, },
{ A5XX_RBBM_DBGBUS_DPM, 0x100, },
{ A5XX_RBBM_DBGBUS_TESS, 0x100, },
{ A5XX_RBBM_DBGBUS_PC, 0x100, },
{ A5XX_RBBM_DBGBUS_VFDP, 0x100, },
{ A5XX_RBBM_DBGBUS_VPC, 0x100, },
{ A5XX_RBBM_DBGBUS_TSE, 0x100, },
{ A5XX_RBBM_DBGBUS_RAS, 0x100, },
{ A5XX_RBBM_DBGBUS_VSC, 0x100, },
{ A5XX_RBBM_DBGBUS_COM, 0x100, },
{ A5XX_RBBM_DBGBUS_DCOM, 0x100, },
{ A5XX_RBBM_DBGBUS_LRZ, 0x100, },
{ A5XX_RBBM_DBGBUS_A2D_DSP, 0x100, },
{ A5XX_RBBM_DBGBUS_CCUFCHE, 0x100, },
{ A5XX_RBBM_DBGBUS_GPMU, 0x100, },
{ A5XX_RBBM_DBGBUS_RBP, 0x100, },
{ A5XX_RBBM_DBGBUS_HM, 0x100, },
{ A5XX_RBBM_DBGBUS_RBBM_CFG, 0x100, },
{ A5XX_RBBM_DBGBUS_VBIF_CX, 0x100, },
{ A5XX_RBBM_DBGBUS_GPC, 0x100, },
{ A5XX_RBBM_DBGBUS_LARC, 0x100, },
{ A5XX_RBBM_DBGBUS_HLSQ_SPTP, 0x100, },
{ A5XX_RBBM_DBGBUS_RB_0, 0x100, },
{ A5XX_RBBM_DBGBUS_RB_1, 0x100, },
{ A5XX_RBBM_DBGBUS_RB_2, 0x100, },
{ A5XX_RBBM_DBGBUS_RB_3, 0x100, },
{ A5XX_RBBM_DBGBUS_CCU_0, 0x100, },
{ A5XX_RBBM_DBGBUS_CCU_1, 0x100, },
{ A5XX_RBBM_DBGBUS_CCU_2, 0x100, },
{ A5XX_RBBM_DBGBUS_CCU_3, 0x100, },
{ A5XX_RBBM_DBGBUS_A2D_RAS_0, 0x100, },
{ A5XX_RBBM_DBGBUS_A2D_RAS_1, 0x100, },
{ A5XX_RBBM_DBGBUS_A2D_RAS_2, 0x100, },
{ A5XX_RBBM_DBGBUS_A2D_RAS_3, 0x100, },
{ A5XX_RBBM_DBGBUS_VFD_0, 0x100, },
{ A5XX_RBBM_DBGBUS_VFD_1, 0x100, },
{ A5XX_RBBM_DBGBUS_VFD_2, 0x100, },
{ A5XX_RBBM_DBGBUS_VFD_3, 0x100, },
{ A5XX_RBBM_DBGBUS_SP_0, 0x100, },
{ A5XX_RBBM_DBGBUS_SP_1, 0x100, },
{ A5XX_RBBM_DBGBUS_SP_2, 0x100, },
{ A5XX_RBBM_DBGBUS_SP_3, 0x100, },
{ A5XX_RBBM_DBGBUS_TPL1_0, 0x100, },
{ A5XX_RBBM_DBGBUS_TPL1_1, 0x100, },
{ A5XX_RBBM_DBGBUS_TPL1_2, 0x100, },
{ A5XX_RBBM_DBGBUS_TPL1_3, 0x100, },
};
/*
* The shader blocks are read from the HLSQ aperture - each one has its own
* identifier for the aperture read
*/
static const struct {
enum a5xx_shader_blocks id;
u32 size;
} a5xx_shader_blocks[] = {
{A5XX_TP_W_MEMOBJ, 0x200},
{A5XX_TP_W_MIPMAP_BASE, 0x3C0},
{A5XX_TP_W_SAMPLER_TAG, 0x40},
{A5XX_TP_S_3D_SAMPLER, 0x80},
{A5XX_TP_S_3D_SAMPLER_TAG, 0x20},
{A5XX_TP_S_CS_SAMPLER, 0x40},
{A5XX_TP_S_CS_SAMPLER_TAG, 0x10},
{A5XX_SP_W_CONST, 0x800},
{A5XX_SP_W_CB_SIZE, 0x30},
{A5XX_SP_W_CB_BASE, 0xF0},
{A5XX_SP_W_STATE, 0x1},
{A5XX_SP_S_3D_CONST, 0x800},
{A5XX_SP_S_3D_CB_SIZE, 0x28},
{A5XX_SP_S_3D_UAV_SIZE, 0x80},
{A5XX_SP_S_CS_CONST, 0x400},
{A5XX_SP_S_CS_CB_SIZE, 0x8},
{A5XX_SP_S_CS_UAV_SIZE, 0x80},
{A5XX_SP_S_3D_CONST_DIRTY, 0x12},
{A5XX_SP_S_3D_CB_SIZE_DIRTY, 0x1},
{A5XX_SP_S_3D_UAV_SIZE_DIRTY, 0x2},
{A5XX_SP_S_CS_CONST_DIRTY, 0xA},
{A5XX_SP_S_CS_CB_SIZE_DIRTY, 0x1},
{A5XX_SP_S_CS_UAV_SIZE_DIRTY, 0x2},
{A5XX_HLSQ_ICB_DIRTY, 0xB},
{A5XX_SP_POWER_RESTORE_RAM_TAG, 0xA},
{A5XX_TP_POWER_RESTORE_RAM_TAG, 0xA},
{A5XX_TP_W_SAMPLER, 0x80},
{A5XX_TP_W_MEMOBJ_TAG, 0x40},
{A5XX_TP_S_3D_MEMOBJ, 0x200},
{A5XX_TP_S_3D_MEMOBJ_TAG, 0x20},
{A5XX_TP_S_CS_MEMOBJ, 0x100},
{A5XX_TP_S_CS_MEMOBJ_TAG, 0x10},
{A5XX_SP_W_INSTR, 0x800},
{A5XX_SP_W_UAV_SIZE, 0x80},
{A5XX_SP_W_UAV_BASE, 0x80},
{A5XX_SP_W_INST_TAG, 0x40},
{A5XX_SP_S_3D_INSTR, 0x800},
{A5XX_SP_S_3D_CB_BASE, 0xC8},
{A5XX_SP_S_3D_UAV_BASE, 0x80},
{A5XX_SP_S_CS_INSTR, 0x400},
{A5XX_SP_S_CS_CB_BASE, 0x28},
{A5XX_SP_S_CS_UAV_BASE, 0x80},
{A5XX_SP_S_3D_INSTR_DIRTY, 0x1},
{A5XX_SP_S_3D_CB_BASE_DIRTY, 0x5},
{A5XX_SP_S_3D_UAV_BASE_DIRTY, 0x2},
{A5XX_SP_S_CS_INSTR_DIRTY, 0x1},
{A5XX_SP_S_CS_CB_BASE_DIRTY, 0x1},
{A5XX_SP_S_CS_UAV_BASE_DIRTY, 0x2},
{A5XX_HLSQ_ICB, 0x200},
{A5XX_HLSQ_ICB_CB_BASE_DIRTY, 0x4},
{A5XX_SP_POWER_RESTORE_RAM, 0x140},
{A5XX_TP_POWER_RESTORE_RAM, 0x40},
};
/*
* The A5XX architecture has a built-in engine to asynchronously dump
* registers from the GPU. It is used to accelerate the copy of hundreds
* (thousands) of registers and as a safe way to access registers that might
* have secure data in them (if the GPU is in secure, the crashdumper returns
* bogus values for those registers). On a fully secured device the CPU will be
* blocked from accessing those registers directly and so the crashdump is the
* only way that we can access context registers and the shader banks for debug
* purposes.
*
* The downside of the crashdump is that it requires access to GPU accessible
* memory (so the VBIF and the bus and the SMMU need to be up and working) and
* you need enough memory to write the script for the crashdumper and to store
* the data that you are dumping so there is a balancing act between the work to
* set up a crash dumper and the value we get out of it.
*/
/*
* The crashdump uses a pseudo-script format to read and write registers. Each
* operation is two 64 bit values.
*
* READ:
* [qword 0] [64:00] - The absolute IOVA address target for the register value
* [qword 1] [63:44] - the dword address of the register offset to read
* [15:00] - Number of dwords to read at once
*
* WRITE:
* [qword 0] [31:0] 32 bit value to write to the register
* [qword 1] [63:44] - the dword address of the register offset to write
* [21:21] - set 1 to write
* [15:00] - Number of dwords to write (usually 1)
*
* At the bottom of the script, write quadword zeros to trigger the end.
*/
/*
 * State for one crashdumper session: a single GEM buffer split into a
 * script region (first CRASHDUMP_SCRIPT_SIZE bytes) and a data region
 * that the hardware writes register values into.
 */
struct crashdump {
struct drm_gem_object *bo;	/* backing GEM buffer (script + data) */
void *ptr;	/* CPU mapping of the buffer */
u64 iova;	/* GPU address of the buffer */
u32 index;	/* current byte offset into the script region */
};
#define CRASHDUMP_BO_SIZE (SZ_1M)
#define CRASHDUMP_SCRIPT_SIZE (256 * SZ_1K)
#define CRASHDUMP_DATA_SIZE (CRASHDUMP_BO_SIZE - CRASHDUMP_SCRIPT_SIZE)
/*
 * Allocate, CPU-map and GPU-map the buffer used by the crashdumper.
 * Returns 0 on success or a negative error code; on any failure the
 * buffer reference is dropped and crashdump->bo is left NULL so the
 * caller can simply skip crashdump-based captures.
 */
static int crashdump_init(struct msm_gpu *gpu, struct crashdump *crashdump)
{
struct drm_device *drm = gpu->dev;
int ret = -ENOMEM;
crashdump->bo = msm_gem_new(drm, CRASHDUMP_BO_SIZE, MSM_BO_UNCACHED);
if (IS_ERR(crashdump->bo)) {
ret = PTR_ERR(crashdump->bo);
crashdump->bo = NULL;
return ret;
}
/* CPU mapping used to build the script and read back results */
crashdump->ptr = msm_gem_vaddr_locked(crashdump->bo);
if (!crashdump->ptr)
goto out;
/* GPU iova so the crashdump engine can fetch the script and store data */
ret = msm_gem_get_iova_locked(crashdump->bo, gpu->aspace,
&crashdump->iova);
out:
if (ret) {
/*
 * NOTE(review): the _locked variants above suggest the caller is
 * expected to hold struct_mutex — confirm against the call site.
 */
drm_gem_object_unreference(crashdump->bo);
crashdump->bo = NULL;
}
return ret;
}
/*
 * Hand the finished script to the CP and run it: program the script's
 * GPU address, write 1 to CP_CRASH_DUMP_CNTL to start, then poll for
 * the completion bit (0x04) in the same register.
 * Returns the result of spin_until() — presumably 0 on completion and
 * nonzero on timeout (TODO confirm spin_until semantics).
 */
static int crashdump_run(struct msm_gpu *gpu, struct crashdump *crashdump)
{
/* Refuse to run with no mapping or an empty script */
if (!crashdump->ptr || !crashdump->index)
return -EINVAL;
gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_LO,
lower_32_bits(crashdump->iova));
gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_HI,
upper_32_bits(crashdump->iova));
gpu_write(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL, 1);
return spin_until(gpu_read(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL) & 0x04);
}
/*
 * Release the crashdumper buffer: drop the GPU mapping before the
 * final object reference (order matters), then zero the struct so it
 * is safe to reuse or destroy again.
 */
static void crashdump_destroy(struct msm_gpu *gpu, struct crashdump *crashdump)
{
if (!crashdump->bo)
return;
if (crashdump->iova)
msm_gem_put_iova(crashdump->bo, gpu->aspace);
drm_gem_object_unreference(crashdump->bo);
memset(crashdump, 0, sizeof(*crashdump));
}
/*
 * Append a register WRITE operation (two qwords) to the crashdump
 * script: qword 0 is the 32-bit value, qword 1 encodes the register
 * dword address (bits 63:44), the write flag (bit 21) and a length of
 * one dword.
 */
static inline void CRASHDUMP_SCRIPT_WRITE(struct crashdump *crashdump,
		u32 reg, u32 val)
{
	u64 *ptr = crashdump->ptr + crashdump->index;

	/*
	 * Make sure both qwords fit in the script region. Use '>' so that a
	 * script exactly filling the region is still accepted - the previous
	 * '>=' rejected the last valid slot (off-by-one).
	 */
	if (WARN_ON(crashdump->index + (2 * sizeof(u64))
			> CRASHDUMP_SCRIPT_SIZE))
		return;

	/* This is the value to write */
	ptr[0] = (u64) val;

	/*
	 * This triggers a write to the specified register. 1 is the size of
	 * the write in dwords
	 */
	ptr[1] = (((u64) reg) << 44) | (1 << 21) | 1;

	crashdump->index += 2 * sizeof(u64);
}
/*
 * Append a register READ operation (two qwords) to the crashdump
 * script: qword 0 is the absolute GPU address in the data region where
 * the hardware should store the values, qword 1 encodes the register
 * dword address (bits 63:44) and the number of dwords to read.
 */
static inline void CRASHDUMP_SCRIPT_READ(struct crashdump *crashdump,
		u32 reg, u32 count, u32 offset)
{
	u64 *ptr = crashdump->ptr + crashdump->index;

	/*
	 * Bounds checks use '>' rather than '>=': an entry or result that
	 * exactly fills its region is valid - the previous '>=' rejected the
	 * last usable slot in both regions (off-by-one).
	 */
	if (WARN_ON(crashdump->index + (2 * sizeof(u64))
			> CRASHDUMP_SCRIPT_SIZE))
		return;

	if (WARN_ON(offset + (count * sizeof(u32)) > CRASHDUMP_DATA_SIZE))
		return;

	/* Destination for the register values, inside the data region */
	ptr[0] = (u64) crashdump->iova + CRASHDUMP_SCRIPT_SIZE + offset;
	ptr[1] = (((u64) reg) << 44) | count;

	crashdump->index += 2 * sizeof(u64);
}
/*
 * Return a CPU pointer into the crashdumper data region at 'offset'
 * bytes, or NULL (with a WARN) if the buffer is unmapped or the offset
 * is out of range.
 * NOTE(review): the check only validates the starting offset, not the
 * size of the caller's subsequent access — callers must keep their
 * reads within CRASHDUMP_DATA_SIZE.
 */
static inline void *CRASHDUMP_DATA_PTR(struct crashdump *crashdump, u32 offset)
{
if (WARN_ON(!crashdump->ptr || offset >= CRASHDUMP_DATA_SIZE))
return NULL;
return crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset;
}
/*
 * Read one dword from the crashdumper data region at 'offset' bytes.
 * Returns 0 if the offset is invalid — the original dereferenced the
 * NULL returned by CRASHDUMP_DATA_PTR() after a failed bounds check.
 */
static inline u32 CRASHDUMP_DATA_READ(struct crashdump *crashdump, u32 offset)
{
	u32 *ptr = CRASHDUMP_DATA_PTR(crashdump, offset);

	return ptr ? *ptr : 0;
}
/* Rewind the script so a new sequence of operations can be built */
static inline void CRASHDUMP_RESET(struct crashdump *crashdump)
{
crashdump->index = 0;
}
/*
 * Terminate the script: the crashdumper stops when it reads a pair of
 * zero qwords.
 */
static inline void CRASHDUMP_END(struct crashdump *crashdump)
{
	u64 *ptr = crashdump->ptr + crashdump->index;

	/*
	 * Use '>' so the terminator may exactly fill the script region - the
	 * previous '>=' rejected the last valid slot (off-by-one).
	 */
	if (WARN_ON((crashdump->index + (2 * sizeof(u64)))
			> CRASHDUMP_SCRIPT_SIZE))
		return;

	ptr[0] = 0;
	ptr[1] = 0;

	crashdump->index += 2 * sizeof(u64);
}
/*
 * Script a read through the HLSQ debug aperture: select the statetype
 * and bank via HLSQ_DBG_READ_SEL, then read 'count' dwords from the
 * AHB read aperture into the data region at byte 'offset'.
 * Returns the number of data bytes the read will consume, so callers
 * can advance their running offset.
 */
static u32 _crashdump_read_hlsq_aperture(struct crashdump *crashdump,
u32 offset, u32 statetype, u32 bank,
u32 count)
{
CRASHDUMP_SCRIPT_WRITE(crashdump, REG_A5XX_HLSQ_DBG_READ_SEL,
A5XX_HLSQ_DBG_READ_SEL_STATETYPE(statetype) | bank);
CRASHDUMP_SCRIPT_READ(crashdump, REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE,
count, offset);
return count * sizeof(u32);
}
/*
 * Copy one group of crashdumper results into the snapshot: a group
 * header dword (count in the high 16 bits, register offset in the low
 * 16) followed by the raw values read back at data-region byte
 * 'offset'. Returns the number of data bytes consumed.
 */
static u32 _copy_registers(struct msm_snapshot *snapshot,
struct crashdump *crashdump, u32 reg, u32 count,
u32 offset)
{
int i;
u32 *ptr = (u32 *) (crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset);
/*
 * Write the offset of the first register of the group and the number of
 * registers in the group
 */
SNAPSHOT_WRITE_U32(snapshot, ((count << 16) | reg));
/* Followed by each register value in the group */
for (i = 0; i < count; i++)
SNAPSHOT_WRITE_U32(snapshot, ptr[i]);
return count * sizeof(u32);
}
/*
 * Number of registers in one { start, end } pair from the
 * adreno_gpu->registers table (both endpoints inclusive).
 */
static inline u32 REG_COUNT(const unsigned int *ptr)
{
	return ptr[1] + 1 - ptr[0];
}
/*
 * Capture what registers we can from the CPU in case the crashdumper is
 * unavailable or broken. This will omit the SP, TP and HLSQ registers, but
 * you'll get everything else and that ain't bad
 */
static void a5xx_snapshot_registers_cpu(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct msm_snapshot_regs header;
u32 regcount = 0, groups = 0;
int i;
/*
 * Before we write the section we need to figure out how big our data
 * section will be. The registers table is { start, end } pairs
 * terminated by ~0.
 */
for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) {
regcount += REG_COUNT(&(adreno_gpu->registers[i]));
groups++;
}
/*
 * NOTE(review): only header.count is initialized here; confirm that
 * SNAPSHOT_HEADER fills in the remaining msm_snapshot_regs fields.
 */
header.count = groups;
/*
 * We need one dword for each group and then one dword for each register
 * value in that group
 */
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2,
regcount + groups))
return;
for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) {
u32 count = REG_COUNT(&(adreno_gpu->registers[i]));
u32 reg = adreno_gpu->registers[i];
int j;
/* Write the offset and count for the group */
SNAPSHOT_WRITE_U32(snapshot, (count << 16) | reg);
/* Write each value in the group */
for (j = 0; j < count; j++)
SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, reg++));
}
}
/*
 * Snapshot the GPU registers: first everything reachable from the CPU,
 * then (when a crashdumper buffer is available) the HLSQ/SP/TP
 * aperture registers via the crashdump engine.
 */
static void a5xx_snapshot_registers(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	struct msm_snapshot_regs header;
	struct crashdump *crashdump = snapshot->priv;
	u32 offset = 0, regcount = 0, groups = 0;
	int i;

	/*
	 * First snapshot all the registers that we can from the CPU. Do this
	 * because the crashdumper has a tendency to "taint" the value of some
	 * of the registers (because the GPU implements the crashdumper) so we
	 * only want to use the crash dump facility if we have to
	 */
	a5xx_snapshot_registers_cpu(gpu, snapshot);

	if (!crashdump)
		return;

	CRASHDUMP_RESET(crashdump);

	/* HLSQ and context registers behind the aperture */
	for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++) {
		u32 count = a5xx_hlsq_aperture_regs[i].count;

		offset += _crashdump_read_hlsq_aperture(crashdump, offset,
			a5xx_hlsq_aperture_regs[i].type, 0, count);
		regcount += count;
		groups++;
	}

	CRASHDUMP_END(crashdump);

	if (crashdump_run(gpu, crashdump))
		return;

	header.count = groups;

	/*
	 * The size of the data will be one dword for each "group" of registers,
	 * and then one dword for each of the registers in that group
	 */
	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2,
		groups + regcount))
		return;

	/*
	 * Fix: restart at the beginning of the data region. The script-build
	 * loop above left 'offset' at the end of the captured data, so reusing
	 * it would make _copy_registers() read from past the results instead
	 * of from their start (compare the explicit reset in
	 * a5xx_snapshot_shader_memory()).
	 */
	offset = 0;

	/* Copy the registers to the snapshot */
	for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++)
		offset += _copy_registers(snapshot, crashdump,
			a5xx_hlsq_aperture_regs[i].regoffset,
			a5xx_hlsq_aperture_regs[i].count, offset);
}
/*
 * Emit one snapshot section for a single bank of a shader block,
 * copying 'size' dwords of crashdumper results from data-region byte
 * 'offset'.
 * NOTE(review): if CRASHDUMP_DATA_PTR() fails, the header has already
 * been written but no data follows, leaving the section truncated —
 * confirm the snapshot parser tolerates this.
 */
static void _a5xx_snapshot_shader_bank(struct msm_snapshot *snapshot,
struct crashdump *crashdump, u32 block, u32 bank,
u32 size, u32 offset)
{
void *src;
struct msm_snapshot_shader header = {
.type = block,
.index = bank,
.size = size,
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_SHADER, size))
return;
src = CRASHDUMP_DATA_PTR(crashdump, offset);
if (src)
SNAPSHOT_MEMCPY(snapshot, src, size * sizeof(u32));
}
/*
 * Dump every bank of every shader block through the HLSQ aperture.
 * This data is only reachable through the crashdumper, so silently
 * skip it if the crashdump buffer is unavailable.
 */
static void a5xx_snapshot_shader_memory(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
struct crashdump *crashdump = snapshot->priv;
u32 offset = 0;
int i;
/* We can only get shader memory through the crashdump */
if (!crashdump)
return;
CRASHDUMP_RESET(crashdump);
/* For each shader block */
for (i = 0; i < ARRAY_SIZE(a5xx_shader_blocks); i++) {
int j;
/* For each block, dump 4 banks */
for (j = 0; j < A5XX_NR_SHADER_BANKS; j++)
offset += _crashdump_read_hlsq_aperture(crashdump,
offset, a5xx_shader_blocks[i].id, j,
a5xx_shader_blocks[i].size);
}
CRASHDUMP_END(crashdump);
/* If the crashdump fails we can't get shader memory any other way */
if (crashdump_run(gpu, crashdump))
return;
/*
 * Each bank of each shader gets its own snapshot section. Note that
 * 'offset' is explicitly reset so the copy walk retraces the same
 * data-region layout the script-build walk produced.
 */
for (offset = 0, i = 0; i < ARRAY_SIZE(a5xx_shader_blocks); i++) {
int j;
for (j = 0; j < A5XX_NR_SHADER_BANKS; j++) {
_a5xx_snapshot_shader_bank(snapshot, crashdump,
a5xx_shader_blocks[i].id, j,
a5xx_shader_blocks[i].size, offset);
offset += a5xx_shader_blocks[i].size * sizeof(u32);
}
}
}
#define A5XX_NUM_AXI_ARB_BLOCKS 2
#define A5XX_NUM_XIN_BLOCKS 4
/* Total dwords produced by the three VBIF test-bus walks below */
#define VBIF_DATA_SIZE ((16 * A5XX_NUM_AXI_ARB_BLOCKS) + \
(18 * A5XX_NUM_XIN_BLOCKS) + (12 * A5XX_NUM_XIN_BLOCKS))
/*
 * Dump the VBIF debug state through its dedicated test buses. Unlike
 * the other debugbus blocks this is sampled with direct register
 * reads, not the RBBM debugbus machinery.
 */
static void a5xx_snapshot_debugbus_vbif(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
int i;
struct msm_snapshot_debugbus header = {
.id = A5XX_RBBM_DBGBUS_VBIF,
.count = VBIF_DATA_SIZE,
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS,
VBIF_DATA_SIZE))
return;
/* Force the VBIF clock on so the test bus reads back real data */
gpu_rmw(gpu, REG_A5XX_VBIF_CLKON, A5XX_VBIF_CLKON_FORCE_ON_TESTBUS,
A5XX_VBIF_CLKON_FORCE_ON_TESTBUS);
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 0);
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS_OUT_CTRL,
A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN);
/* AXI arbiter blocks: 16 samples each via test bus 2 */
for (i = 0; i < A5XX_NUM_AXI_ARB_BLOCKS; i++) {
int j;
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << (i + 16));
for (j = 0; j < 16; j++) {
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1,
A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j));
SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu,
REG_A5XX_VBIF_TEST_BUS_OUT));
}
}
/* XIN blocks: 18 samples each via test bus 2 */
for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) {
int j;
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << i);
for (j = 0; j < 18; j++) {
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1,
A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j));
SNAPSHOT_WRITE_U32(snapshot,
gpu_read(gpu, REG_A5XX_VBIF_TEST_BUS_OUT));
}
}
/* XIN blocks again: 12 samples each via test bus 1 */
for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) {
int j;
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 1 << i);
for (j = 0; j < 12; j++) {
gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL1,
A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(j));
SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu,
REG_A5XX_VBIF_TEST_BUS_OUT));
}
}
}
/*
 * Dump one RBBM debugbus block: for each of 'count' indexes, program
 * the block/index into all four SEL registers and read back the two
 * trace-buffer halves.
 */
static void a5xx_snapshot_debugbus_block(struct msm_gpu *gpu,
struct msm_snapshot *snapshot, u32 block, u32 count)
{
int i;
struct msm_snapshot_debugbus header = {
.id = block,
.count = count * 2, /* Each value is 2 dwords */
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS,
(count * 2)))
return;
for (i = 0; i < count; i++) {
u32 reg = A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(i) |
A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);
gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_A, reg);
gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_B, reg);
gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_C, reg);
gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_D, reg);
/*
 * Each debugbus entry is a quad word; BUF2 is read before BUF1 —
 * presumably the required ordering for the trace buffer, verify
 * against the downstream kgsl implementation.
 */
SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu,
REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF2));
SNAPSHOT_WRITE_U32(snapshot,
gpu_read(gpu, REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF1));
}
}
/*
 * Dump all the RBBM debugbus blocks, then the VBIF test buses which
 * use a completely different access mechanism.
 */
static void a5xx_snapshot_debugbus(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
int i;
/* Enable the debugbus controller before sampling */
gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_CNTLM,
A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(0xF));
for (i = 0; i < ARRAY_SIZE(a5xx_debugbus_blocks); i++)
a5xx_snapshot_debugbus_block(gpu, snapshot,
a5xx_debugbus_blocks[i].id,
a5xx_debugbus_blocks[i].count);
/* VBIF is special and not in a good way */
a5xx_snapshot_debugbus_vbif(gpu, snapshot);
}
/*
 * Dump the CP MERCIU debug queue: 64 entries of two dwords each,
 * auto-incremented by the hardware after the address register is
 * zeroed.
 */
static void a5xx_snapshot_cp_merciu(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
unsigned int i;
struct msm_snapshot_debug header = {
.type = SNAPSHOT_DEBUG_CP_MERCIU,
.size = 64 << 1, /* Data size is 2 dwords per entry */
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64 << 1))
return;
gpu_write(gpu, REG_A5XX_CP_MERCIU_DBG_ADDR, 0);
for (i = 0; i < 64; i++) {
SNAPSHOT_WRITE_U32(snapshot,
gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_1));
SNAPSHOT_WRITE_U32(snapshot,
gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_2));
}
}
/* Dump the 512-dword CP ROQ (read-out queue) debug buffer */
static void a5xx_snapshot_cp_roq(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
int i;
struct msm_snapshot_debug header = {
.type = SNAPSHOT_DEBUG_CP_ROQ,
.size = 512,
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 512))
return;
gpu_write(gpu, REG_A5XX_CP_ROQ_DBG_ADDR, 0);
for (i = 0; i < 512; i++)
SNAPSHOT_WRITE_U32(snapshot,
gpu_read(gpu, REG_A5XX_CP_ROQ_DBG_DATA));
}
/* Dump the 64-dword CP MEQ (micro-engine queue) debug buffer */
static void a5xx_snapshot_cp_meq(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
int i;
struct msm_snapshot_debug header = {
.type = SNAPSHOT_DEBUG_CP_MEQ,
.size = 64,
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64))
return;
gpu_write(gpu, REG_A5XX_CP_MEQ_DBG_ADDR, 0);
for (i = 0; i < 64; i++)
SNAPSHOT_WRITE_U32(snapshot,
gpu_read(gpu, REG_A5XX_CP_MEQ_DBG_DATA));
}
/*
 * Dump a block of indexed registers: write index i to the address
 * register 'addr' and read the corresponding value from the data
 * register 'data', for 'count' indexes starting at 0.
 */
static void a5xx_snapshot_indexed_registers(struct msm_gpu *gpu,
struct msm_snapshot *snapshot, u32 addr, u32 data,
u32 count)
{
unsigned int i;
struct msm_snapshot_indexed_regs header = {
.index_reg = addr,
.data_reg = data,
.start = 0,
.count = count,
};
if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_INDEXED_REGS,
count))
return;
for (i = 0; i < count; i++) {
gpu_write(gpu, addr, i);
SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, data));
}
}
/*
 * Top-level A5XX snapshot entry point, called from the GPU funcs table.
 * Captures ringbuffers (via adreno_snapshot), registers, shader banks,
 * debugbus state and CP internal queues/caches. Always returns 0.
 *
 * The crashdumper state lives on the stack and is published to the
 * other capture functions through snapshot->priv; it is cleared again
 * before returning so no dangling stack pointer escapes.
 */
int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
{
struct crashdump crashdump = { 0 };
/* crashdump is optional; captures fall back or skip when priv is NULL */
if (!crashdump_init(gpu, &crashdump))
snapshot->priv = &crashdump;
/* To accurately read all registers, disable hardware clock gating */
a5xx_set_hwcg(gpu, false);
/* Kick it up to the generic level */
adreno_snapshot(gpu, snapshot);
/* Read the GPU registers */
a5xx_snapshot_registers(gpu, snapshot);
/* Read the shader memory banks */
a5xx_snapshot_shader_memory(gpu, snapshot);
/* Read the debugbus registers */
a5xx_snapshot_debugbus(gpu, snapshot);
/* PFP data */
a5xx_snapshot_indexed_registers(gpu, snapshot,
REG_A5XX_CP_PFP_STAT_ADDR, REG_A5XX_CP_PFP_STAT_DATA, 36);
/* ME data */
a5xx_snapshot_indexed_registers(gpu, snapshot,
REG_A5XX_CP_ME_STAT_ADDR, REG_A5XX_CP_ME_STAT_DATA, 29);
/* DRAW_STATE data */
a5xx_snapshot_indexed_registers(gpu, snapshot,
REG_A5XX_CP_DRAW_STATE_ADDR, REG_A5XX_CP_DRAW_STATE_DATA,
256);
/* ME cache */
a5xx_snapshot_indexed_registers(gpu, snapshot,
REG_A5XX_CP_ME_UCODE_DBG_ADDR, REG_A5XX_CP_ME_UCODE_DBG_DATA,
0x53F);
/* PFP cache */
a5xx_snapshot_indexed_registers(gpu, snapshot,
REG_A5XX_CP_PFP_UCODE_DBG_ADDR, REG_A5XX_CP_PFP_UCODE_DBG_DATA,
0x53F);
/* ME queue */
a5xx_snapshot_cp_meq(gpu, snapshot);
/* CP ROQ */
a5xx_snapshot_cp_roq(gpu, snapshot);
/* CP MERCIU */
a5xx_snapshot_cp_merciu(gpu, snapshot);
/* Drop the stack-based crashdump before it goes out of scope */
crashdump_destroy(gpu, &crashdump);
snapshot->priv = NULL;
/* Re-enable HWCG */
a5xx_set_hwcg(gpu, true);
return 0;
}

View file

@ -17,7 +17,9 @@
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/utsname.h>
#include "adreno_gpu.h"
#include "msm_snapshot.h"
#include "msm_gem.h"
#include "msm_mmu.h"
@ -629,3 +631,81 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu)
msm_gem_address_space_put(aspace);
}
}
/*
 * Write the OS identification section of the snapshot: kernel release
 * and version strings plus a timestamp. The section has no payload
 * beyond the header (size 0).
 */
static void adreno_snapshot_os(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
struct msm_snapshot_linux header;
memset(&header, 0, sizeof(header));
header.osid = SNAPSHOT_OS_LINUX_V3;
strlcpy(header.release, utsname()->release, sizeof(header.release));
strlcpy(header.version, utsname()->version, sizeof(header.version));
header.seconds = get_seconds();
header.ctxtcount = 0;
SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_OS, 0);
}
/*
 * Dump one ringbuffer as a SNAPSHOT_SECTION_RB_V2 section: a header
 * describing ring state (read/write pointers, fences) followed by the
 * ring contents from dword 0 through the last non-zero dword.
 */
static void adreno_snapshot_ringbuffer(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot, struct msm_ringbuffer *ring)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	/*
	 * NOTE: SNAPSHOT_HEADER() currently expands against a local literally
	 * named 'header', so do not rename this variable.
	 */
	struct msm_snapshot_ringbuffer header;
	unsigned int i, end = 0;
	unsigned int *data = ring->start;
	memset(&header, 0, sizeof(header));
	/*
	 * We only want to copy the active contents of each ring, so find the
	 * last valid entry in the ringbuffer
	 */
	for (i = 0; i < MSM_GPU_RINGBUFFER_SZ >> 2; i++) {
		if (data[i])
			end = i;
	}
	/* The dump always starts at 0 */
	header.start = 0;
	header.end = end;
	/*
	 * This is the number of dwords being dumped; note an all-zero ring
	 * still dumps a single dword (end stays 0, count becomes 1)
	 */
	header.count = end + 1;
	/* This is the size of the actual ringbuffer, in dwords */
	header.rbsize = MSM_GPU_RINGBUFFER_SZ >> 2;
	header.id = ring->id;
	header.gpuaddr = ring->iova;
	header.rptr = get_rptr(adreno_gpu, ring);
	header.wptr = get_wptr(ring);
	header.timestamp_queued = adreno_submitted_fence(gpu, ring);
	header.timestamp_retired = adreno_last_fence(gpu, ring);
	/*
	 * Write the header even if the ringbuffer data is empty; bail on the
	 * whole section if the snapshot buffer is out of space
	 */
	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_RB_V2,
			header.count))
		return;
	SNAPSHOT_MEMCPY(snapshot, ring->start, header.count * sizeof(u32));
}
static void adreno_snapshot_ringbuffers(struct msm_gpu *gpu,
struct msm_snapshot *snapshot)
{
struct msm_ringbuffer *ring;
int i;
/* Write a new section for each ringbuffer */
FOR_EACH_RING(gpu, ring, i)
adreno_snapshot_ringbuffer(gpu, snapshot, ring);
}
/*
 * Collect the generic (non target-specific) parts of a snapshot: the OS
 * identification section followed by the contents of every ringbuffer.
 * Target code (e.g. a5xx) calls this before appending its own register
 * and memory sections.
 */
void adreno_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
{
	adreno_snapshot_os(gpu, snapshot);
	adreno_snapshot_ringbuffers(gpu, snapshot);
}

View file

@ -233,6 +233,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
int nr_rings);
void adreno_gpu_cleanup(struct adreno_gpu *gpu);
void adreno_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
/* ringbuffer helpers (the parts that are adreno specific) */

View file

@ -837,6 +837,13 @@ static int msm_gpu_show(struct drm_device *dev, struct seq_file *m)
return 0;
}
/* debugfs handler: dump the most recent GPU snapshot into the seq_file */
static int msm_snapshot_show(struct drm_device *dev, struct seq_file *m)
{
	struct msm_drm_private *priv = dev->dev_private;
	struct msm_gpu *gpu = priv->gpu;

	return msm_snapshot_write(gpu, m);
}
static int msm_gem_show(struct drm_device *dev, struct seq_file *m)
{
struct msm_drm_private *priv = dev->dev_private;
@ -901,11 +908,22 @@ static int show_locked(struct seq_file *m, void *arg)
return ret;
}
/*
 * Like show_locked() but invokes the handler without taking any lock -
 * used for entries (e.g. snapshot) whose data is safe to read unlocked.
 */
static int show_unlocked(struct seq_file *m, void *arg)
{
	struct drm_info_node *node = m->private;
	int (*fn)(struct drm_device *dev, struct seq_file *m);

	fn = node->info_ent->data;

	return fn(node->minor->dev, m);
}
/* debugfs entries; "snapshot" runs unlocked, the rest take struct_mutex */
static struct drm_info_list msm_debugfs_list[] = {
	{ "gpu", show_locked, 0, msm_gpu_show },
	{ "gem", show_locked, 0, msm_gem_show },
	{ "mm", show_locked, 0, msm_mm_show },
	{ "fb", show_locked, 0, msm_fb_show },
	{ "snapshot", show_unlocked, 0, msm_snapshot_show },
};
static int late_init_minor(struct drm_minor *minor)

View file

@ -764,6 +764,10 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
bs_init(gpu);
gpu->snapshot = msm_snapshot_new(gpu);
if (IS_ERR(gpu->snapshot))
gpu->snapshot = NULL;
return 0;
fail:
@ -794,4 +798,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
msm_ringbuffer_destroy(gpu->rb[i]);
}
msm_snapshot_destroy(gpu, gpu->snapshot);
}

View file

@ -24,6 +24,7 @@
#include "msm_drv.h"
#include "msm_ringbuffer.h"
#include "msm_snapshot.h"
struct msm_gem_submit;
struct msm_gpu_perfcntr;
@ -69,6 +70,7 @@ struct msm_gpu_funcs {
/* show GPU status in debugfs: */
void (*show)(struct msm_gpu *gpu, struct seq_file *m);
#endif
int (*snapshot)(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
};
struct msm_gpu {
@ -137,6 +139,8 @@ struct msm_gpu {
struct work_struct recover_work;
struct list_head submit_list;
struct msm_snapshot *snapshot;
};
/* It turns out that all targets use the same ringbuffer size. */

View file

@ -0,0 +1,105 @@
/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "msm_gpu.h"
#include "msm_gem.h"
#include "msm_snapshot_api.h"
/*
 * Free a snapshot object and its DMA-coherent backing buffer.  Safe to
 * call with a NULL snapshot (nothing was allocated).
 */
void msm_snapshot_destroy(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
{
	struct msm_drm_private *priv;
	struct platform_device *pdev;

	if (!snapshot)
		return;

	priv = gpu->dev->dev_private;
	pdev = priv->gpu_pdev;

	dma_free_coherent(&pdev->dev, SZ_1M, snapshot->ptr,
		snapshot->physaddr);

	kfree(snapshot);
}
/*
 * Allocate a snapshot object backed by a 1MB DMA-coherent buffer (kept
 * coherent so the data can be recovered from a memory dump).  Returns an
 * ERR_PTR on allocation failure.
 */
struct msm_snapshot *msm_snapshot_new(struct msm_gpu *gpu)
{
	struct msm_drm_private *priv = gpu->dev->dev_private;
	struct platform_device *pdev = priv->gpu_pdev;
	struct msm_snapshot *snapshot;

	snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
	if (!snapshot)
		return ERR_PTR(-ENOMEM);

	snapshot->ptr = dma_alloc_coherent(&pdev->dev, SZ_1M,
		&snapshot->physaddr, GFP_KERNEL);
	if (!snapshot->ptr)
		goto fail;

	seq_buf_init(&snapshot->buf, snapshot->ptr, SZ_1M);

	return snapshot;

fail:
	kfree(snapshot);
	return ERR_PTR(-ENOMEM);
}
int msm_gpu_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
{
int ret;
struct msm_snapshot_header header;
uint64_t val;
if (!snapshot)
return -ENOMEM;
/*
* For now, blow away the snapshot and take a new one - the most
* interesting hang is the last one we saw
*/
seq_buf_init(&snapshot->buf, snapshot->ptr, SZ_1M);
header.magic = SNAPSHOT_MAGIC;
gpu->funcs->get_param(gpu, MSM_PARAM_GPU_ID, &val);
header.gpuid = lower_32_bits(val);
gpu->funcs->get_param(gpu, MSM_PARAM_CHIP_ID, &val);
header.chipid = lower_32_bits(val);
seq_buf_putmem(&snapshot->buf, &header, sizeof(header));
ret = gpu->funcs->snapshot(gpu, snapshot);
if (!ret) {
struct msm_snapshot_section_header end;
end.magic = SNAPSHOT_SECTION_MAGIC;
end.id = SNAPSHOT_SECTION_END;
end.size = sizeof(end);
seq_buf_putmem(&snapshot->buf, &end, sizeof(end));
dev_info(gpu->dev->dev, "GPU snapshot created [0x%pa (%d bytes)]\n",
&snapshot->physaddr, seq_buf_used(&snapshot->buf));
}
return ret;
}
int msm_snapshot_write(struct msm_gpu *gpu, struct seq_file *m)
{
if (gpu && gpu->snapshot)
seq_write(m, gpu->snapshot->ptr,
seq_buf_used(&gpu->snapshot->buf));
return 0;
}

View file

@ -0,0 +1,85 @@
/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef MSM_SNAPSHOT_H_
#define MSM_SNAPSHOT_H_
#include <linux/string.h>
#include <linux/seq_buf.h>
#include "msm_snapshot_api.h"
/* In-memory state for a single GPU snapshot capture */
struct msm_snapshot {
	void *ptr;		/* CPU address of the DMA-coherent buffer */
	struct seq_buf buf;	/* seq_buf tracking writes into ptr */
	phys_addr_t physaddr;	/* bus/physical address of the buffer */
	uint32_t index;		/* scratch cursor for target code - TODO confirm usage */
	uint32_t remain;	/* scratch remaining-byte count - TODO confirm usage */
	unsigned long timestamp;	/* capture time; presumably set by targets - verify */
	void *priv;		/* target-private state (e.g. a5xx crashdump), cleared after use */
};
/*
 * Write a single uint32_t value to the next position in the snapshot
 * buffer (seq_buf silently drops the write if the buffer is full)
 */
static inline void SNAPSHOT_WRITE_U32(struct msm_snapshot *snapshot,
		uint32_t value)
{
	seq_buf_putmem(&snapshot->buf, &value, sizeof(value));
}
/*
 * Copy a block of memory to the next position in the snapshot buffer.
 * The source is never modified, so take it as const - callers may pass
 * read-only data.  A zero size is a no-op.
 */
static inline void SNAPSHOT_MEMCPY(struct msm_snapshot *snapshot,
		const void *src, uint32_t size)
{
	if (size)
		seq_buf_putmem(&snapshot->buf, src, size);
}
/*
 * Reserve room for a section and write its (sub-)header into the snapshot
 * buffer.  headsz is the size of the full section sub-header (which embeds
 * the generic section header as its first member); datasz is the payload
 * the caller will append afterwards.  Returns false if there isn't enough
 * space, in which case the caller must skip the section entirely.
 */
static inline bool _snapshot_header(struct msm_snapshot *snapshot,
	struct msm_snapshot_section_header *header,
	u32 headsz, u32 datasz, u32 id)
{
	u32 size = headsz + datasz;

	/* Bail before writing anything if header + data won't fit */
	if (seq_buf_buffer_left(&snapshot->buf) <= size)
		return false;

	/* Fill in the generic section header fields */
	header->magic = SNAPSHOT_SECTION_MAGIC;
	header->id = id;
	header->size = size;

	/* Write the sub-header; the caller appends the data from here */
	seq_buf_putmem(&snapshot->buf, header, headsz);

	return true;
}
/* SNAPSHOT_HEADER
 * _snapshot: pointer to struct msm_snapshot
 * _header: Local variable containing the sub-section header
 * _id: Section ID to write
 * _dwords: Size of the data section (in dwords)
 *
 * Note: previously this macro expanded the literal name 'header' instead
 * of its _header argument and only worked because every caller named its
 * local 'header'; it now uses the argument properly.
 */
#define SNAPSHOT_HEADER(_snapshot, _header, _id, _dwords) \
	_snapshot_header((_snapshot), \
		(struct msm_snapshot_section_header *) &(_header), \
		sizeof(_header), (_dwords) << 2, (_id))
struct msm_gpu;
struct msm_snapshot *msm_snapshot_new(struct msm_gpu *gpu);
void msm_snapshot_destroy(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
int msm_gpu_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
int msm_snapshot_write(struct msm_gpu *gpu, struct seq_file *m);
#endif

View file

@ -0,0 +1,121 @@
/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef MSM_SNAPSHOT_API_H_
#define MSM_SNAPSHOT_API_H_
#include <linux/types.h>
/* High word is the magic, low word is the snapshot header version */
#define SNAPSHOT_MAGIC 0x504D0002

/* Global header written once at the very start of every snapshot file */
struct msm_snapshot_header {
	__u32 magic;	/* SNAPSHOT_MAGIC */
	__u32 gpuid;	/* low 32 bits of MSM_PARAM_GPU_ID */
	__u32 chipid;	/* low 32 bits of MSM_PARAM_CHIP_ID */
} __packed;
#define SNAPSHOT_SECTION_MAGIC 0xABCD

/*
 * Every section in the snapshot file starts with this header; size covers
 * the whole section (header included) so parsers can skip sections they
 * don't understand.
 */
struct msm_snapshot_section_header {
	__u16 magic;	/* SNAPSHOT_SECTION_MAGIC */
	__u16 id;	/* one of the SNAPSHOT_SECTION_* values below */
	__u32 size;	/* total section size in bytes, including this header */
} __packed;

/*
 * Section identifiers - the _Vn suffixes suggest the low byte encodes the
 * section format version (TODO confirm against the post-processing tools)
 */
#define SNAPSHOT_SECTION_OS 0x0101
#define SNAPSHOT_SECTION_REGS_V2 0x0202
#define SNAPSHOT_SECTION_RB_V2 0x0302
#define SNAPSHOT_SECTION_IB_V2 0x0402
#define SNAPSHOT_SECTION_INDEXED_REGS 0x0501
#define SNAPSHOT_SECTION_DEBUG 0x0901
#define SNAPSHOT_SECTION_DEBUGBUS 0x0A01
#define SNAPSHOT_SECTION_GPU_OBJECT_V2 0x0B02
#define SNAPSHOT_SECTION_MEMLIST_V2 0x0E02
#define SNAPSHOT_SECTION_SHADER 0x1201
#define SNAPSHOT_SECTION_END 0xFFFF	/* terminates the snapshot file */
#define SNAPSHOT_OS_LINUX_V3 0x00000202

/*
 * SNAPSHOT_SECTION_OS payload.  Currently only osid, seconds, ctxtcount,
 * release and version are filled in (see adreno_snapshot_os()); the rest
 * are zeroed placeholders kept for format compatibility.
 */
struct msm_snapshot_linux {
	struct msm_snapshot_section_header header;
	int osid;		/* SNAPSHOT_OS_LINUX_V3 */
	__u32 seconds;		/* wall-clock capture time */
	__u32 power_flags;	/* unused (zero) */
	__u32 power_level;	/* unused (zero) */
	__u32 power_interval_timeout;	/* unused (zero) */
	__u32 grpclk;		/* unused (zero) */
	__u32 busclk;		/* unused (zero) */
	__u64 ptbase;		/* unused (zero) */
	__u32 pid;		/* unused (zero) */
	__u32 current_context;	/* unused (zero) */
	__u32 ctxtcount;	/* number of context entries following (0) */
	unsigned char release[32];	/* utsname()->release */
	unsigned char version[32];	/* utsname()->version */
	unsigned char comm[16];	/* unused (zero) */
} __packed;
/*
 * SNAPSHOT_SECTION_RB_V2 payload header, followed by 'count' dwords of
 * ringbuffer contents (see adreno_snapshot_ringbuffer()).  Several fields
 * are signed ints purely for legacy format compatibility.
 */
struct msm_snapshot_ringbuffer {
	struct msm_snapshot_section_header header;
	int start;		/* first dword dumped (always 0) */
	int end;		/* index of the last non-zero dword */
	int rbsize;		/* total ringbuffer size in dwords */
	int wptr;		/* write pointer at capture time */
	int rptr;		/* read pointer at capture time */
	int count;		/* number of dwords that follow (end + 1) */
	__u32 timestamp_queued;		/* last submitted fence */
	__u32 timestamp_retired;	/* last retired fence */
	__u64 gpuaddr;		/* GPU iova of the ringbuffer */
	__u32 id;		/* ringbuffer index */
} __packed;
/*
 * SNAPSHOT_SECTION_REGS_V2 payload header; presumably followed by 'count'
 * offset/value register pairs - TODO confirm against the target dumpers
 */
struct msm_snapshot_regs {
	struct msm_snapshot_section_header header;
	__u32 count;
} __packed;

/*
 * SNAPSHOT_SECTION_INDEXED_REGS payload header, followed by 'count' dwords
 * read through the index_reg/data_reg pair starting at 'start'
 */
struct msm_snapshot_indexed_regs {
	struct msm_snapshot_section_header header;
	__u32 index_reg;	/* register written with the index */
	__u32 data_reg;		/* register read for each value */
	__u32 start;		/* first index */
	__u32 count;		/* number of dwords dumped */
} __packed;
/* 'type' values for struct msm_snapshot_debug below */
#define SNAPSHOT_DEBUG_CP_MEQ 7
#define SNAPSHOT_DEBUG_CP_PM4_RAM 8
#define SNAPSHOT_DEBUG_CP_PFP_RAM 9
#define SNAPSHOT_DEBUG_CP_ROQ 10
#define SNAPSHOT_DEBUG_SHADER_MEMORY 11
#define SNAPSHOT_DEBUG_CP_MERCIU 12

/* SNAPSHOT_SECTION_DEBUG payload header; 'size' dwords of data follow */
struct msm_snapshot_debug {
	struct msm_snapshot_section_header header;
	__u32 type;	/* one of the SNAPSHOT_DEBUG_* values above */
	__u32 size;	/* payload size - presumably in dwords, TODO confirm */
} __packed;

/* SNAPSHOT_SECTION_DEBUGBUS payload header; 'count' dwords follow */
struct msm_snapshot_debugbus {
	struct msm_snapshot_section_header header;
	__u32 id;	/* debugbus block id (e.g. enum a5xx_debugbus) */
	__u32 count;	/* number of dwords of bus data */
} __packed;

/* SNAPSHOT_SECTION_SHADER payload header; 'size' dwords follow */
struct msm_snapshot_shader {
	struct msm_snapshot_section_header header;
	__u32 type;	/* shader memory bank type */
	__u32 index;	/* bank index */
	__u32 size;	/* payload size - presumably in dwords, TODO confirm */
} __packed;
#endif