diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ec96321eb9e4..b31e0ae4b8db 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1,8 +1,10 @@
 #include <sys/types.h>
+#include <byteswap.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <linux/list.h>
+#include <linux/kernel.h>
 
 #include "util.h"
 #include "header.h"
@@ -464,8 +466,21 @@ static int do_read(int fd, void *buf, size_t size)
 	return 0;
 }
 
+static int perf_header__getbuffer64(struct perf_header *self,
+				    int fd, void *buf, size_t size)
+{
+	if (do_read(fd, buf, size))
+		return -1;
+
+	if (self->needs_swap)
+		mem_bswap_64(buf, size);
+
+	return 0;
+}
+
 int perf_header__process_sections(struct perf_header *self, int fd,
 				  int (*process)(struct perf_file_section *self,
+						 struct perf_header *ph,
 						 int feat, int fd))
 {
 	struct perf_file_section *feat_sec;
@@ -486,7 +501,7 @@ int perf_header__process_sections(struct perf_header *self, int fd,
 
 	lseek(fd, self->data_offset + self->data_size, SEEK_SET);
 
-	if (do_read(fd, feat_sec, sec_size))
+	if (perf_header__getbuffer64(self, fd, feat_sec, sec_size))
 		goto out_free;
 
 	err = 0;
@@ -494,7 +509,7 @@ int perf_header__process_sections(struct perf_header *self, int fd,
 		if (perf_header__has_feat(self, feat)) {
 			struct perf_file_section *sec = &feat_sec[idx++];
 
-			err = process(sec, feat, fd);
+			err = process(sec, self, feat, fd);
 			if (err < 0)
 				break;
 		}
@@ -511,10 +526,20 @@ int perf_file_header__read(struct perf_file_header *self,
 	lseek(fd, 0, SEEK_SET);
 
 	if (do_read(fd, self, sizeof(*self)) ||
-	    self->magic     != PERF_MAGIC ||
-	    self->attr_size != sizeof(struct perf_file_attr))
+	    memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
 		return -1;
 
+	if (self->attr_size != sizeof(struct perf_file_attr)) {
+		u64 attr_size = bswap_64(self->attr_size);
+
+		if (attr_size != sizeof(struct perf_file_attr))
+			return -1;
+
+		mem_bswap_64(self, offsetof(struct perf_file_header,
+					    adds_features));
+		ph->needs_swap = true;
+	}
+
 	if (self->size != sizeof(*self)) {
 		/* Support the previous format */
 		if (self->size == offsetof(typeof(*self), adds_features))
@@ -524,16 +549,28 @@ int perf_file_header__read(struct perf_file_header *self,
 	}
 
 	memcpy(&ph->adds_features, &self->adds_features,
-	       sizeof(self->adds_features));
+	       sizeof(ph->adds_features));
+	/*
+	 * FIXME: hack that assumes that if we need swap the perf.data file
+	 * may be coming from an arch with a different word-size, ergo different
+	 * DEFINE_BITMAP format, investigate more later, but for now its mostly
+	 * safe to assume that we have a build-id section. Trace files probably
+	 * have several other issues in this realm anyway...
+	 */
+	if (ph->needs_swap) {
+		memset(&ph->adds_features, 0, sizeof(ph->adds_features));
+		perf_header__set_feat(ph, HEADER_BUILD_ID);
+	}
 
 	ph->event_offset = self->event_types.offset;
-	ph->event_size	 = self->event_types.size;
-	ph->data_offset	 = self->data.offset;
+	ph->event_size   = self->event_types.size;
+	ph->data_offset  = self->data.offset;
 	ph->data_size	 = self->data.size;
 	return 0;
 }
 
 static int perf_file_section__process(struct perf_file_section *self,
+				      struct perf_header *ph,
 				      int feat, int fd)
 {
 	if (lseek(fd, self->offset, SEEK_SET) < 0) {
@@ -548,7 +585,7 @@ static int perf_file_section__process(struct perf_file_section *self,
 		break;
 
 	case HEADER_BUILD_ID:
-		if (perf_header__read_build_ids(fd, self->offset, self->size))
+		if (perf_header__read_build_ids(ph, fd, self->offset, self->size))
 			pr_debug("Failed to read buildids, continuing...\n");
 		break;
 	default:
@@ -560,7 +597,7 @@ static int perf_file_section__process(struct perf_file_section *self,
 
 int perf_header__read(struct perf_header *self, int fd)
 {
-	struct perf_file_header f_header;
+	struct perf_file_header	f_header;
 	struct perf_file_attr	f_attr;
 	u64			f_id;
 	int nr_attrs, nr_ids, i, j;
@@ -577,8 +614,9 @@ int perf_header__read(struct perf_header *self, int fd)
 		struct perf_header_attr *attr;
 		off_t tmp;
 
-		if (do_read(fd, &f_attr, sizeof(f_attr)))
+		if (perf_header__getbuffer64(self, fd, &f_attr, sizeof(f_attr)))
 			goto out_errno;
+
 		tmp = lseek(fd, 0, SEEK_CUR);
 
 		attr = perf_header_attr__new(&f_attr.attr);
@@ -589,7 +627,7 @@ int perf_header__read(struct perf_header *self, int fd)
 		lseek(fd, f_attr.ids.offset, SEEK_SET);
 
 		for (j = 0; j < nr_ids; j++) {
-			if (do_read(fd, &f_id, sizeof(f_id)))
+			if (perf_header__getbuffer64(self, fd, &f_id, sizeof(f_id)))
 				goto out_errno;
 
 			if (perf_header_attr__add_id(attr, f_id) < 0) {
@@ -610,7 +648,8 @@ int perf_header__read(struct perf_header *self, int fd)
 		events = malloc(f_header.event_types.size);
 		if (events == NULL)
 			return -ENOMEM;
-		if (do_read(fd, events, f_header.event_types.size))
+		if (perf_header__getbuffer64(self, fd, events,
+					     f_header.event_types.size))
 			goto out_errno;
 		event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
 	}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 2b69aab67e35..ccc8540feccd 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -52,6 +52,7 @@ struct perf_header {
 	u64			data_size;
 	u64			event_offset;
 	u64			event_size;
+	bool			needs_swap;
 	DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
 };
 
@@ -80,6 +81,7 @@ bool perf_header__has_feat(const struct perf_header *self, int feat);
 
 int perf_header__process_sections(struct perf_header *self, int fd,
 				  int (*process)(struct perf_file_section *self,
+						 struct perf_header *ph,
 						 int feat, int fd));
 
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index e3ccdb46d6c4..604e14f6a6f9 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1,5 +1,6 @@
 #include <linux/kernel.h>
 
+#include <byteswap.h>
 #include <unistd.h>
 #include <sys/types.h>
 
@@ -201,21 +202,88 @@ void event__print_totals(void)
 			event__name[i], event__total[i]);
 }
 
+void mem_bswap_64(void *src, int byte_size)
+{
+	u64 *m = src;
+
+	while (byte_size > 0) {
+		*m = bswap_64(*m);
+		byte_size -= sizeof(u64);
+		++m;
+	}
+}
+
+static void event__all64_swap(event_t *self)
+{
+	struct perf_event_header *hdr = &self->header;
+	mem_bswap_64(hdr + 1, self->header.size - sizeof(*hdr));
+}
+
+static void event__comm_swap(event_t *self)
+{
+	self->comm.pid = bswap_32(self->comm.pid);
+	self->comm.tid = bswap_32(self->comm.tid);
+}
+
+static void event__mmap_swap(event_t *self)
+{
+	self->mmap.pid	 = bswap_32(self->mmap.pid);
+	self->mmap.tid	 = bswap_32(self->mmap.tid);
+	self->mmap.start = bswap_64(self->mmap.start);
+	self->mmap.len	 = bswap_64(self->mmap.len);
+	self->mmap.pgoff = bswap_64(self->mmap.pgoff);
+}
+
+static void event__task_swap(event_t *self)
+{
+	self->fork.pid	= bswap_32(self->fork.pid);
+	self->fork.tid	= bswap_32(self->fork.tid);
+	self->fork.ppid	= bswap_32(self->fork.ppid);
+	self->fork.ptid	= bswap_32(self->fork.ptid);
+	self->fork.time	= bswap_64(self->fork.time);
+}
+
+static void event__read_swap(event_t *self)
+{
+	self->read.pid		= bswap_32(self->read.pid);
+	self->read.tid		= bswap_32(self->read.tid);
+	self->read.value	= bswap_64(self->read.value);
+	self->read.time_enabled	= bswap_64(self->read.time_enabled);
+	self->read.time_running	= bswap_64(self->read.time_running);
+	self->read.id		= bswap_64(self->read.id);
+}
+
+typedef void (*event__swap_op)(event_t *self);
+
+static event__swap_op event__swap_ops[] = {
+	[PERF_RECORD_MMAP]   = event__mmap_swap,
+	[PERF_RECORD_COMM]   = event__comm_swap,
+	[PERF_RECORD_FORK]   = event__task_swap,
+	[PERF_RECORD_EXIT]   = event__task_swap,
+	[PERF_RECORD_LOST]   = event__all64_swap,
+	[PERF_RECORD_READ]   = event__read_swap,
+	[PERF_RECORD_SAMPLE] = event__all64_swap,
+	[PERF_RECORD_MAX]    = NULL,
+};
+
 static int perf_session__process_event(struct perf_session *self,
 				       event_t *event,
 				       struct perf_event_ops *ops,
-				       unsigned long offset, unsigned long head)
+				       u64 offset, u64 head)
 {
 	trace_event(event);
 
 	if (event->header.type < PERF_RECORD_MAX) {
-		dump_printf("%#lx [%#x]: PERF_RECORD_%s",
+		dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
 			    offset + head, event->header.size,
 			    event__name[event->header.type]);
 		++event__total[0];
 		++event__total[event->header.type];
 	}
 
+	if (self->header.needs_swap && event__swap_ops[event->header.type])
+		event__swap_ops[event->header.type](event);
+
 	switch (event->header.type) {
 	case PERF_RECORD_SAMPLE:
 		return ops->sample(event, self);
@@ -241,7 +309,15 @@ static int perf_session__process_event(struct perf_session *self,
 	}
 }
 
-int perf_header__read_build_ids(int input, u64 offset, u64 size)
+void perf_event_header__bswap(struct perf_event_header *self)
+{
+	self->type = bswap_32(self->type);
+	self->misc = bswap_16(self->misc);
+	self->size = bswap_16(self->size);
+}
+
+int perf_header__read_build_ids(struct perf_header *self,
+				int input, u64 offset, u64 size)
 {
 	struct build_id_event bev;
 	char filename[PATH_MAX];
@@ -256,6 +332,9 @@ int perf_header__read_build_ids(int input, u64 offset, u64 size)
 		if (read(input, &bev, sizeof(bev)) != sizeof(bev))
 			goto out;
 
+		if (self->needs_swap)
+			perf_event_header__bswap(&bev.header);
+
 		len = bev.header.size - sizeof(bev);
 		if (read(input, filename, len) != len)
 			goto out;
@@ -292,9 +371,9 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
 int perf_session__process_events(struct perf_session *self,
 				 struct perf_event_ops *ops)
 {
-	int err;
-	unsigned long head, shift;
-	unsigned long offset = 0;
+	int err, mmap_prot, mmap_flags;
+	u64 head, shift;
+	u64 offset = 0;
 	size_t	page_size;
 	event_t *event;
 	uint32_t size;
@@ -330,9 +409,16 @@ out_getcwd_err:
 	offset += shift;
 	head -= shift;
 
+	mmap_prot  = PROT_READ;
+	mmap_flags = MAP_SHARED;
+
+	if (self->header.needs_swap) {
+		mmap_prot  |= PROT_WRITE;
+		mmap_flags = MAP_PRIVATE;
+	}
 remap:
-	buf = mmap(NULL, page_size * self->mmap_window, PROT_READ,
-		   MAP_SHARED, self->fd, offset);
+	buf = mmap(NULL, page_size * self->mmap_window, mmap_prot,
+		   mmap_flags, self->fd, offset);
 	if (buf == MAP_FAILED) {
 		pr_err("failed to mmap file\n");
 		err = -errno;
@@ -342,6 +428,8 @@ remap:
 more:
 	event = (event_t *)(buf + head);
 
+	if (self->header.needs_swap)
+		perf_event_header__bswap(&event->header);
 	size = event->header.size;
 	if (size == 0)
 		size = 8;
@@ -361,12 +449,12 @@ more:
 
 	size = event->header.size;
 
-	dump_printf("\n%#lx [%#x]: event: %d\n",
+	dump_printf("\n%#Lx [%#x]: event: %d\n",
 		    offset + head, event->header.size, event->header.type);
 
 	if (size == 0 ||
 	    perf_session__process_event(self, event, ops, offset, head) < 0) {
-		dump_printf("%#lx [%#x]: skipping unknown header type: %d\n",
+		dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
 			    offset + head, event->header.size,
 			    event->header.type);
 		/*
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index d4a9d20f8d44..36d1a80c0b6c 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -51,6 +51,8 @@ struct perf_event_ops {
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);
 void perf_session__delete(struct perf_session *self);
 
+void perf_event_header__bswap(struct perf_event_header *self);
+
 int perf_session__process_events(struct perf_session *self,
 				 struct perf_event_ops *event_ops);
 
@@ -61,7 +63,8 @@ struct symbol **perf_session__resolve_callchain(struct perf_session *self,
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-int perf_header__read_build_ids(int input, u64 offset, u64 file_size);
+int perf_header__read_build_ids(struct perf_header *self, int input,
+				u64 offset, u64 file_size);
 
 int perf_session__set_kallsyms_ref_reloc_sym(struct perf_session *self,
 					     const char *symbol_name,
@@ -69,4 +72,6 @@ int perf_session__set_kallsyms_ref_reloc_sym(struct perf_session *self,
 void perf_session__reloc_vmlinux_maps(struct perf_session *self,
 				      u64 unrelocated_addr);
 
+void mem_bswap_64(void *src, int byte_size);
+
 #endif /* __PERF_SESSION_H */