powerpc: Fix endian issues in VMX copy loops
Fix the permute loops for little endian.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
parent 8b5ede69d2
commit 32ee1e188e
2 changed files with 63 additions and 46 deletions
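The change below is mechanical: every lvsl that builds the permute control vector now goes through a new LVS macro, and every vperm goes through a new VPERM macro, so the little-endian build can use lvsr instead and swap the vperm source operands. As a rough illustration of what the lvsl/lvsr + vperm idiom computes, here is a hand-written C sketch (not kernel code; function and variable names are invented for the example): an unaligned 16-byte chunk is assembled from the two aligned 16-byte loads that straddle it, and a byte-select picks out the wanted window. On little endian the bytes are numbered from the other end of the vector registers, which is why the real code switches lvsl to lvsr and reverses the vperm operand order.

/*
 * Illustrative sketch only, not the kernel implementation.
 * Emulates one step of the vector copy: two aligned 16-byte loads
 * plus a byte-select, which is what vperm does with the control
 * vector produced by lvsl (big endian) or lvsr (little endian).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void copy16_from_unaligned(uint8_t *dst, const uint8_t *src)
{
	uintptr_t off = (uintptr_t)src & 15;	/* misalignment of src */
	const uint8_t *lo = src - off;		/* first aligned 16-byte chunk */
	const uint8_t *hi = lo + 16;		/* second aligned 16-byte chunk */
	uint8_t cat[32];

	/* Assumes both straddling aligned chunks are readable,
	 * just as the aligned lvx loads in the vector loop do. */
	memcpy(cat, lo, 16);
	memcpy(cat + 16, hi, 16);

	/* The byte-select that vperm performs under the lvsl/lvsr control. */
	memcpy(dst, cat + off, 16);
}

int main(void)
{
	_Alignas(16) uint8_t src[64];
	uint8_t dst[16];

	for (int i = 0; i < 64; i++)
		src[i] = (uint8_t)i;

	copy16_from_unaligned(dst, src + 5);	/* deliberately misaligned source */
	printf("%u %u\n", dst[0], dst[15]);	/* prints: 5 20 */
	return 0;
}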
arch/powerpc/lib/copyuser_power7.S

@@ -19,6 +19,14 @@
  */
 #include <asm/ppc_asm.h>
 
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 	.macro err1
 100:
 	.section __ex_table,"a"
@@ -552,13 +560,13 @@ err3;	stw	r7,4(r3)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 err3;	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 err3;	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -566,9 +574,9 @@ err3;	stvx	vr8,r0,r3
 
 5:	bf	cr7*4+2,6f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 err3;	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -576,13 +584,13 @@ err3;	stvx	vr9,r3,r9
 
 6:	bf	cr7*4+1,7f
 err3;	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 err3;	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 err3;	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 err3;	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -611,21 +619,21 @@ err3;	stvx	vr11,r3,r11
 	.align	5
 8:
 err4;	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 err4;	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 err4;	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 err4;	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 err4;	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 err4;	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 err4;	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 err4;	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 err4;	stvx	vr8,r0,r3
 err4;	stvx	vr9,r3,r9
@@ -649,13 +657,13 @@ err4;	stvx	vr15,r3,r16
 
 	bf	cr7*4+1,9f
 err3;	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 err3;	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 err3;	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 err3;	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -665,9 +673,9 @@ err3;	stvx	vr11,r3,r11
 
 9:	bf	cr7*4+2,10f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 err3;	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -675,7 +683,7 @@ err3;	stvx	vr9,r3,r9
 
 10:	bf	cr7*4+3,11f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 err3;	stvx	vr8,r0,r3
 	addi	r3,r3,16
arch/powerpc/lib/memcpy_power7.S

@@ -20,6 +20,15 @@
 #include <asm/ppc_asm.h>
 
 _GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 #ifdef CONFIG_ALTIVEC
 	cmpldi	r5,16
 	cmpldi	cr1,r5,4096
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
 
 5:	bf	cr7*4+2,6f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
 
 6:	bf	cr7*4+1,7f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
 	.align	5
 8:
 	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
 
 	bf	cr7*4+1,9f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
 
 9:	bf	cr7*4+2,10f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
 
 10:	bf	cr7*4+3,11f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16