RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster:

  raid6: vx128x8  gen() 19705 MB/s
  raid6: vx128x8  xor() 11886 MB/s
  raid6: using algorithm vx128x8 gen() 19705 MB/s
  raid6: .... xor() 11886 MB/s, rmw enabled

vs the software algorithms:

  raid6: int64x1  gen()  3018 MB/s
  raid6: int64x1  xor()  1429 MB/s
  raid6: int64x2  gen()  4661 MB/s
  raid6: int64x2  xor()  3143 MB/s
  raid6: int64x4  gen()  5392 MB/s
  raid6: int64x4  xor()  3509 MB/s
  raid6: int64x8  gen()  4441 MB/s
  raid6: int64x8  xor()  3207 MB/s
  raid6: using algorithm int64x4 gen() 5392 MB/s
  raid6: .... xor() 3509 MB/s, rmw enabled

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Родитель
8f149ea6e9
Коммит
474fd6e80f
|
@ -278,6 +278,15 @@
|
|||
VLVG \v, \gr, \index, 3
|
||||
.endm
|
||||
|
||||
/* VECTOR LOAD REGISTER */
/*
 * Copy the full 128-bit contents of vector register \v2 into \v1.
 * Hand-assembled because the toolchain may lack vector support:
 * the two .words carry the 0xE7 opcode prefix and the low 4 bits of
 * each register number; MRXBOPC presumably appends the RXB extension
 * bits and the 0x56 opcode byte — confirm against its definition.
 */
.macro	VLR	v1, v2
	VX_NUM	v1, \v1
	VX_NUM	v2, \v2
	.word	0xE700 | ((v1&15) << 4) | (v2&15)
	.word	0
	MRXBOPC	0, 0x56, v1, v2
.endm
|
||||
|
||||
/* VECTOR LOAD */
|
||||
.macro VL v, disp, index="%r0", base
|
||||
VX_NUM v1, \v
|
||||
|
@ -404,6 +413,16 @@
|
|||
|
||||
/* Vector integer instructions */
|
||||
|
||||
/* VECTOR AND */
/*
 * \vr1 = \vr2 & \vr3, bitwise across the full 128-bit registers.
 * Hand-encoded VRR-c style: low 4 bits of each register number go in
 * the .words; MRXBOPC presumably supplies the RXB bits and the 0x68
 * opcode byte — confirm against its definition.
 */
.macro	VN	vr1, vr2, vr3
	VX_NUM	v1, \vr1
	VX_NUM	v2, \vr2
	VX_NUM	v3, \vr3
	.word	0xE700 | ((v1&15) << 4) | (v2&15)
	.word	((v3&15) << 12)
	MRXBOPC	0, 0x68, v1, v2, v3
.endm
|
||||
|
||||
/* VECTOR EXCLUSIVE OR */
|
||||
.macro VX vr1, vr2, vr3
|
||||
VX_NUM v1, \vr1
|
||||
|
@ -469,6 +488,73 @@
|
|||
MRXBOPC 0, 0x7D, v1, v2, v3
|
||||
.endm
|
||||
|
||||
/* VECTOR REPLICATE IMMEDIATE */
/*
 * Replicate the signed immediate \imm2 into every element of \vr1;
 * \m3 selects the element size (0=byte, 1=halfword, 2=word, 3=doubleword).
 */
.macro	VREPI	vr1, imm2, m3
	VX_NUM	v1, \vr1
	.word	0xE700 | ((v1&15) << 4)
	.word	\imm2
	MRXBOPC	\m3, 0x45, v1
.endm
.macro	VREPIB	vr1, imm2
	VREPI	\vr1, \imm2, 0
.endm
.macro	VREPIH	vr1, imm2
	VREPI	\vr1, \imm2, 1
.endm
.macro	VREPIF	vr1, imm2
	VREPI	\vr1, \imm2, 2
.endm
.macro	VREPIG	vr1, imm2
	/*
	 * Fixed: this previously expanded "VREP", which is a different
	 * instruction (VECTOR REPLICATE, register-source) with a
	 * different operand form and would mis-assemble.  The
	 * doubleword variant must expand VREPI like its siblings.
	 */
	VREPI	\vr1, \imm2, 3
.endm
|
||||
|
||||
/* VECTOR ADD */
/*
 * \vr1 = \vr2 + \vr3, element-wise; \m4 selects the element size.
 * NOTE(review): opcode byte 0xF3 and the field layout presumably
 * match the z/Architecture VA instruction — confirm against the
 * Principles of Operation.
 */
.macro	VA	vr1, vr2, vr3, m4
	VX_NUM	v1, \vr1
	VX_NUM	v2, \vr2
	VX_NUM	v3, \vr3
	.word	0xE700 | ((v1&15) << 4) | (v2&15)
	.word	((v3&15) << 12)
	MRXBOPC	\m4, 0xF3, v1, v2, v3
.endm
/* byte elements (m4 = 0) */
.macro	VAB	vr1, vr2, vr3
	VA	\vr1, \vr2, \vr3, 0
.endm
/* halfword elements (m4 = 1) */
.macro	VAH	vr1, vr2, vr3
	VA	\vr1, \vr2, \vr3, 1
.endm
/* word elements (m4 = 2) */
.macro	VAF	vr1, vr2, vr3
	VA	\vr1, \vr2, \vr3, 2
.endm
/* doubleword elements (m4 = 3) */
.macro	VAG	vr1, vr2, vr3
	VA	\vr1, \vr2, \vr3, 3
.endm
/* single quadword element (m4 = 4) */
.macro	VAQ	vr1, vr2, vr3
	VA	\vr1, \vr2, \vr3, 4
.endm
|
||||
|
||||
/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
/*
 * Arithmetic right shift of each element of \vr2 by the shift count
 * held in the corresponding element of \vr3, result in \vr1; \m4
 * selects the element size.  Opcode byte 0x7A (VESRAV).
 */
.macro	VESRAV	vr1, vr2, vr3, m4
	VX_NUM	v1, \vr1
	VX_NUM	v2, \vr2
	VX_NUM	v3, \vr3
	.word	0xE700 | ((v1&15) << 4) | (v2&15)
	.word	((v3&15) << 12)
	MRXBOPC	\m4, 0x7A, v1, v2, v3
.endm

/* byte elements (m4 = 0) */
.macro	VESRAVB	vr1, vr2, vr3
	VESRAV	\vr1, \vr2, \vr3, 0
.endm
/* halfword elements (m4 = 1) */
.macro	VESRAVH	vr1, vr2, vr3
	VESRAV	\vr1, \vr2, \vr3, 1
.endm
/* word elements (m4 = 2) */
.macro	VESRAVF	vr1, vr2, vr3
	VESRAV	\vr1, \vr2, \vr3, 2
.endm
/* doubleword elements (m4 = 3) */
.macro	VESRAVG	vr1, vr2, vr3
	VESRAV	\vr1, \vr2, \vr3, 3
.endm
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* __ASM_S390_VX_INSN_H */
|
||||
|
|
|
@ -103,6 +103,7 @@ extern const struct raid6_calls raid6_avx2x1;
|
|||
extern const struct raid6_calls raid6_avx2x2;
|
||||
extern const struct raid6_calls raid6_avx2x4;
|
||||
extern const struct raid6_calls raid6_tilegx8;
|
||||
extern const struct raid6_calls raid6_s390vx8;
|
||||
|
||||
struct raid6_recov_calls {
|
||||
void (*data2)(int, size_t, int, int, void **);
|
||||
|
|
|
@ -3,3 +3,4 @@ altivec*.c
|
|||
int*.c
|
||||
tables.c
|
||||
neon?.c
|
||||
s390vx?.c
|
||||
|
|
|
@ -7,6 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
|
|||
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
|
||||
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
|
||||
raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
|
||||
raid6_pq-$(CONFIG_S390) += s390vx8.o
|
||||
|
||||
hostprogs-y += mktables
|
||||
|
||||
|
@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8
|
|||
$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
targets += s390vx8.c
|
||||
$(obj)/s390vx8.c: UNROLL := 8
|
||||
$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
quiet_cmd_mktable = TABLE $@
|
||||
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
|
||||
|
||||
|
|
|
@ -68,6 +68,9 @@ const struct raid6_calls * const raid6_algos[] = {
|
|||
#endif
|
||||
#if defined(CONFIG_TILEGX)
|
||||
&raid6_tilegx8,
|
||||
#endif
|
||||
#if defined(CONFIG_S390)
|
||||
&raid6_s390vx8,
|
||||
#endif
|
||||
&raid6_intx1,
|
||||
&raid6_intx2,
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
/*
|
||||
* raid6_vx$#.c
|
||||
*
|
||||
* $#-way unrolled RAID6 gen/xor functions for s390
|
||||
* based on the vector facility
|
||||
*
|
||||
* Copyright IBM Corp. 2016
|
||||
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
|
||||
*
|
||||
* This file is postprocessed using unroll.awk.
|
||||
*/
|
||||
|
||||
#include <linux/raid/pq.h>
|
||||
#include <asm/fpu/api.h>
|
||||
|
||||
asm(".include \"asm/vx-insn.h\"\n");
|
||||
|
||||
#define NSIZE 16
|
||||
|
||||
/*
 * Preload the two loop-invariant constants used by the syndrome kernels:
 *   v24 = 0x07 in every byte — shift count fed to VESRAVB in MASK()
 *   v25 = 0x1d in every byte — the RAID6 GF(2^8) reduction constant,
 *         ANDed in during the multiply-by-2 step
 * Must run inside a kernel_fpu_begin/end section.
 */
static inline void LOAD_CONST(void)
{
	asm volatile("VREPIB %v24,7");
	asm volatile("VREPIB %v25,0x1d");
}
|
||||
|
||||
/*
 * The SHLBYTE() operation shifts each of the 16 bytes in
 * vector register y left by 1 bit and stores the result in
 * vector register x.
 */
static inline void SHLBYTE(int x, int y)
{
	/* Byte-wise x = y + y equals a left shift by one with no
	 * carry between the byte lanes. */
	asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
}
|
||||
|
||||
/*
 * For each of the 16 bytes in the vector register y the MASK()
 * operation returns 0xFF if the high bit of the byte is 1,
 * or 0x00 if the high bit is 0. The result is stored in vector
 * register x.
 */
static inline void MASK(int x, int y)
{
	/* Arithmetic right shift of each byte by the count in v24
	 * (loaded with 7 by LOAD_CONST) smears the sign bit across
	 * the whole byte. */
	asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
}
|
||||
|
||||
/* Bitwise AND of vector registers: x = y & z. */
static inline void AND(int x, int y, int z)
{
	asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
}
|
||||
|
||||
/* Bitwise exclusive-or of vector registers: x = y ^ z. */
static inline void XOR(int x, int y, int z)
{
	asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
}
|
||||
|
||||
/*
 * Load n consecutive 16-byte blocks from ptr into vector registers
 * x .. x+n-1 with a single VECTOR LOAD MULTIPLE.
 */
static inline void LOAD_DATA(int x, int n, u8 *ptr)
{
	/* Sized dummy type so gcc knows exactly how much memory the
	 * asm reads (the "m" input below). */
	typedef struct { u8 _[16*n]; } addrtype;
	/* Pin the base address in general register 1 for VLM. */
	register addrtype *__ptr asm("1") = (addrtype *) ptr;

	/* NOTE(review): the base register is spelled "%r1" here but
	 * plain "1" in STORE_DATA — presumably assembled identically;
	 * confirm and unify. */
	asm volatile ("VLM %2,%3,0,%r1"
		      : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
}
|
||||
|
||||
/*
 * Store vector registers x .. x+n-1 to n consecutive 16-byte blocks
 * at ptr with a single VECTOR STORE MULTIPLE.
 */
static inline void STORE_DATA(int x, int n, u8 *ptr)
{
	/* Sized dummy type so gcc knows exactly how much memory the
	 * asm writes (the "=m" output below). */
	typedef struct { u8 _[16*n]; } addrtype;
	/* Pin the base address in general register 1 for VSTM. */
	register addrtype *__ptr asm("1") = (addrtype *) ptr;

	asm volatile ("VSTM %2,%3,0,1"
		      : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
}
|
||||
|
||||
/* Copy vector register y into vector register x (VECTOR LOAD REGISTER). */
static inline void COPY_VEC(int x, int y)
{
	asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
}
|
||||
|
||||
/*
 * Generate the P (xor parity) and Q (Reed-Solomon syndrome) blocks
 * for all data disks.  Register convention per unrolled lane $$:
 * v0..  accumulate P, v8..  accumulate Q, v16.. hold the current
 * data block / GF-multiply scratch; v24/v25 are the constants set
 * up by LOAD_CONST().
 */
static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	struct kernel_fpu vxstate;
	u8 **dptr, *p, *q;
	int d, z, z0;

	kernel_fpu_begin(&vxstate, KERNEL_VXR);
	LOAD_CONST();

	dptr = (u8 **) ptrs;
	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	for (d = 0; d < bytes; d += $#*NSIZE) {
		/* Seed P and Q with the highest data disk. */
		LOAD_DATA(0,$#,&dptr[z0][d]);
		COPY_VEC(8+$$,0+$$);
		for (z = z0 - 1; z >= 0; z--) {
			/* Q = Q * 2 in GF(2^8): smear each byte's high
			 * bit (MASK), select the 0x1d reduction for the
			 * overflowing bytes (AND with v25), shift left
			 * by one (SHLBYTE), fold the reduction in (XOR). */
			MASK(16+$$,8+$$);
			AND(16+$$,16+$$,25);
			SHLBYTE(8+$$,8+$$);
			XOR(8+$$,8+$$,16+$$);
			/* Fold the next data disk into both P and Q. */
			LOAD_DATA(16,$#,&dptr[z][d]);
			XOR(0+$$,0+$$,16+$$);
			XOR(8+$$,8+$$,16+$$);
		}
		STORE_DATA(0,$#,&p[d]);
		STORE_DATA(8,$#,&q[d]);
	}
	kernel_fpu_end(&vxstate, KERNEL_VXR);
}
|
||||
|
||||
/*
 * Update P and Q in place for a partial stripe write (rmw): only the
 * data disks in [start, stop] changed.  Same register convention as
 * gen_syndrome: v0.. = P delta, v8.. = Q delta, v16.. = scratch.
 */
static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
					size_t bytes, void **ptrs)
{
	struct kernel_fpu vxstate;
	u8 **dptr, *p, *q;
	int d, z, z0;

	dptr = (u8 **) ptrs;
	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	kernel_fpu_begin(&vxstate, KERNEL_VXR);
	LOAD_CONST();

	for (d = 0; d < bytes; d += $#*NSIZE) {
		/* P/Q data pages */
		/* Accumulate the changed disks, same recurrence as
		 * gen_syndrome (Q gets multiplied by 2 per step). */
		LOAD_DATA(0,$#,&dptr[z0][d]);
		COPY_VEC(8+$$,0+$$);
		for (z = z0 - 1; z >= start; z--) {
			MASK(16+$$,8+$$);
			AND(16+$$,16+$$,25);
			SHLBYTE(8+$$,8+$$);
			XOR(8+$$,8+$$,16+$$);
			LOAD_DATA(16,$#,&dptr[z][d]);
			XOR(0+$$,0+$$,16+$$);
			XOR(8+$$,8+$$,16+$$);
		}
		/* P/Q left side optimization */
		/* Disks below 'start' are unchanged: only keep
		 * multiplying Q by 2, nothing is xor-ed in. */
		for (z = start - 1; z >= 0; z--) {
			MASK(16+$$,8+$$);
			AND(16+$$,16+$$,25);
			SHLBYTE(8+$$,8+$$);
			XOR(8+$$,8+$$,16+$$);
		}
		/* Fold the computed deltas into the stored P and Q. */
		LOAD_DATA(16,$#,&p[d]);
		XOR(16+$$,16+$$,0+$$);
		STORE_DATA(16,$#,&p[d]);
		LOAD_DATA(16,$#,&q[d]);
		XOR(16+$$,16+$$,8+$$);
		STORE_DATA(16,$#,&q[d]);
	}
	kernel_fpu_end(&vxstate, KERNEL_VXR);
}
|
||||
|
||||
/* Algorithm is usable only when the machine has the vector facility. */
static int raid6_s390vx$#_valid(void)
{
	return MACHINE_HAS_VX;
}
|
||||
|
||||
const struct raid6_calls raid6_s390vx$# = {
|
||||
raid6_s390vx$#_gen_syndrome,
|
||||
raid6_s390vx$#_xor_syndrome,
|
||||
raid6_s390vx$#_valid,
|
||||
"vx128x$#",
|
||||
1
|
||||
};
|
Загрузка…
Ссылка в новой задаче