powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

author Simon Guo <wei.guo.simon@gmail.com>

Thu, 7 Jun 2018 01:57:53 +0000 (09:57 +0800)

committer Michael Ellerman <mpe@ellerman.id.au>

Tue, 24 Jul 2018 12:03:21 +0000 (22:03 +1000)
author Simon Guo <wei.guo.simon@gmail.com>
Thu, 7 Jun 2018 01:57:53 +0000 (09:57 +0800)
committer Michael Ellerman <mpe@ellerman.id.au>
Tue, 24 Jul 2018 12:03:21 +0000 (22:03 +1000)
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h

index 7841b8a..769567b 100644 (file)
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -48,8 +48,8 @@ void __trace_opal_exit(long opcode, unsigned long retval);
  /* VMX copying */
  int enter_vmx_usercopy(void);
  int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
  
  /* Traps */
  long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S

index 8fa73b7..e38f956 100644 (file)
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
         std     r0,16(r1)
         stdu    r1,-STACKFRAMESIZE(r1)
-       bl      enter_vmx_copy
+       bl      enter_vmx_ops
         cmpwi   r3,0
         ld      r0,STACKFRAMESIZE+16(r1)
         ld      r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
         addi    r3,r3,128
         bdnz    1b
  
-       b       exit_vmx_copy           /* tail call optimise */
+       b       exit_vmx_ops            /* tail call optimise */
  
  #else
         li      r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S

index 5776f91..be2f792 100644 (file)
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
   */
  #include <asm/ppc_asm.h>
  #include <asm/export.h>
+#include <asm/ppc-opcode.h>
  
  #define off8   r6
  #define off16  r7
@@ -27,12 +28,73 @@
  #define LH     lhbrx
  #define LW     lwbrx
  #define LD     ldbrx
+#define LVS    lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+       vperm _VRT,_VRB,_VRA,_VRC
  #else
  #define LH     lhzx
  #define LW     lwzx
  #define LD     ldx
+#define LVS    lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+       vperm _VRT,_VRA,_VRB,_VRC
  #endif
  
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS  \
+       mflr    r0;     \
+       std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+       std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+       std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+       std     r0,16(r1); \
+       stdu    r1,-STACKFRAMESIZE(r1); \
+       bl      enter_vmx_ops; \
+       cmpwi   cr1,r3,0; \
+       ld      r0,STACKFRAMESIZE+16(r1); \
+       ld      r3,STK_REG(R31)(r1); \
+       ld      r4,STK_REG(R30)(r1); \
+       ld      r5,STK_REG(R29)(r1); \
+       addi    r1,r1,STACKFRAMESIZE; \
+       mtlr    r0
+
+#define EXIT_VMX_OPS \
+       mflr    r0; \
+       std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+       std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+       std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+       std     r0,16(r1); \
+       stdu    r1,-STACKFRAMESIZE(r1); \
+       bl      exit_vmx_ops; \
+       ld      r0,STACKFRAMESIZE+16(r1); \
+       ld      r3,STK_REG(R31)(r1); \
+       ld      r4,STK_REG(R30)(r1); \
+       ld      r5,STK_REG(R29)(r1); \
+       addi    r1,r1,STACKFRAMESIZE; \
+       mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
+ * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
  /*
   * There are 2 categories for memcmp:
   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -40,7 +102,7 @@
   * 2) src/dst has different offset to the 8 bytes boundary. The handlers
   * are named like .Ldiffoffset_xxxx
   */
-_GLOBAL(memcmp)
+_GLOBAL_TOC(memcmp)
         cmpdi   cr1,r5,0
  
         /* Use the short loop if the src/dst addresses are not
@@ -132,7 +194,7 @@ _GLOBAL(memcmp)
         bgt     cr6,.Llong
  
  .Lcmp_lt32bytes:
-       /* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+       /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
         cmpdi   cr5,r5,7
         srdi    r0,r5,3
         ble     cr5,.Lcmp_rest_lt8bytes
@@ -173,6 +235,15 @@ _GLOBAL(memcmp)
         blr
  
  .Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+       /* Try to use vmx loop if length is equal or greater than 4K */
+       cmpldi  cr6,r5,VMX_THRESH
+       bge     cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
         /* At least s1 addr is aligned with 8 bytes */
         li      off8,8
         li      off16,16
@@ -330,7 +401,97 @@ _GLOBAL(memcmp)
         li      r3,-1
         blr
  
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+       /* Enter with src/dst addrs has the same offset with 8 bytes
+        * align boundary
+        */
+       ENTER_VMX_OPS
+       beq     cr1,.Llong_novmx_cmp
+
+3:
+       /* need to check whether r4 has the same offset with r3
+        * for 16 bytes boundary.
+        */
+       xor     r0,r3,r4
+       andi.   r0,r0,0xf
+       bne     .Ldiffoffset_vmx_cmp_start
+
+       /* len is no less than 4KB. Need to align with 16 bytes further.
+        */
+       andi.   rA,r3,8
+       LD      rA,0,r3
+       beq     4f
+       LD      rB,0,r4
+       cmpld   cr0,rA,rB
+       addi    r3,r3,8
+       addi    r4,r4,8
+       addi    r5,r5,-8
+
+       beq     cr0,4f
+       /* save and restore cr0 */
+       mfocrf  r5,128
+       EXIT_VMX_OPS
+       mtocrf  128,r5
+       b       .LcmpAB_lightweight
+
+4:
+       /* compare 32 bytes for each loop */
+       srdi    r0,r5,5
+       mtctr   r0
+       clrldi  r5,r5,59
+       li      off16,16
+
+.balign 16
+5:
+       lvx     v0,0,r3
+       lvx     v1,0,r4
+       VCMPEQUD_RC(v0,v0,v1)
+       bnl     cr6,7f
+       lvx     v0,off16,r3
+       lvx     v1,off16,r4
+       VCMPEQUD_RC(v0,v0,v1)
+       bnl     cr6,6f
+       addi    r3,r3,32
+       addi    r4,r4,32
+       bdnz    5b
+
+       EXIT_VMX_OPS
+       cmpdi   r5,0
+       beq     .Lzero
+       b       .Lcmp_lt32bytes
+
+6:
+       addi    r3,r3,16
+       addi    r4,r4,16
+
+7:
+       /* diff the last 16 bytes */
+       EXIT_VMX_OPS
+       LD      rA,0,r3
+       LD      rB,0,r4
+       cmpld   cr0,rA,rB
+       li      off8,8
+       bne     cr0,.LcmpAB_lightweight
+
+       LD      rA,off8,r3
+       LD      rB,off8,r4
+       cmpld   cr0,rA,rB
+       bne     cr0,.LcmpAB_lightweight
+       b       .Lzero
+#endif
+
  .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+       /* only do vmx ops when the size equal or greater than 4K bytes */
+       cmpdi   cr5,r5,VMX_THRESH
+       bge     cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
         /* now try to align s1 with 8 bytes */
         rlwinm  r6,r3,3,26,28
         beq     .Ldiffoffset_align_s1_8bytes
@@ -356,6 +517,82 @@ _GLOBAL(memcmp)
         /* now s1 is aligned with 8 bytes. */
         cmpdi   cr5,r5,31
         ble     cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+       b       .Llong_novmx_cmp
+#else
         b       .Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+       ENTER_VMX_OPS
+       beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+       /* Firstly try to align r3 with 16 bytes */
+       andi.   r6,r3,0xf
+       li      off16,16
+       beq     .Ldiffoffset_vmx_s1_16bytes_align
  
+       LVS     v3,0,r3
+       LVS     v4,0,r4
+
+       lvx     v5,0,r3
+       lvx     v6,0,r4
+       LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+       LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+       VCMPEQUB_RC(v7,v9,v10)
+       bnl     cr6,.Ldiffoffset_vmx_diff_found
+
+       subfic  r6,r6,16
+       subf    r5,r6,r5
+       add     r3,r3,r6
+       add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+       /* now s1 is aligned with 16 bytes */
+       lvx     v6,0,r4
+       LVS     v4,0,r4
+       srdi    r6,r5,5  /* loop for 32 bytes each */
+       clrldi  r5,r5,59
+       mtctr   r6
+
+.balign        16
+.Ldiffoffset_vmx_32bytesloop:
+       /* the first qw of r4 was saved in v6 */
+       lvx     v9,0,r3
+       LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+       VCMPEQUB_RC(v7,v9,v10)
+       vor     v6,v8,v8
+       bnl     cr6,.Ldiffoffset_vmx_diff_found
+
+       addi    r3,r3,16
+       addi    r4,r4,16
+
+       lvx     v9,0,r3
+       LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+       VCMPEQUB_RC(v7,v9,v10)
+       vor     v6,v8,v8
+       bnl     cr6,.Ldiffoffset_vmx_diff_found
+
+       addi    r3,r3,16
+       addi    r4,r4,16
+
+       bdnz    .Ldiffoffset_vmx_32bytesloop
+
+       EXIT_VMX_OPS
+
+       cmpdi   r5,0
+       beq     .Lzero
+       b       .Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+       EXIT_VMX_OPS
+       /* anyway, the diff will appear in next 16 bytes */
+       li      r5,16
+       b       .Lcmp_lt32bytes
+
+#endif
  EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S

index df7de9d..070cdf6 100644 (file)
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
         std     r0,16(r1)
         stdu    r1,-STACKFRAMESIZE(r1)
-       bl      enter_vmx_copy
+       bl      enter_vmx_ops
         cmpwi   cr1,r3,0
         ld      r0,STACKFRAMESIZE+16(r1)
         ld      r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
  
  15:    addi    r1,r1,STACKFRAMESIZE
         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-       b       exit_vmx_copy           /* tail call optimise */
+       b       exit_vmx_ops            /* tail call optimise */
  
  .Lvmx_unaligned_copy:
         /* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
  
  15:    addi    r1,r1,STACKFRAMESIZE
         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-       b       exit_vmx_copy           /* tail call optimise */
+       b       exit_vmx_ops            /* tail call optimise */
  #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c

index bf925cd..9f34049 100644 (file)
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
         return 0;
  }
  
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
  {
         if (in_interrupt())
                 return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
   * passed a pointer to the destination which we return as required by a
   * memcpy implementation.
   */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
  {
         disable_kernel_altivec();
         preempt_enable();
author	Simon Guo <wei.guo.simon@gmail.com>
	Thu, 7 Jun 2018 01:57:53 +0000 (09:57 +0800)
committer	Michael Ellerman <mpe@ellerman.id.au>
	Tue, 24 Jul 2018 12:03:21 +0000 (22:03 +1000)
arch/powerpc/include/asm/asm-prototypes.h		patch \| blob \| history
arch/powerpc/lib/copypage_power7.S		patch \| blob \| history
arch/powerpc/lib/memcmp_64.S		patch \| blob \| history
arch/powerpc/lib/memcpy_power7.S		patch \| blob \| history
arch/powerpc/lib/vmx-helper.c		patch \| blob \| history