arm64: lib: accelerate crc32_be
author    Kevin Bracey <kevin@bracey.fi>
          Tue, 18 Jan 2022 10:23:51 +0000 (12:23 +0200)
committer Herbert Xu <herbert@gondor.apana.org.au>
          Mon, 31 Jan 2022 00:21:43 +0000 (11:21 +1100)
It makes no sense to leave crc32_be using the generic code while we
only accelerate the little-endian ops.

Even though the big-endian form doesn't map as smoothly onto the arm64
CRC32 instructions, which implement only the bit-reflected
(little-endian) form of the polynomial, we can still speed it up and
avoid hitting the D cache for the generic code's lookup tables.
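
The trick: a big-endian CRC is the exact mirror image of the reflected
computation the CRC32 instructions perform. Bit-reverse the accumulator
and each input chunk (rbit), run the reflected engine, then bit-reverse
the result, and you get the big-endian answer. A minimal C model of the
identity (our sketch, not kernel code; the helper names are made up):

    #include <stdint.h>
    #include <stddef.h>

    /* Bit-reverse a 32-bit value, as the arm64 rbit instruction does. */
    static uint32_t bitrev32(uint32_t x)
    {
        x = ((x & 0x55555555u) << 1) | ((x >> 1) & 0x55555555u);
        x = ((x & 0x33333333u) << 2) | ((x >> 2) & 0x33333333u);
        x = ((x & 0x0f0f0f0fu) << 4) | ((x >> 4) & 0x0f0f0f0fu);
        return __builtin_bswap32(x);  /* GCC/Clang builtin: swap the bytes */
    }

    /* One byte of reflected CRC-32: what the crc32b instruction computes. */
    static uint32_t crc32_le_byte(uint32_t crc, uint8_t byte)
    {
        crc ^= byte;
        for (int i = 0; i < 8; i++)
            crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
        return crc;
    }

    /* Big-endian CRC-32 via the reflected engine: rbit in, rbit out. */
    uint32_t crc32_be_model(uint32_t crc, const uint8_t *p, size_t len)
    {
        crc = bitrev32(crc);
        while (len--)
            /* bitrev32(b) >> 24 is an 8-bit bit reversal, matching the
               bitorder8 macro in the patch (rbit + lsr #24). */
            crc = crc32_le_byte(crc, bitrev32(*p++) >> 24);
        return bitrev32(crc);
    }

The asm below does the same thing a register at a time: the accumulator
stays bit-reversed for the whole loop, so the only per-chunk cost is
reversing the loaded data.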

Tested on Cortex-A53. Without acceleration:

    crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
    crc32: self tests passed, processed 225944 bytes in 192240 nsec
    crc32c: CRC_LE_BITS = 64
    crc32c: self tests passed, processed 112972 bytes in 21360 nsec

With acceleration:

    crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
    crc32: self tests passed, processed 225944 bytes in 53480 nsec
    crc32c: CRC_LE_BITS = 64
    crc32c: self tests passed, processed 112972 bytes in 21480 nsec
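
These figures come from the kernel's CRC32 self-test
(CONFIG_CRC32_SELFTEST), which prints the lines quoted above at init.
As expected, the crc32c timings are essentially unchanged: the
little-endian ops were already accelerated, and this patch only adds
the big-endian path.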

Signed-off-by: Kevin Bracey <kevin@bracey.fi>
Tested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/lib/crc32.S

index 0f9e10e..8340dcc 100644
 
        .arch           armv8-a+crc
 
-       .macro          __crc32, c
+       .macro          byteorder, reg, be
+       .if             \be
+CPU_LE( rev            \reg, \reg      )
+       .else
+CPU_BE( rev            \reg, \reg      )
+       .endif
+       .endm
+
+       .macro          byteorder16, reg, be
+       .if             \be
+CPU_LE( rev16          \reg, \reg      )
+       .else
+CPU_BE( rev16          \reg, \reg      )
+       .endif
+       .endm
+
+       .macro          bitorder, reg, be
+       .if             \be
+       rbit            \reg, \reg
+       .endif
+       .endm
+
+       .macro          bitorder16, reg, be
+       .if             \be
+       rbit            \reg, \reg
+       lsr             \reg, \reg, #16
+       .endif
+       .endm
+
+       .macro          bitorder8, reg, be
+       .if             \be
+       rbit            \reg, \reg
+       lsr             \reg, \reg, #24
+       .endif
+       .endm
+
+       .macro          __crc32, c, be=0
+       bitorder        w0, \be
        cmp             x2, #16
        b.lt            8f                      // less than 16 bytes
 
        add             x8, x8, x1
        add             x1, x1, x7
        ldp             x5, x6, [x8]
-CPU_BE(        rev             x3, x3          )
-CPU_BE(        rev             x4, x4          )
-CPU_BE(        rev             x5, x5          )
-CPU_BE(        rev             x6, x6          )
+       byteorder       x3, \be
+       byteorder       x4, \be
+       byteorder       x5, \be
+       byteorder       x6, \be
+       bitorder        x3, \be
+       bitorder        x4, \be
+       bitorder        x5, \be
+       bitorder        x6, \be
 
        tst             x7, #8
        crc32\c\()x     w8, w0, x3
@@ -55,33 +96,43 @@ CPU_BE(     rev             x6, x6          )
 32:    ldp             x3, x4, [x1], #32
        sub             x2, x2, #32
        ldp             x5, x6, [x1, #-16]
-CPU_BE(        rev             x3, x3          )
-CPU_BE(        rev             x4, x4          )
-CPU_BE(        rev             x5, x5          )
-CPU_BE(        rev             x6, x6          )
+       byteorder       x3, \be
+       byteorder       x4, \be
+       byteorder       x5, \be
+       byteorder       x6, \be
+       bitorder        x3, \be
+       bitorder        x4, \be
+       bitorder        x5, \be
+       bitorder        x6, \be
        crc32\c\()x     w0, w0, x3
        crc32\c\()x     w0, w0, x4
        crc32\c\()x     w0, w0, x5
        crc32\c\()x     w0, w0, x6
        cbnz            x2, 32b
-0:     ret
+0:     bitorder        w0, \be
+       ret
 
 8:     tbz             x2, #3, 4f
        ldr             x3, [x1], #8
-CPU_BE(        rev             x3, x3          )
+       byteorder       x3, \be
+       bitorder        x3, \be
        crc32\c\()x     w0, w0, x3
 4:     tbz             x2, #2, 2f
        ldr             w3, [x1], #4
-CPU_BE(        rev             w3, w3          )
+       byteorder       w3, \be
+       bitorder        w3, \be
        crc32\c\()w     w0, w0, w3
 2:     tbz             x2, #1, 1f
        ldrh            w3, [x1], #2
-CPU_BE(        rev16           w3, w3          )
+       byteorder16     w3, \be
+       bitorder16      w3, \be
        crc32\c\()h     w0, w0, w3
 1:     tbz             x2, #0, 0f
        ldrb            w3, [x1]
+       bitorder8       w3, \be
        crc32\c\()b     w0, w0, w3
-0:     ret
+0:     bitorder        w0, \be
+       ret
        .endm
 
        .align          5
@@ -99,3 +150,11 @@ alternative_if_not ARM64_HAS_CRC32
 alternative_else_nop_endif
        __crc32         c
 SYM_FUNC_END(__crc32c_le)
+
+       .align          5
+SYM_FUNC_START(crc32_be)
+alternative_if_not ARM64_HAS_CRC32
+       b               crc32_be_base
+alternative_else_nop_endif
+       __crc32         be=1
+SYM_FUNC_END(crc32_be)
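
A closing note on the rev-then-rbit pair in the loop above: rev swaps
the bytes, and rbit then reverses all 64 bits, swapping the bytes back
while reversing each byte's bits. The loaded data therefore keeps its
byte order with every byte bit-reversed, which lets the reflected
crc32x instruction consume each memory byte MSB-first, i.e. big-endian
style. A C model of the combined effect (our sketch, not kernel code):

    #include <stdint.h>

    /* rbit(rev(x)): reverse the bits within each byte, keep byte order. */
    uint64_t bytewise_bitrev64(uint64_t x)
    {
        x = ((x & 0x5555555555555555u) << 1) | ((x >> 1) & 0x5555555555555555u);
        x = ((x & 0x3333333333333333u) << 2) | ((x >> 2) & 0x3333333333333333u);
        x = ((x & 0x0f0f0f0f0f0f0f0fu) << 4) | ((x >> 4) & 0x0f0f0f0f0f0f0f0fu);
        return x;
    }

On a little-endian kernel this costs two extra instructions per 64-bit
word in the hot loop; on a big-endian kernel the CPU_LE() rev is
assembled out and only the rbit remains.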