crypto: x86/cast6 - Use RIP-relative addressing
author Ard Biesheuvel <ardb@kernel.org>
Wed, 12 Apr 2023 11:00:28 +0000 (13:00 +0200)
committer Herbert Xu <herbert@gondor.apana.org.au>
Thu, 20 Apr 2023 10:20:04 +0000 (18:20 +0800)
Prefer RIP-relative addressing where possible, which removes the need
for boot-time relocation fixups.
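
For illustration only (a sketch using the s1 table and the RID1/RID2
register aliases from the code below; %eax stands in for the macro's
dst parameter), an absolute indexed load such as

	movl	s1(, RID1, 4), %eax

carries an absolute address that must be fixed up when the kernel is
relocated at boot, whereas the RIP-relative form materializes the table
address first and then indexes off it, needing no fixup:

	leaq	s1(%rip), RID2
	movl	(RID2, RID1, 4), %eax

The extra leaq needs a scratch register, which is why the lookup_32bit
hunks below alternate between RID1 and RID2.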

Co-developed-by: Thomas Garnier <thgarnie@chromium.org>
Signed-off-by: Thomas Garnier <thgarnie@chromium.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/cast6-avx-x86_64-asm_64.S

index 82b716f..9e86d46 100644
 
 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
        movzbl          src ## bh,     RID1d;    \
+       leaq            s1(%rip),      RID2;     \
+       movl            (RID2,RID1,4), dst ## d; \
        movzbl          src ## bl,     RID2d;    \
+       leaq            s2(%rip),      RID1;     \
+       op1             (RID1,RID2,4), dst ## d; \
        shrq $16,       src;                     \
-       movl            s1(, RID1, 4), dst ## d; \
-       op1             s2(, RID2, 4), dst ## d; \
        movzbl          src ## bh,     RID1d;    \
+       leaq            s3(%rip),      RID2;     \
+       op2             (RID2,RID1,4), dst ## d; \
        movzbl          src ## bl,     RID2d;    \
        interleave_op(il_reg);                   \
-       op2             s3(, RID1, 4), dst ## d; \
-       op3             s4(, RID2, 4), dst ## d;
+       leaq            s4(%rip),      RID1;     \
+       op3             (RID1,RID2,4), dst ## d;
 
 #define dummy(d) /* do nothing */
 
        qop(RD, RC, 1);
 
 #define shuffle(mask) \
-       vpshufb         mask,            RKR, RKR;
+       vpshufb         mask(%rip),            RKR, RKR;
 
 #define preload_rkr(n, do_mask, mask) \
-       vbroadcastss    .L16_mask,                RKR;      \
+       vbroadcastss    .L16_mask(%rip),          RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           (kr+n*16)(CTX),           RKR, RKR; \
        do_mask(mask);
@@ -258,9 +262,9 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
 
        movq %rdi, CTX;
 
-       vmovdqa .Lbswap_mask, RKM;
-       vmovd .Lfirst_mask, R1ST;
-       vmovd .L32_mask, R32;
+       vmovdqa .Lbswap_mask(%rip), RKM;
+       vmovd .Lfirst_mask(%rip), R1ST;
+       vmovd .L32_mask(%rip), R32;
 
        inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
        inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -284,7 +288,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
        popq %rbx;
        popq %r15;
 
-       vmovdqa .Lbswap_mask, RKM;
+       vmovdqa .Lbswap_mask(%rip), RKM;
 
        outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
        outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -306,9 +310,9 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
 
        movq %rdi, CTX;
 
-       vmovdqa .Lbswap_mask, RKM;
-       vmovd .Lfirst_mask, R1ST;
-       vmovd .L32_mask, R32;
+       vmovdqa .Lbswap_mask(%rip), RKM;
+       vmovd .Lfirst_mask(%rip), R1ST;
+       vmovd .L32_mask(%rip), R32;
 
        inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
        inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -332,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
        popq %rbx;
        popq %r15;
 
-       vmovdqa .Lbswap_mask, RKM;
+       vmovdqa .Lbswap_mask(%rip), RKM;
        outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
        outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);