mpn/alpha/mode1o.asm

   1 dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder
   2
   3 dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C      cycles/limb
  24 C EV4:    47
  25 C EV5:    30
  26 C EV6:    15
  27
  28
  29 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
  30 C                                mp_limb_t c)
  31 C
  32 C This code follows the "alternate" code in mpn/generic/mode1o.c,
  33 C eliminating cbit+climb from the dependent chain.  This leaves,
  34 C
  35 C        ev4   ev5   ev6
  36 C         1     3     1    subq   y = x - h
  37 C        23    13     7    mulq   q = y * inverse
  38 C        23    14     7    umulh  h = high (q * d)
  39 C        --    --    --
  40 C        47    30    15
  41 C
  42 C In each case, the load latency, loop control, and extra carry bit handling
  43 C hide under the multiply latencies.  Those latencies are long enough that
  44 C we don't need to worry about alignment or pairing to squeeze out
  45 C performance.
  46 C
  47 C For the first limb, some of the loop code is broken out and scheduled back
  48 C since it can be done earlier.
  49 C
  50 C   - The first ldq src[0] is near the start of the routine, for maximum
  51 C     time from memory.
  52 C
  53 C   - The subq y=x-climb can be done without waiting for the inverse.
  54 C
  55 C   - The mulq y*inverse is replicated after the final subq for the inverse,
  56 C     instead of branching to the mulq in the main loop.  On ev4 a branch
  57 C     there would cost cycles, but we can hide them under the mulq latency.
  58 C
  59 C For the last limb, high<divisor is tested and if that's true a subtract
  60 C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
  61 C data-dependent branch, but we're waiting for umulh so any penalty should
  62 C hide there.  The multiplies saved would be worth the cost anyway.
  63 C
  64 C Enhancements:
  65 C
  66 C For size==1, a plain division (done bitwise say) might be faster than
  67 C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
  68 C ev5.  A call to gcc __remqu might be a possibility.
  69
  70 ASM_START()
  71 PROLOGUE(mpn_modexact_1c_odd,gp)
  72
  73         C r16   src
  74         C r17   size
  75         C r18   d
  76         C r19   c
  77
  78         LEA(r0, binvert_limb_table)
  79         srl     r18, 1, r20             C d >> 1
  80
  81         and     r20, 127, r20           C idx = d>>1 & 0x7F
  82
  83         addq    r0, r20, r21            C table + idx
  84
  85 ifelse(bwx_available_p,1,
  86 `       ldbu    r20, 0(r21)             C table[idx], inverse 8 bits
  87 ',`
  88         ldq_u   r20, 0(r21)             C table[idx] qword
  89         extbl   r20, r21, r20           C table[idx], inverse 8 bits
  90 ')
  91
  92         mull    r20, r20, r7            C i*i
  93         addq    r20, r20, r20           C 2*i
  94
  95         ldq     r2, 0(r16)              C x = s = src[0]
  96         lda     r17, -1(r17)            C size--
  97         clr     r0                      C initial cbit=0
  98
  99         mull    r7, r18, r7             C i*i*d
 100
 101         subq    r20, r7, r20            C 2*i-i*i*d, inverse 16 bits
 102
 103         mull    r20, r20, r7            C i*i
 104         addq    r20, r20, r20           C 2*i
 105
 106         mull    r7, r18, r7             C i*i*d
 107
 108         subq    r20, r7, r20            C 2*i-i*i*d, inverse 32 bits
 109
 110         mulq    r20, r20, r7            C i*i
 111         addq    r20, r20, r20           C 2*i
 112
 113         mulq    r7, r18, r7             C i*i*d
 114         subq    r2, r19, r3             C y = x - climb
 115
 116         subq    r20, r7, r20            C inv = 2*i-i*i*d, inverse 64 bits
 117
 118 ASSERT(r7, C should have d*inv==1 mod 2^64
 119 `       mulq    r18, r20, r7
 120         cmpeq   r7, 1, r7')
 121
 122         mulq    r3, r20, r4             C first q = y * inv
 123
 124         beq     r17, L(one)             C if size==1
 125         br      L(entry)
 126
 127
 128 L(top):
 129         C r0    cbit
 130         C r16   src, incrementing
 131         C r17   size, decrementing
 132         C r18   d
 133         C r19   climb
 134         C r20   inv
 135
 136         ldq     r1, 0(r16)              C s = src[i]
 137         subq    r1, r0, r2              C x = s - cbit
 138         cmpult  r1, r0, r0              C new cbit = s < cbit
 139
 140         subq    r2, r19, r3             C y = x - climb
 141
 142         mulq    r3, r20, r4             C q = y * inv
 143 L(entry):
 144         cmpult  r2, r19, r5             C cbit2 = x < climb
 145         addq    r5, r0, r0              C cbit += cbit2
 146         lda     r16, 8(r16)             C src++
 147         lda     r17, -1(r17)            C size--
 148
 149         umulh   r4, r18, r19            C climb = q * d
 150         bne     r17, L(top)             C while 2 or more limbs left
 151
 152
 153
 154         C r0    cbit
 155         C r18   d
 156         C r19   climb
 157         C r20   inv
 158
 159         ldq     r1, 0(r16)              C s = src[size-1] high limb
 160
 161         cmpult  r1, r18, r2             C test high<divisor
 162         bne     r2, L(skip)             C skip if so
 163
 164         C can't skip a division, repeat loop code
 165
 166         subq    r1, r0, r2              C x = s - cbit
 167         cmpult  r1, r0, r0              C new cbit = s < cbit
 168
 169         subq    r2, r19, r3             C y = x - climb
 170
 171         mulq    r3, r20, r4             C q = y * inv
 172 L(one):
 173         cmpult  r2, r19, r5             C cbit2 = x < climb
 174         addq    r5, r0, r0              C cbit += cbit2
 175
 176         umulh   r4, r18, r19            C climb = q * d
 177
 178         addq    r19, r0, r0             C return climb + cbit
 179         ret     r31, (r26), 1
 180
 181
 182         ALIGN(8)
 183 L(skip):
 184         C with high<divisor, the final step can be just (cbit+climb)-s and
 185         C an addback of d if that underflows
 186
 187         addq    r19, r0, r19            C c = climb + cbit
 188
 189         subq    r19, r1, r2             C c - s
 190         cmpult  r19, r1, r3             C c < s
 191
 192         addq    r2, r18, r0             C return c-s + divisor
 193
 194         cmoveq  r3, r2, r0              C return c-s if no underflow
 195         ret     r31, (r26), 1
 196
 197 EPILOGUE()
 198 ASM_END()