mpn/ia64/dive_1.asm

   1 dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22 C            cycles/limb
  23 C Itanium:      16
  24 C Itanium 2:     8
  25
  26 C INPUT PARAMETERS
  27 define(`rp', `r32')
  28 define(`up', `r33')
  29 define(`n',  `r34')
  30 define(`divisor', `r35')
  31
  32 define(`lshift', `r24')
  33 define(`rshift', `r25')
  34
  35 C This code is a bit messy, and not as similar to mode1o.asm as desired.
  36
  37 C The critical path during initialization is for computing the inverse of the
  38 C divisor.  Since odd divisors are probably common, we conditionally execute
  39 C the initial count_traling_zeros code and the downshift.
  40
  41 C Possible improvement: Merge more of the feed-in code into the inverse
  42 C computation.
  43
  44 ASM_START()
  45         .text
  46         .align  32
  47 .Ltab:
  48 data1   0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
  49 data1   0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
  50 data1   0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
  51 data1   0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
  52 data1   0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
  53 data1   0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
  54 data1   0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
  55 data1   0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
  56 data1   0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
  57 data1   0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
  58 data1   0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
  59 data1   0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
  60 data1   0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
  61 data1   0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
  62 data1   0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
  63 data1   0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
  64
  65
  66 PROLOGUE(mpn_divexact_1)
  67         .prologue
  68         .save           ar.lc, r2
  69         .body
  70
  71  {.mmi; add             r8 = -1, divisor        C M0
  72         nop             0                       C M1
  73         tbit.z          p8, p9 = divisor, 0     C I0
  74 }
  75 ifdef(`HAVE_ABI_32',
  76 `       addp4           rp = 0, rp              C M2  rp extend
  77         addp4           up = 0, up              C M3  up extend
  78         sxt4            n = n')                 C I1  size extend
  79         ;;
  80 .Lhere:
  81  {.mmi; ld8             r20 = [up], 8           C M0  up[0]
  82   (p8)  andcm           r8 = r8, divisor        C M1
  83         mov             r15 = ip                C I0  .Lhere
  84         ;;
  85 }{.mii
  86         .pred.rel "mutex", p8, p9
  87   (p9)  mov             rshift = 0              C M0
  88   (p8)  popcnt          rshift = r8             C I0 r8 = cnt_lo_zeros(divisor)
  89         cmp.eq          p6, p10 = 1, n          C I1
  90         ;;
  91 }{.mii; add             r9 = .Ltab-.Lhere, r15  C M0
  92   (p8)  shr.u           divisor = divisor, rshift C I0
  93         nop             0                       C I1
  94         ;;
  95 }{.mmi; add             n = -4, n               C M0  size-1
  96   (p10) ld8             r21 = [up], 8           C M1  up[1]
  97         mov             r14 = 2                 C M1  2
  98 }{.mfi; setf.sig        f6 = divisor            C M2  divisor
  99         mov             f9 = f0                 C M3  carry             FIXME
 100         zxt1            r3 = divisor            C I1  divisor low byte
 101         ;;
 102 }{.mmi; add             r3 = r9, r3             C M0  table offset ip and index
 103         sub             r16 = 0, divisor        C M1  -divisor
 104         mov             r2 = ar.lc              C I0
 105 }{.mmi; sub             lshift = 64, rshift     C M2
 106         setf.sig        f13 = r14               C M3  2 in significand
 107         mov             r17 = -1                C I1  -1
 108         ;;
 109 }{.mmi; ld1             r3 = [r3]               C M0  inverse, 8 bits
 110         nop             0                       C M1
 111         mov             ar.lc = n               C I0  size-1 loop count
 112 }{.mmi; setf.sig        f12 = r16               C M2  -divisor
 113         setf.sig        f8 = r17                C M3  -1
 114         cmp.eq          p7, p0 = -2, n          C I1
 115         ;;
 116 }{.mmi; setf.sig        f7 = r3                 C M2  inverse, 8 bits
 117         cmp.eq          p8, p0 = -1, n          C M0
 118         shr.u           r23 = r20, rshift       C I0
 119         ;;
 120 }
 121
 122         C f6    divisor
 123         C f7    inverse, being calculated
 124         C f8    -1, will be -inverse
 125         C f9    carry
 126         C f12   -divisor
 127         C f13   2
 128         C f14   scratch
 129
 130         xmpy.l          f14 = f13, f7           C Newton 2*i
 131         xmpy.l          f7 = f7, f7             C Newton i*i
 132         ;;
 133         xma.l           f7 = f7, f12, f14       C Newton i*i*-d + 2*i, 16 bits
 134         ;;
 135         setf.sig        f10 = r23               C speculative, used iff n = 1
 136         xmpy.l          f14 = f13, f7           C Newton 2*i
 137         shl             r22 = r21, lshift       C speculative, used iff n > 1
 138         xmpy.l          f7 = f7, f7             C Newton i*i
 139         ;;
 140         or              r31 = r22, r23          C speculative, used iff n > 1
 141         xma.l           f7 = f7, f12, f14       C Newton i*i*-d + 2*i, 32 bits
 142         shr.u           r23 = r21, rshift       C speculative, used iff n > 1
 143         ;;
 144         setf.sig        f11 = r31               C speculative, used iff n > 1
 145         xmpy.l          f14 = f13, f7           C Newton 2*i
 146         xmpy.l          f7 = f7, f7             C Newton i*i
 147         ;;
 148         xma.l           f7 = f7, f12, f14       C Newton i*i*-d + 2*i, 64 bits
 149
 150   (p7)  br.cond.dptk    .Ln2
 151   (p10) br.cond.dptk    .grt3
 152         ;;
 153
 154 .Ln1:   xmpy.l          f12 = f10, f7           C q = ulimb * inverse
 155         br              .Lx1
 156
 157 .Ln2:
 158         xmpy.l          f8 = f7, f8             C -inverse = inverse * -1
 159         xmpy.l          f12 = f11, f7           C q = ulimb * inverse
 160         setf.sig        f11 = r23
 161         br              .Lx2
 162
 163 .grt3:
 164         ld8             r21 = [up], 8           C up[2]
 165         xmpy.l          f8 = f7, f8             C -inverse = inverse * -1
 166         ;;
 167         shl             r22 = r21, lshift
 168         ;;
 169         xmpy.l          f12 = f11, f7           C q = ulimb * inverse
 170         ;;
 171         or              r31 = r22, r23
 172         shr.u           r23 = r21, rshift
 173         ;;
 174         setf.sig        f11 = r31
 175   (p8)  br.cond.dptk    .Lx3                    C branch for n = 3
 176         ;;
 177         ld8             r21 = [up], 8
 178         br              .Lent
 179
 180 .Loop:  ld8             r21 = [up], 8
 181         xma.l           f12 = f9, f8, f10       C q = c * -inverse + si
 182         ;;
 183 .Lent:  add             r16 = 160, up
 184         shl             r22 = r21, lshift
 185         ;;
 186         stf8            [rp] = f12, 8
 187         xma.hu          f9 = f12, f6, f9        C c = high(q * divisor + c)
 188         xmpy.l          f10 = f11, f7           C si = ulimb * inverse
 189         ;;
 190         or              r31 = r22, r23
 191         shr.u           r23 = r21, rshift
 192         ;;
 193         lfetch          [r16]
 194         setf.sig        f11 = r31
 195         br.cloop.sptk.few.clr .Loop
 196
 197
 198         xma.l           f12 = f9, f8, f10       C q = c * -inverse + si
 199         ;;
 200 .Lx3:   stf8            [rp] = f12, 8
 201         xma.hu          f9 = f12, f6, f9        C c = high(q * divisor + c)
 202         xmpy.l          f10 = f11, f7           C si = ulimb * inverse
 203         ;;
 204         setf.sig        f11 = r23
 205         ;;
 206         xma.l           f12 = f9, f8, f10       C q = c * -inverse + si
 207         ;;
 208 .Lx2:   stf8            [rp] = f12, 8
 209         xma.hu          f9 = f12, f6, f9        C c = high(q * divisor + c)
 210         xmpy.l          f10 = f11, f7           C si = ulimb * inverse
 211         ;;
 212         xma.l           f12 = f9, f8, f10       C q = c * -inverse + si
 213         ;;
 214 .Lx1:   stf8            [rp] = f12, 8
 215         mov             ar.lc = r2              C I0
 216         br.ret.sptk.many b0
 217 EPILOGUE()