mpn/ia64/gcd_1.asm

   1 dnl  Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
   2
   3 dnl  Copyright 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C           cycles/bitpair (1x1 gcd)
  24 C Itanium:      14 (approx)
  25 C Itanium 2:     6.3
  26
  27
  28 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
  29 C
  30 C The entry sequence is designed to expect xsize>1 and hence a modexact
  31 C call.  This ought to be more common than a 1x1 operation.  Our critical
  32 C path is thus stripping factors of 2 from y, calling modexact, then
  33 C stripping factors of 2 from the x remainder returned.
  34 C
  35 C The common factors of 2 between x and y must be determined using the
  36 C original x, not the remainder from the modexact.  This is done with
  37 C x_orig which is xp[0].  There's plenty of time to do this while the rest
  38 C of the modexact etc is happening.
  39 C
  40 C It's possible xp[0] is zero.  In this case the trailing zeros calculation
  41 C popc((x-1)&~x) gives 63, and that's clearly no less than what y will
  42 C have, making min(x_twos,y_twos) == y_twos.
  43 C
  44 C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
  45 C stripping factors of 2 from abs(x-y).  Those factors of two are
  46 C determined from just y-x, without the abs(), since there's the same
  47 C number of trailing zeros on n or -n in twos complement.  That makes the
  48 C dependent chain
  49 C
  50 C       cycles
  51 C         1    sub     x-y and x-y-1
  52 C         3    andcm   (x-y-1)&~(x-y)
  53 C         2    popcnt  trailing zeros
  54 C         3    shr.u   strip abs(x-y)
  55 C        ---
  56 C         9
  57 C
  58 C The selection of x-y versus y-x for abs(x-y), and the selection of the
  59 C minimum of x and y, is done in parallel with the above.
  60 C
  61 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
  62 C average, hence the final 6.3 cycles/bitpair.
  63 C
  64 C The loop is not as fast as one might hope, since there's extra latency
  65 C from andcm going across to the `multimedia' popcnt, and vice versa from
  66 C multimedia shr.u back to the integer sub.
  67 C
  68 C The loop branch is .sptk.clr since we usually expect a good number of
  69 C iterations, and the iterations are data dependent so it's unlikely past
  70 C results will predict anything much about the future.
  71 C
  72 C Not done:
  73 C
  74 C An alternate algorithm which didn't strip all twos, but instead applied
  75 C tbit and predicated extr on x, and then y, was attempted.  The loop was 6
  76 C cycles, but the algorithm is an average 1.25 iterations per bitpair for a
  77 C total 7.25 c/bp, which is slower than the current approach.
  78 C
  79 C Alternatives:
  80 C
  81 C Perhaps we could do something tricky by extracting a few high bits and a
  82 C few low bits from the operands, and looking up a table which would give a
  83 C set of predicates to control some shifts or subtracts or whatever.  That
  84 C could knock off multiple bits per iteration.
  85 C
  86 C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr
  87 C only going down I0), perhaps it'd be possible to shift left instead,
  88 C using add.  That would mean keeping track of the lowest not-yet-zeroed
  89 C bit, using some sort of mask.
  90 C
  91 C Itanium-1:
  92 C
  93 C This code is not designed for itanium-1 and in fact doesn't run well on
  94 C that chip.  The loop seems to be about 21 cycles, probably because we end
  95 C up with a 10 cycle replay for not forcibly scheduling the shr.u latency.
  96 C Lack of branch hints might introduce a couple of bubbles too.
  97 C
  98
  99 ASM_START()
 100         .explicit                               C What does this mean?
 101
 102 C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
 103         .global mpn_modexact_1c_odd
 104         .type   mpn_modexact_1c_odd,@function
 105
 106 PROLOGUE(mpn_gcd_1)
 107
 108                 C r32   xp
 109                 C r33   xsize
 110                 C r34   y
 111
 112 define(x,           r8)
 113 define(xp_orig,     r32)
 114 define(xsize,       r33)
 115 define(y,           r34)  define(inputs, 3)
 116 define(save_rp,     r35)
 117 define(save_pfs,    r36)
 118 define(x_orig,      r37)
 119 define(x_orig_one,  r38)
 120 define(y_twos,      r39)  define(locals, 5)
 121 define(out_xp,      r40)
 122 define(out_xsize,   r41)
 123 define(out_divisor, r42)
 124 define(out_carry,   r43)  define(outputs, 4)
 125
 126         .prologue
 127 { .mmi;
 128 ifdef(`HAVE_ABI_32',
 129 `               addp4   r9 = 0, xp_orig   define(xp,r9)',       C M0
 130 `                                         define(xp,xp_orig)')
 131         .save ar.pfs, save_pfs
 132                 alloc   save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2
 133         .save rp, save_rp
 134                 mov     save_rp = b0            C I0
 135 }{      .body
 136                 add     r10 = -1, y             C M3  y-1
 137 }               ;;
 138
 139 { .mmi;         ld8     x = [xp]                C M0  x = xp[0] if no modexact
 140                 ld8     x_orig = [xp]           C M1  orig x for common twos
 141                 cmp.ne  p6,p0 = 1, xsize        C I0
 142 }{ .mmi;        andcm   y_twos = r10, y         C M2  (y-1)&~y
 143                 mov     out_xp = xp_orig        C M3
 144                 mov     out_xsize = xsize       C I1
 145 }               ;;
 146
 147                 mov     out_carry = 0
 148
 149                 C
 150
 151                 popcnt  y_twos = y_twos         C I0  y twos
 152                 ;;
 153
 154                 C
 155
 156 { .mmi;         add     x_orig_one = -1, x_orig C M0  orig x-1
 157                 shr.u   out_divisor = y, y_twos C I0  y without twos
 158 }{              shr.u   y = y, y_twos           C I1  y without twos
 159         (p6)    br.call.sptk.many b0 = mpn_modexact_1c_odd  C if xsize>1
 160 }               ;;
 161
 162                 C modexact can leave x==0
 163 { .mmi;         cmp.eq  p6,p0 = 0, x            C M0  if {xp,xsize} % y == 0
 164                 andcm   x_orig = x_orig_one, x_orig     C M1  orig (x-1)&~x
 165                 add     r9 = -1, x              C I0  x-1
 166 }               ;;
 167
 168 { .mmi;         andcm   r9 = r9, x              C M0  (x-1)&~x
 169                 mov     b0 = save_rp            C I0
 170 }               ;;
 171
 172                 C
 173
 174                 popcnt  x_orig = x_orig         C I0  orig x twos
 175
 176                 popcnt  r9 = r9                 C I0  x twos
 177                 ;;
 178
 179                 C
 180
 181 {               cmp.lt  p7,p0 = x_orig, y_twos  C M0  orig x_twos < y_twos
 182                 shr.u   x = x, r9               C I0  x odd
 183 }               ;;
 184
 185 {       (p7)    mov     y_twos = x_orig         C M0  common twos
 186                 add     r10 = -1, y             C I0  y-1
 187         (p6)    br.dpnt.few .Ldone_y            C B0  x%y==0 then result y
 188 }               ;;
 189
 190                 C
 191
 192
 193                 C No noticable difference in speed for the loop aligned to
 194                 C 32 or just 16.
 195 .Ltop:
 196                 C r8    x
 197                 C r10  y-1
 198                 C r34   y
 199                 C r38   common twos, for use at end
 200
 201 {  .mmi;        cmp.gtu p8,p9 = x, y    C M0  x>y
 202                 cmp.ne  p10,p0 = x, y   C M1  x==y
 203                 sub     r9 = y, x       C I0  d = y - x
 204 }{ .mmi;        sub     r10 = r10, x    C M2  d-1 = y - x - 1
 205 }               ;;
 206
 207 { .mmi; .pred.rel "mutex", p8, p9
 208         (p8)    sub     x = x, y        C M0  x>y  use x=x-y, y unchanged
 209         (p9)    mov     y = x           C M1  y>=x use y=x
 210         (p9)    mov     x = r9          C I0  y>=x use x=y-x
 211 }{ .mmi;        andcm   r9 = r10, r9    C M2  (d-1)&~d
 212                 ;;
 213
 214                 add     r10 = -1, y     C M0  new y-1
 215                 popcnt  r9 = r9         C I0  twos on x-y
 216 }               ;;
 217
 218 {               shr.u   x = x, r9       C I0   new x without twos
 219         (p10)   br.sptk.few.clr .Ltop
 220 }               ;;
 221
 222
 223
 224                 C result is y
 225 .Ldone_y:
 226                 shl     r8 = y, y_twos          C I   common factors of 2
 227                 ;;
 228                 mov     ar.pfs = save_pfs       C I0
 229                 br.ret.sptk.many b0
 230
 231 EPILOGUE()