mpn/alpha/divrem_2.asm

   1 dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
   2
   3 dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22 C               norm    frac
  23 C ev4
  24 C ev5           70      70
  25 C ev6           29      29
  26
  27 C TODO
  28 C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
  29 C    any registers (thus save ~10 cycles per call).
  30 C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
  31 C    or two.
  32 C  * Check cluster delays (for ev6).  We very likely could save some cycles.
  33 C  * Use branch-free code for computing di.
  34
  35 C INPUT PARAMETERS
  36 define(`qp',            `r16')
  37 define(`fn',            `r17')
  38 define(`up_param',      `r18')
  39 define(`un_param',      `r19')
  40 define(`dp',            `r20')
  41
  42 C mpn_divrem_2 (qp, fn, up_param, un_param, dp)
  43 C
  44 C Divide the un_param limbs at up_param, extended below by fn zero limbs, by
  45 C the two-limb divisor at dp (d0 = dp[0], d1 = dp[1]).  Quotient limbs are
  46 C stored through qp, the two remainder limbs are written back to up[0] and
  47 C up[1], and the most significant quotient limb (0 or 1) is returned in r0.
  48 C
  49 C REGISTER USAGE
  50 C  r0           di, the value returned by mpn_invert_limb for d1, adjusted
  51 C               downwards to account for d0
  52 C  r9, r11      n1, n2: the two high limbs of the running remainder
  53 C  r10, r12     d0, d1
  54 C  r13          address of the next numerator limb to load, moving down
  55 C  r15          fn
  56 C  r16          address for the next quotient limb, moving down
  57 C  r19          loop index, fn + un_param - 3 down to 0
  58 C
  59 C STACK FRAME
  60 C   0(r30)      r26
  61 C   8-40(r30)   r9-r13
  62 C  48(r30)      r19 across the mpn_invert_limb call; r19 is an argument
  63 C               register, so the callee need not preserve it
  64 C  56(r30)      r15
  65 C  64(r30)      qp
  66 C  72(r30)      most significant quotient limb
  67
  68 ASM_START()
  69 PROLOGUE(mpn_divrem_2)
  70         ldgp    r29, 0(r27)
  71         lda     r30, -80(r30)
  72         stq     r26, 0(r30)
  73         stq     r9, 8(r30)
  74         stq     r10, 16(r30)
  75         stq     r11, 24(r30)
  76         stq     r12, 32(r30)
  77         stq     r13, 40(r30)
  78         stq     r15, 56(r30)
  79         .prologue       1
  80         stq     r16, 64(r30)            C save qp; r16 carries d1 into the call
  81         bis     r31, r17, r15           C r15 = fn
  82         s8addq  r19, r18, r13
  83         lda     r13, -24(r13)           C r13 = &up[un - 3]
  84         ldq     r12, 8(r20)             C d1
  85         ldq     r10, 0(r20)             C d0
  86         ldq     r11, 16(r13)            C n2 = up[un - 1]
  87         ldq     r9, 8(r13)              C n1 = up[un - 2]
  88
  89         bis     r31, r31, r3            C most_significant_q_limb = 0
  90         cmpult  r11, r12, r1
  91         bne     r1, L(L8)               C n2 < d1
  92         cmpule  r11, r12, r1
  93         cmpult  r9, r10, r2             C borrow from n1 - d0
  94         and     r1, r2, r1
  95         bne     r1, L(L8)               C n2 == d1 and n1 < d0
  96         subq    r11, r12, r11           C n2 -= d1
  97         subq    r11, r2, r11            C n2 -= borrow
  98         subq    r9, r10, r9             C n1 -= d0
  99         lda     r3, 1(r31)              C most_significant_q_limb = 1
 100 L(L8):  stq     r3, 72(r30)
 101
 102         addq    r15, r19, r19
 103         lda     r19, -3(r19)            C r19 = fn + un - 3
 104         blt     r19, L(L10)             C no quotient limbs left to develop
 105         stq     r19, 48(r30)            C the callee may clobber r19
 106         bis     r31, r12, r16           C pass d1
 107         jsr     r26, mpn_invert_limb
 108         ldgp    r29, 0(r26)
 109         ldq     r19, 48(r30)            C loop index back in r19
 110         mulq    r0, r12, r4             C t0 = LO(di * d1)
 111         umulh   r0, r10, r2             C s1 = HI(di * d0)
 112         addq    r4, r10, r4             C t0 += d0
 113         cmpule  r10, r4, r7             C !(t0 < d0), i.e. 1 - cy
 114         addq    r4, r2, r4              C t0 += s1
 115         cmpult  r4, r2, r1              C cy for: t0 += s1
 116         subq    r1, r7, r7              C t1 (-1, 0, or 1)
 117         blt     r7, L(L42)
 118 L(L22):
 119         lda     r0, -1(r0)              C di--
 120         cmpult  r4, r12, r1             C cy for: t0 -= d1 (below)
 121         subq    r7, r1, r7              C t1 -= cy
 122         subq    r4, r12, r4             C t0 -= d1
 123         bge     r7, L(L22)
 124 L(L42):
 125         ldq     r16, 64(r30)
 126         s8addq  r19, r16, r16           C r16 = &qp[fn + un - 3]
 127         ALIGN(16)
 128 L(loop):
 129         mulq    r11, r0, r5             C q0 (early)
 130         umulh   r11, r0, r6             C q  (early)
 131         addq    r5, r9, r8              C q0 += n1
 132         addq    r6, r11, r6             C q  += n2
 133         cmpult  r8, r5, r1              C cy for: q0 += n1
 134         addq    r6, r1, r6              C q  += cy
 135         unop
 136         mulq    r12, r6, r1             C LO(d1 * q)
 137         umulh   r10, r6, r7             C t1 = HI(d0 * q)
 138         subq    r9, r1, r9              C n1 -= LO(d1 * q)
 139         mulq    r10, r6, r4             C t0 = LO(d0 * q)
 140         unop
 141         cmple   r15, r19, r5            C condition and n0...
 142         beq     r5, L(L31)              C fn > r19: n0 = 0
 143         ldq     r5, 0(r13)              C n0 = next limb of up
 144         lda     r13, -8(r13)
 145 L(L31): subq    r9, r12, r9             C n1 -= d1
 146         cmpult  r5, r10, r1             C
 147         subq    r9, r1, r9              C
 148         subq    r5, r10, r5             C n0 -= d0
 149         subq    r9, r7, r9              C n1 -= t1
 150         cmpult  r5, r4, r1              C
 151         subq    r9, r1, r2              C
 152         subq    r5, r4, r5              C n0 -= t0
 153         cmpult  r2, r8, r1              C (n1 < q0)
 154         addq    r6, r1, r6              C q += cond
 155         lda     r1, -1(r1)              C -(n1 >= q0)
 156         and     r1, r10, r4             C
 157         addq    r5, r4, r9              C n0 += mask & d0
 158         and     r1, r12, r1             C
 159         cmpult  r9, r5, r11             C cy for: n0 += mask & d0
 160         addq    r2, r1, r1              C n1 += mask & d1
 161         addq    r1, r11, r11            C n1 += cy
 162         cmpult  r11, r12, r1            C
 163         beq     r1, L(fix)              C
 164 L(bck): stq     r6, 0(r16)              C store quotient limb
 165         lda     r16, -8(r16)
 166         lda     r19, -1(r19)
 167         bge     r19, L(loop)
 168
 169 L(L10): stq     r9, 8(r13)              C remainder, low limb
 170         stq     r11, 16(r13)            C remainder, high limb
 171         ldq     r0, 72(r30)             C return most_significant_q_limb
 172         ldq     r26, 0(r30)
 173         ldq     r9, 8(r30)
 174         ldq     r10, 16(r30)
 175         ldq     r11, 24(r30)
 176         ldq     r12, 32(r30)
 177         ldq     r13, 40(r30)
 178         ldq     r15, 56(r30)
 179         lda     r30, 80(r30)
 180         ret     r31, (r26), 1
 181
 182 L(fix): cmpule  r11, r12, r1            C here n2 >= d1
 183         cmpult  r9, r10, r2             C borrow from n1 - d0
 184         and     r1, r2, r1
 185         bne     r1, L(bck)              C n2 == d1 and n1 < d0: no adjustment
 186         subq    r11, r12, r11           C n2 -= d1
 187         subq    r11, r2, r11            C n2 -= borrow
 188         subq    r9, r10, r9             C n1 -= d0
 189         lda     r6, 1(r6)               C q++
 190         br      L(bck)
 191 EPILOGUE()
 192 ASM_END()