mpn/pa32/hppa1_1/submul_1.asm

   1 dnl  HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
   2 dnl  the result from a second limb vector.
   3
   4 dnl  Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
   5 dnl  Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of the GNU Lesser General Public License as published
  11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  12 dnl  your option) any later version.
  13
  14 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  17 dnl  License for more details.
  18
  19 dnl  You should have received a copy of the GNU Lesser General Public License
  20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  21
  22 include(`../config.m4')
  23
  24 C INPUT PARAMETERS
  25 C res_ptr       r26
  26 C s1_ptr        r25
  27 C size          r24
  28 C s2_limb       r23
  29
  30 C This runs at 12 cycles/limb on a PA7000.  With the instructions used, it
  31 C cannot become faster due to data cache contention after a store.  On the
  32 C PA7100 it runs at 11 cycles/limb.
  33
  34 C There are some ideas described in mul_1.asm that apply to this code too.
  35
  36 C It seems possible to make this run as fast as mpn_addmul_1, if we use
  37 C       sub,>>= %r29,%r19,%r22
  38 C       addi    1,%r28,%r28
  39 C but that requires reworking the hairy software pipeline...
  40
  41 ASM_START()
  42 PROLOGUE(mpn_submul_1)
  43 C       .callinfo       frame=64,no_calls
  44 C Subtract s1[0..size-1] * s2_limb from res[0..size-1] in place; r28 ends up holding the limb carried out of the top (presumably the return value -- confirm against the ABI).
  45         ldo             64(%r30),%r30           C r30 += 64; the 8-byte scratch slot used below is at -16(%r30)
  46         fldws,ma        4(%r25),%fr5            C fr5 = s1[0], then r25 += 4
  47         stw             %r23,-16(%r30)          C move s2_limb ...
  48         addib,=         -1,%r24,L(just_one_limb)  C size -= 1; branch if it reached zero (size was 1)
  49          fldws          -16(%r30),%fr4          C ... into fr4 (delay slot, runs on both paths)
  50         add             %r0,%r0,%r0             C clear carry
  51         xmpyu           %fr4,%fr5,%fr6          C fr6 = s2_limb * s1[0], 64-bit product
  52         fldws,ma        4(%r25),%fr7            C fr7 = s1[1], then r25 += 4
  53         fstds           %fr6,-16(%r30)          C spill product 0 to the scratch slot
  54         xmpyu           %fr4,%fr7,%fr8          C fr8 = s2_limb * s1[1]
  55         ldw             -12(%r30),%r19          C least significant limb in product
  56         ldw             -16(%r30),%r28          C most significant limb in product
  57
  58         fstds           %fr8,-16(%r30)          C product 1 replaces product 0 in the scratch slot
  59         addib,=         -1,%r24,L(end)          C size -= 1; skip the loop when exactly two limbs
  60          ldw            -12(%r30),%r1           C delay slot: r1 = low limb of product 1
  61
  62 C Main loop.  On entry r19 = limb to subtract from the current res limb, r28 = pending high limb, r1 = low limb of the product in the scratch slot.
  63 LDEF(loop)
  64         ldws            0(%r26),%r29            C r29 = current res limb
  65         fldws,ma        4(%r25),%fr5            C fr5 = next s1 limb, then r25 += 4
  66         sub             %r29,%r19,%r22          C r22 = r29 - r19
  67         add             %r22,%r19,%r0           C result discarded; carry is set iff the sub above borrowed
  68         stws,ma         %r22,4(%r26)            C store the res limb, then r26 += 4
  69         addc            %r28,%r1,%r19           C r19 = r28 + r1 + borrow: limb to subtract next time
  70         xmpyu           %fr4,%fr5,%fr6          C fr6 = s2_limb * fr5
  71         ldw             -16(%r30),%r28          C r28 = high limb of the product still in the scratch slot
  72         fstds           %fr6,-16(%r30)          C only now overwrite the slot with the new product
  73         addc            %r0,%r28,%r28           C r28 += carry out of the addc above
  74         addib,<>        -1,%r24,L(loop)         C size -= 1; repeat while nonzero
  75          ldw            -12(%r30),%r1           C delay slot: r1 = low limb of the new product
  76 C Wind-down: two res limbs are still to be written; the last product sits in the scratch slot with its low limb already in r1.
  77 LDEF(end)
  78         ldw             0(%r26),%r29            C r29 = current res limb
  79         sub             %r29,%r19,%r22          C r22 = r29 - r19
  80         add             %r22,%r19,%r0           C carry is set iff the sub above borrowed
  81         stws,ma         %r22,4(%r26)            C store the res limb, then r26 += 4
  82         addc            %r28,%r1,%r19           C r19 = r28 + r1 + borrow
  83         ldw             -16(%r30),%r28          C r28 = high limb of the last product
  84         ldws            0(%r26),%r29            C r29 = last res limb
  85         addc            %r0,%r28,%r28           C r28 += carry out of the addc above
  86         sub             %r29,%r19,%r22          C r22 = r29 - r19
  87         add             %r22,%r19,%r0           C carry is set iff the sub above borrowed
  88         stws,ma         %r22,4(%r26)            C store the last res limb
  89         addc            %r0,%r28,%r28           C r28 += final borrow
  90         bv              0(%r2)                  C branch to the address in r2 (return to caller)
  91          ldo            -64(%r30),%r30          C delay slot: undo the r30 += 64 from entry
  92 C size == 1 path: one multiply, one subtract, borrow folded into the high limb.
  93 LDEF(just_one_limb)
  94         xmpyu           %fr4,%fr5,%fr6          C fr6 = s2_limb * s1[0]
  95         ldw             0(%r26),%r29            C r29 = res[0]
  96         fstds           %fr6,-16(%r30)          C spill the product to the scratch slot
  97         ldw             -12(%r30),%r1           C r1 = low limb of the product
  98         ldw             -16(%r30),%r28          C r28 = high limb of the product
  99         sub             %r29,%r1,%r22           C r22 = r29 - r1
 100         add             %r22,%r1,%r0            C carry is set iff the sub above borrowed
 101         stw             %r22,0(%r26)            C res[0] = r22
 102         addc            %r0,%r28,%r28           C r28 += borrow
 103         bv              0(%r2)                  C branch to the address in r2 (return to caller)
 104          ldo            -64(%r30),%r30          C delay slot: undo the r30 += 64 from entry
 105 EPILOGUE()