mpn/x86/k6/mmx/logops_n.asm

   1 dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
   2 dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
   3
   4 dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   5 dnl
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or
   9 dnl  modify it under the terms of the GNU Lesser General Public License as
  10 dnl  published by the Free Software Foundation; either version 3 of the
  11 dnl  License, or (at your option) any later version.
  12 dnl
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 dnl  Lesser General Public License for more details.
  17 dnl
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23 NAILS_SUPPORT(0-31)
  24
  25
  26 C         alignment dst/src1/src2, A=0mod8, N=4mod8
  27 C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
  28 C
  29 C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
  30 C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
  31 C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
  32 C
  33 C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
  34 C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
  35 C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
  36
  37
  38 dnl  M4_p and M4_i are the MMX and integer instructions
  39 dnl  M4_*_neg_dst means whether to negate the final result before writing
  40 dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
     dnl
     dnl  Usage: M4_choose_op(op, p, p_neg_dst, p_neg_src2, i, i_neg_dst, i_neg_src2)
     dnl
     dnl  The macro is declared with m4_assert_numargs(7).  Its definitions are
     dnl  made only if the symbol OPERATION_<op> is defined.  In that case
     dnl  M4_function becomes mpn_<op>, M4_operation becomes <op>, and the other
     dnl  six arguments are stored in order as M4_p, M4_p_neg_dst, M4_p_neg_src2,
     dnl  M4_i, M4_i_neg_dst and M4_i_neg_src2.
     dnl
     dnl  The neg flags are tested against 1 by the ifelse lines in the function
     dnl  body.  In the MMX loop a negation is a pxor with the mask held in mm7;
     dnl  in the integer code it is done by notl_or_xorl_GMP_NUMB_MASK, which is
     dnl  defined elsewhere.
  41
  42 define(M4_choose_op,
  43 m4_assert_numargs(7)
  44 `ifdef(`OPERATION_$1',`
  45 define(`M4_function',  `mpn_$1')
  46 define(`M4_operation', `$1')
  47 define(`M4_p',         `$2')
  48 define(`M4_p_neg_dst', `$3')
  49 define(`M4_p_neg_src2',`$4')
  50 define(`M4_i',         `$5')
  51 define(`M4_i_neg_dst', `$6')
  52 define(`M4_i_neg_src2',`$7')
  53 ')')
  54
  55 dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
  56 dnl  style (the two are equivalent for xor).
  57 dnl
  58 dnl  pandn can't be used with nails.
     dnl
     dnl  Each line below gives the operation name, then the MMX instruction
     dnl  with its neg_dst and neg_src2 flags, then the integer instruction with
     dnl  its neg_dst and neg_src2 flags.  "iorn" style is flags 0,1 (src2 is
     dnl  negated before the operation) and "nior" style is flags 1,0 (the
     dnl  result is negated after the operation).
     dnl
     dnl  andn_n uses pandn with no separate negation only when GMP_NAIL_BITS
     dnl  is 0; otherwise it uses pand with src2 negated first.  The integer
     dnl  side is andl with src2 negated in both cases.
  59
  60 M4_choose_op( and_n,  pand,0,0,  andl,0,0)
  61 ifelse(GMP_NAIL_BITS,0,
  62 `M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
  63 `M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
  64 M4_choose_op( nand_n, pand,1,0,  andl,1,0)
  65 M4_choose_op( ior_n,  por,0,0,   orl,0,0)
  66 M4_choose_op( iorn_n, por,0,1,   orl,0,1)
  67 M4_choose_op( nior_n, por,1,0,   orl,1,0)
  68 M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
  69 M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
  70
     dnl  If none of the lines above matched a defined OPERATION_<op> symbol
     dnl  then M4_function is still undefined here, and that is reported
     dnl  through m4_error.
  71 ifdef(`M4_function',,
  72 `m4_error(`Unrecognised or undefined OPERATION symbol
  73 ')')
  74
     dnl  The eight function names covered by the table above, one per
     dnl  operation.  NOTE(review): the macro used on the next line is defined
     dnl  elsewhere; confirm how the build consumes this list.
  75 MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
  76
  77
  78 C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
  79 C                   mp_size_t size);
  80 C
  81 C Do src1,size M4_operation src2,size, storing the result in dst,size.
  82 C
  83 C Unaligned movq loads and stores are a bit slower than aligned ones.  The
  84 C test at the start of the routine checks the alignment of src1 and if
  85 C necessary processes one limb separately at the low end to make it aligned.
  86 C
  87 C The raw speeds without this alignment switch are as follows.
  88 C
  89 C           alignment dst/src1/src2, A=0mod8, N=4mod8
  90 C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
  91 C
  92 C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
  93 C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
  94 C K6                 2.0    2.25                2.35   2.28   nand,nior
  95 C
  96 C
  97 C Future:
  98 C
  99 C K6 can do one 64-bit load per cycle so each of these routines should be
 100 C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
 101 C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
 102 C The others are 4 instructions per 2 limbs, and so can only approach 1.0
 103 C because there's nowhere to hide some loop control.
 104
     C Entry: the arguments are read through the PARAM_* names declared below,
     C with dst, src1, src2 and size at offsets 4, 8, 12 and 16, matching the
     C order of the prototype.  NOTE(review): defframe, deflit and FRAME_pushl
     C are defined elsewhere; FRAME presumably tracks the bytes pushed so far
     C so that PARAM_* keep addressing the arguments -- confirm.
     C
     C Registers: eax = src1, ebx = src2, edx = dst, ecx = limb count and then
     C the pair counter of the loop, esi = scratch and then the saved limb
     C count, mm0 = data, mm7 = negation mask (only when a negation is needed).
     C ebx and esi are pushed before use and popped on every path to a ret.
     C
     C Flow:
     C   1. size not above 1 (unsigned): one limb is done with integer code.
     C      NOTE(review): a size of 0 would also take this path and touch one
     C      limb, so callers are assumed to pass size >= 1 -- confirm.
     C   2. Otherwise, if bit 2 of src1 is set, the lowest limb is done with
     C      integer code and all three pointers step past it.
     C   3. The remaining limbs are done two at a time with movq in a loop
     C      running from the high end down; a single remaining limb, or an
     C      odd limb left over after the loop, is done with integer code.
 105 defframe(PARAM_SIZE,16)
 106 defframe(PARAM_SRC2,12)
 107 defframe(PARAM_SRC1,8)
 108 defframe(PARAM_DST, 4)
 109 deflit(`FRAME',0)
 110
 111         TEXT
 112         ALIGN(32)
 113 PROLOGUE(M4_function)
 114                         movl    PARAM_SIZE, %ecx        C ecx = size
 115                         pushl   %ebx            FRAME_pushl()   C ebx will hold src2
 116
 117                         movl    PARAM_SRC1, %eax        C eax = src1
 118
 119                         movl    PARAM_SRC2, %ebx        C ebx = src2
 120                         cmpl    $1, %ecx        C flags are consumed by the ja below
 121
 122                         movl    PARAM_DST, %edx C edx = dst, movl leaves the flags alone
 123                         ja      L(two_or_more)  C taken when size > 1, unsigned
 124
 125
                             C One limb only: integer code, no MMX register is touched
                             C on this path.
 126                         movl    (%ebx), %ecx    C ecx = src2 limb
 127                         popl    %ebx            C src2 pointer is not needed again
 128 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 129                         M4_i    (%eax), %ecx    C combine with the src1 limb
 130 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 131                         movl    %ecx, (%edx)    C store the result limb to dst
 132
 133                         ret
 134
 135
 136 L(two_or_more):
 137                         C eax   src1
 138                         C ebx   src2
 139                         C ecx   size
 140                         C edx   dst
 141                         C esi
 142                         C edi
 143                         C ebp
 144
 145                         pushl   %esi            FRAME_pushl()   C esi: scratch now, limb count later
 146                         testl   $4, %eax        C is bit 2 of src1 set, i.e. src1 at 4 mod 8?
 147                         jz      L(alignment_ok) C no: skip the single-limb step
 148
                             C src1 is at 4 mod 8: handle the lowest limb with integer
                             C code, using esi as the data register, and advance src1,
                             C src2 and dst by one limb each.
 149                         movl    (%ebx), %esi    C esi = src2 limb
 150                         addl    $4, %ebx
 151 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %esi)')
 152                         M4_i    (%eax), %esi    C combine with the src1 limb
 153                         addl    $4, %eax
 154 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %esi)')
 155                         movl    %esi, (%edx)    C store the result limb to dst
 156                         addl    $4, %edx
 157                         decl    %ecx            C one limb fewer remains
 158
 159 L(alignment_ok):
 160                         movl    %ecx, %esi      C esi = limbs remaining, indexes the odd limb after the loop
 161                         shrl    %ecx            C ecx = number of limb pairs, carry = low bit of the count
 162                         jnz     L(still_two_or_more)    C at least one pair remains
 163
                             C Exactly one limb remains.  This is reached only when size
                             C was 2 and the alignment step above consumed the first limb.
 164                         movl    (%ebx), %ecx    C ecx = src2 limb
 165                         popl    %esi
 166 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 167                         M4_i    (%eax), %ecx    C combine with the src1 limb
 168 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 169                         popl    %ebx
 170                         movl    %ecx, (%edx)    C store the result limb to dst
 171                         ret
 172
 173
 174 L(still_two_or_more):
                             C mm7 is set up only when the MMX path negates src2 or the
                             C result: all ones, shifted right by GMP_NAIL_BITS in each
                             C 32-bit half when that value is not 0.
 175 ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
 176                         pcmpeqd %mm7, %mm7              C all ones
 177 ifelse(GMP_NAIL_BITS,0,,`psrld  $GMP_NAIL_BITS, %mm7')  C clear nails
 178 ')
 179
 180                         ALIGN(16)
 181 L(top):
 182                         C eax   src1
 183                         C ebx   src2
 184                         C ecx   counter, limb pairs still to do
 185                         C edx   dst
 186                         C esi   limb count saved at L(alignment_ok)
 187                         C edi
 188                         C ebp
 189                         C
 190                         C carry flag is the low bit of the limb count, from the shrl above
                             C
                             C Each pass handles the limb pair at pair index ecx-1, so the
                             C loop works from the high end down to the low end.  Nothing
                             C between here and the jnc below changes the carry flag.
 191
 192                         movq    -8(%ebx,%ecx,8), %mm0   C mm0 = two src2 limbs
 193 ifelse(M4_p_neg_src2,1,`pxor    %mm7, %mm0')
 194                         M4_p    -8(%eax,%ecx,8), %mm0   C combine with two src1 limbs
 195 ifelse(M4_p_neg_dst,1,` pxor    %mm7, %mm0')
 196                         movq    %mm0, -8(%edx,%ecx,8)   C store two result limbs to dst
 197
 198                         loop    L(top)          C decrement ecx, repeat while it is not zero
 199
 200
 201                         jnc     L(no_extra)     C even count: no odd limb is left over
 202
                             C Odd count: the leftover limb is the highest one, at limb
                             C index esi-1.  ebx doubles as the data register because the
                             C src2 pointer is not needed after this load.
 203                         movl    -4(%ebx,%esi,4), %ebx   C ebx = src2 limb
 204 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %ebx)')
 205                         M4_i    -4(%eax,%esi,4), %ebx   C combine with the src1 limb
 206 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %ebx)')
 207                         movl    %ebx, -4(%edx,%esi,4)   C store the result limb to dst
 208 L(no_extra):
 209
 210                         popl    %esi
 211                         popl    %ebx
                             C NOTE(review): emms_or_femms is defined elsewhere; presumably
                             C it emits emms or femms to leave MMX state before returning.
 212                         emms_or_femms
 213                         ret
 214
 215 EPILOGUE()