sysdeps/i386/fpu/s_cbrtl.S

   1 /* Compute cubic root of long double value.
   2    Copyright (C) 1997, 2005, 2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
   5    Ulrich Drepper <drepper@cygnus.com>, 1997.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 #include <machine/asm.h>
  23
  24         .section .rodata
  25
  26         .align ALIGNARG(4)
  27         ASM_TYPE_DIRECTIVE(f8,@object)
  28 f8:     .tfloat 0.161617097923756032
  29         ASM_SIZE_DIRECTIVE(f8)
  30         .align ALIGNARG(4)
  31         ASM_TYPE_DIRECTIVE(f7,@object)
  32 f7:     .tfloat -0.988553671195413709
  33         ASM_SIZE_DIRECTIVE(f7)
  34         .align ALIGNARG(4)
  35         ASM_TYPE_DIRECTIVE(f6,@object)
  36 f6:     .tfloat 2.65298938441952296
  37         ASM_SIZE_DIRECTIVE(f6)
  38         .align ALIGNARG(4)
  39         ASM_TYPE_DIRECTIVE(f5,@object)
  40 f5:     .tfloat -4.11151425200350531
  41         ASM_SIZE_DIRECTIVE(f5)
  42         .align ALIGNARG(4)
  43         ASM_TYPE_DIRECTIVE(f4,@object)
  44 f4:     .tfloat 4.09559907378707839
  45         ASM_SIZE_DIRECTIVE(f4)
  46         .align ALIGNARG(4)
  47         ASM_TYPE_DIRECTIVE(f3,@object)
  48 f3:     .tfloat -2.82414939754975962
  49         ASM_SIZE_DIRECTIVE(f3)
  50         .align ALIGNARG(4)
  51         ASM_TYPE_DIRECTIVE(f2,@object)
  52 f2:     .tfloat 1.67595307700780102
  53         ASM_SIZE_DIRECTIVE(f2)
  54         .align ALIGNARG(4)
  55         ASM_TYPE_DIRECTIVE(f1,@object)
  56 f1:     .tfloat 0.338058687610520237
  57         ASM_SIZE_DIRECTIVE(f1)
  58
  59 #define CBRT2           1.2599210498948731648
  60 #define ONE_CBRT2       0.793700525984099737355196796584
  61 #define SQR_CBRT2       1.5874010519681994748
  62 #define ONE_SQR_CBRT2   0.629960524947436582364439673883
  63
  64         /* We make the entries in the following table all 16 bytes
  65            wide to avoid having to implement a multiplication by 10.  */
  66         ASM_TYPE_DIRECTIVE(factor,@object)
  67         .align ALIGNARG(4)
  68 factor: .tfloat ONE_SQR_CBRT2
  69         .byte 0, 0, 0, 0, 0, 0
  70         .tfloat ONE_CBRT2
  71         .byte 0, 0, 0, 0, 0, 0
  72         .tfloat 1.0
  73         .byte 0, 0, 0, 0, 0, 0
  74         .tfloat CBRT2
  75         .byte 0, 0, 0, 0, 0, 0
  76         .tfloat SQR_CBRT2
  77         ASM_SIZE_DIRECTIVE(factor)
  78
  79         ASM_TYPE_DIRECTIVE(two64,@object)
  80         .align ALIGNARG(4)
  81 two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
  82         ASM_SIZE_DIRECTIVE(two64)
  83
  84 #ifdef PIC
  85 #define MO(op) op##@GOTOFF(%ebx)
  86 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
  87 #else
  88 #define MO(op) op
  89 #define MOX(op,x) op(x)
  90 #endif
  91
  92         .text
  93 ENTRY(__cbrtl)
  94         movl    4(%esp), %ecx
  95         movl    12(%esp), %eax
  96         orl     8(%esp), %ecx
  97         movl    %eax, %edx
  98         andl    $0x7fff, %eax
  99         orl     %eax, %ecx
 100         jz      1f
 101         xorl    %ecx, %ecx
 102         cmpl    $0x7fff, %eax
 103         je      1f
 104
 105 #ifdef PIC
 106         pushl   %ebx
 107         cfi_adjust_cfa_offset (4)
 108         cfi_rel_offset (ebx, 0)
 109         LOAD_PIC_REG (bx)
 110 #endif
 111
 112         cmpl    $0, %eax
 113         jne     2f
 114
 115 #ifdef PIC
 116         fldt    8(%esp)
 117 #else
 118         fldt    4(%esp)
 119 #endif
 120         fmull   MO(two64)
 121         movl    $-64, %ecx
 122 #ifdef PIC
 123         fstpt   8(%esp)
 124         movl    16(%esp), %eax
 125 #else
 126         fstpt   4(%esp)
 127         movl    12(%esp), %eax
 128 #endif
 129         movl    %eax, %edx
 130         andl    $0x7fff, %eax
 131
 132 2:      andl    $0x8000, %edx
 133         subl    $16382, %eax
 134         orl     $0x3ffe, %edx
 135         addl    %eax, %ecx
 136 #ifdef PIC
 137         movl    %edx, 16(%esp)
 138
 139         fldt    8(%esp)                 /* xm */
 140 #else
 141         movl    %edx, 12(%esp)
 142
 143         fldt    4(%esp)                 /* xm */
 144 #endif
 145         fabs
 146
 147         /* The following code has two tracks:
 148             a) compute the normalized cbrt value
 149             b) compute xe/3 and xe%3
 150            The right track computes the value for b) and this is done
 151            in an optimized way by avoiding division.
 152
 153            But why two tracks at all?  Very easy: efficiency.  Some FP
 154            instruction can overlap with a certain amount of integer (and
 155            FP) instructions.  So we get (except for the imull) all
 156            instructions for free.  */
 157
 158         fldt    MO(f8)                  /* f8 : xm */
 159         fmul    %st(1)                  /* f8*xm : xm */
 160
 161         fldt    MO(f7)
 162         faddp                           /* f7+f8*xm : xm */
 163         fmul    %st(1)                  /* (f7+f8*xm)*xm : xm */
 164                         movl    $1431655766, %eax
 165         fldt    MO(f6)
 166         faddp                           /* f6+(f7+f8*xm)*xm : xm */
 167                         imull   %ecx
 168         fmul    %st(1)                  /* (f6+(f7+f8*xm)*xm)*xm : xm */
 169                         movl    %ecx, %eax
 170         fldt    MO(f5)
 171         faddp                           /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
 172                         sarl    $31, %eax
 173         fmul    %st(1)                  /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 174                         subl    %eax, %edx
 175         fldt    MO(f4)
 176         faddp                           /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 177         fmul    %st(1)                  /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 178         fldt    MO(f3)
 179         faddp                           /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 180         fmul    %st(1)                  /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 181         fldt    MO(f2)
 182         faddp                           /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 183         fmul    %st(1)                  /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 184         fldt    MO(f1)
 185         faddp                           /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 186
 187         fld     %st                     /* u : u : xm */
 188         fmul    %st(1)                  /* u*u : u : xm */
 189         fld     %st(2)                  /* xm : u*u : u : xm */
 190         fadd    %st                     /* 2*xm : u*u : u : xm */
 191         fxch    %st(1)                  /* u*u : 2*xm : u : xm */
 192         fmul    %st(2)                  /* t2:=u*u*u : 2*xm : u : xm */
 193                         movl    %edx, %eax
 194         fadd    %st, %st(1)             /* t2 : t2+2*xm : u : xm */
 195                         leal    (%edx,%edx,2),%edx
 196         fadd    %st(0)                  /* 2*t2 : t2+2*xm : u : xm */
 197                         subl    %edx, %ecx
 198         faddp   %st, %st(3)             /* t2+2*xm : u : 2*t2+xm */
 199                         shll    $4, %ecx
 200         fmulp                           /* u*(t2+2*xm) : 2*t2+xm */
 201         fdivp   %st, %st(1)             /* u*(t2+2*xm)/(2*t2+xm) */
 202         fldt    MOX(32+factor,%ecx)
 203         fmulp                           /* u*(t2+2*xm)/(2*t2+xm)*FACT */
 204         pushl   %eax
 205         cfi_adjust_cfa_offset (4)
 206         fildl   (%esp)                  /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 207         fxch                            /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
 208         fscale                          /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
 209         popl    %edx
 210         cfi_adjust_cfa_offset (-4)
 211 #ifdef PIC
 212         movl    16(%esp), %eax
 213         popl    %ebx
 214         cfi_adjust_cfa_offset (-4)
 215         cfi_restore (ebx)
 216 #else
 217         movl    12(%esp), %eax
 218 #endif
 219         testl   $0x8000, %eax
 220         fstp    %st(1)
 221         jz      4f
 222         fchs
 223 4:      ret
 224
 225         /* Return the argument.  */
 226 1:      fldt    4(%esp)
 227         ret
 228 END(__cbrtl)
 229 weak_alias (__cbrtl, cbrtl)