mozilla/security/nss/lib/freebl/mpi/mpi_x86_asm.c

   1 /*
   2  *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
   3  *
   4  * ***** BEGIN LICENSE BLOCK *****
   5  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   6  *
   7  * The contents of this file are subject to the Mozilla Public License Version
   8  * 1.1 (the "License"); you may not use this file except in compliance with
   9  * the License. You may obtain a copy of the License at
  10  * http://www.mozilla.org/MPL/
  11  *
  12  * Software distributed under the License is distributed on an "AS IS" basis,
  13  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  14  * for the specific language governing rights and limitations under the
  15  * License.
  16  *
  17  * The Original Code is the Netscape security libraries.
  18  *
  19  * The Initial Developer of the Original Code is
  20  * Netscape Communications Corporation.
  21  * Portions created by the Initial Developer are Copyright (C) 2000
  22  * the Initial Developer. All Rights Reserved.
  23  *
  24  * Contributor(s):
  25  *   Benjamin Smedberg <benjamin@smedbergs.us>
  26  *
  27  * Alternatively, the contents of this file may be used under the terms of
  28  * either the GNU General Public License Version 2 or later (the "GPL"), or
  29  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  30  * in which case the provisions of the GPL or the LGPL are applicable instead
  31  * of those above. If you wish to allow use of your version of this file only
  32  * under the terms of either the GPL or the LGPL, and not to allow others to
  33  * use your version of this file under the terms of the MPL, indicate your
  34  * decision by deleting the provisions above and replace them with the notice
  35  * and other provisions required by the GPL or the LGPL. If you do not delete
  36  * the provisions above, a recipient may use your version of this file under
  37  * the terms of any one of the MPL, the GPL or the LGPL.
  38  *
  39  * ***** END LICENSE BLOCK ***** */
  40
  41 #include "mpi-priv.h"
  42
  43 static int is_sse = -1;  /* < 0: not probed yet, 0: x86 path, > 0: SSE2 path */
  44 extern unsigned long s_mpi_is_sse2();  /* defined elsewhere; result cached in is_sse */
  45
  46 /*
  47  *   s_mpv_mul_d: multiply the a_len-digit vector a by the digit b.
  48  *   Writes the low product words to c[0..a_len-1] and the final carry
  49  *   word to c[a_len]; previous contents of c are overwritten.
  50  *
  51  *   Dispatch on is_sse: 0 runs the plain x86 loop, > 0 the SSE2 loop;
  52  *   < 0 first calls s_mpi_is_sse2() and caches its result in is_sse.
  53  *
  54  *   x86 path frame (the SSE2 path reserves no locals and saves only
  55  *   edi at ebp - 4 and esi at ebp - 8):
  56  *   ebp - 40:  caller's ebx
  57  *   ebp - 36:  caller's esi
  58  *   ebp - 32:  caller's edi
  59  *   ebp - 28 .. ebp - 4:  reserved by "sub esp,28", never referenced
  60  *   ebp + 0:   caller's ebp
  61  *   ebp + 4:   return address
  62  *   ebp + 8:   a       argument
  63  *   ebp + 12:  a_len   argument
  64  *   ebp + 16:  b       argument
  65  *   ebp + 20:  c       argument
  66  *   x86 regs:  eax = product low word, edx = b then product high word,
  67  *      ebx = carry, ecx = remaining a_len, esi = a ptr, edi = c ptr
  68  *   SSE2 regs: mm0 = a[i] * b, mm1 = b, mm2 = 64-bit running sum/carry
  69  */
  70 __declspec(naked) void
  71 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
  72 {
  73   __asm {
  74     mov    eax, is_sse
  75     cmp    eax, 0
  76     je     s_mpv_mul_d_x86
  77     jg     s_mpv_mul_d_sse2
  78     call   s_mpi_is_sse2
  79     mov    is_sse, eax
  80     cmp    eax, 0
  81     jg     s_mpv_mul_d_sse2
  82 s_mpv_mul_d_x86:
  83     push   ebp
  84     mov    ebp,esp
  85     sub    esp,28
  86     push   edi
  87     push   esi
  88     push   ebx
  89     mov    ebx,0                ; carry = 0
  90     mov    ecx,[ebp+12]         ; ecx = a_len
  91     mov    edi,[ebp+20]
  92     cmp    ecx,0
  93     je     L_2                  ; jmp if a_len == 0
  94     mov    esi,[ebp+8]          ; esi = a
  95     cld
  96 L_1:
  97     lodsd                       ; eax = [ds:esi]; esi += 4
  98     mov    edx,[ebp+16]         ; edx = b
  99     mul    edx                  ; edx:eax = Phi:Plo = a_i * b
 100
 101     add    eax,ebx              ; add carry (ebx) to edx:eax
 102     adc    edx,0
 103     mov    ebx,edx              ; high half of product becomes next carry
 104
 105     stosd                       ; [es:edi] = ax; edi += 4;
 106     dec    ecx                  ; --a_len
 107     jnz    L_1                  ; jmp if a_len != 0
 108 L_2:
 109     mov    [edi],ebx            ; *c = carry
 110     pop    ebx
 111     pop    esi
 112     pop    edi
 113     leave
 114     ret
 115     nop
 116 s_mpv_mul_d_sse2:
 117     push   ebp
 118     mov    ebp, esp
 119     push   edi
 120     push   esi
 121     psubq  mm2, mm2             ; carry = 0
 122     mov    ecx, [ebp+12]        ; ecx = a_len
 123     movd   mm1, [ebp+16]        ; mm1 = b
 124     mov    edi, [ebp+20]
 125     cmp    ecx, 0
 126     je     L_6                  ; jmp if a_len == 0
 127     mov    esi, [ebp+8]         ; esi = a
 128     cld
 129 L_5:
 130     movd   mm0, [esi]           ; mm0 = *a++
 131     add    esi, 4
 132     pmuludq mm0, mm1            ; mm0 = b * *a++
 133     paddq  mm2, mm0             ; add the carry
 134     movd   [edi], mm2           ; store the 32bit result
 135     add    edi, 4
 136     psrlq  mm2, 32              ; save the carry
 137     dec    ecx                  ; --a_len
 138     jnz    L_5                  ; jmp if a_len != 0
 139 L_6:
 140     movd   [edi], mm2           ; *c = carry
 141     emms
 142     pop    esi
 143     pop    edi
 144     leave
 145     ret
 146     nop
 147   }
 148 }
 149
 150 /*
 151  *   s_mpv_mul_d_add: multiply the a_len-digit vector a by the digit b
 152  *   and accumulate into c.  Each step stores the low word of
 153  *   a[i] * b + carry + c[i] back into c[i]; the final carry word is
 154  *   stored into (not added to) c[a_len].
 155  *
 156  *   Dispatch on is_sse: 0 runs the plain x86 loop, > 0 the SSE2 loop;
 157  *   < 0 first calls s_mpi_is_sse2() and caches its result in is_sse.
 158  *
 159  *   x86 path frame (the SSE2 path reserves no locals and saves only
 160  *   edi at ebp - 4 and esi at ebp - 8):
 161  *   ebp - 40:  caller's ebx
 162  *   ebp - 36:  caller's esi
 163  *   ebp - 32:  caller's edi
 164  *   ebp - 28 .. ebp - 4:  reserved by "sub esp,28", never referenced
 165  *   ebp + 0:   caller's ebp
 166  *   ebp + 4:   return address
 167  *   ebp + 8:   a, ebp + 12: a_len, ebp + 16: b, ebp + 20: c  (arguments)
 168  *   x86 regs:  eax = sum low word, edx = b then sum high word,
 169  *      ebx = carry (also scratch for c[i]), ecx = remaining a_len,
 170  *      esi = a ptr, edi = c ptr
 171  *   SSE2 regs: mm0 = a[i] * b then c[i], mm1 = b,
 172  *      mm2 = 64-bit running sum/carry
 173  */
 174 __declspec(naked) void
 175 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 176 {
 177   __asm {
 178     mov    eax, is_sse
 179     cmp    eax, 0
 180     je     s_mpv_mul_d_add_x86
 181     jg     s_mpv_mul_d_add_sse2
 182     call   s_mpi_is_sse2
 183     mov    is_sse, eax
 184     cmp    eax, 0
 185     jg     s_mpv_mul_d_add_sse2
 186 s_mpv_mul_d_add_x86:
 187     push   ebp
 188     mov    ebp,esp
 189     sub    esp,28
 190     push   edi
 191     push   esi
 192     push   ebx
 193     mov    ebx,0                ; carry = 0
 194     mov    ecx,[ebp+12]         ; ecx = a_len
 195     mov    edi,[ebp+20]
 196     cmp    ecx,0
 197     je     L_11                 ; jmp if a_len == 0
 198     mov    esi,[ebp+8]          ; esi = a
 199     cld
 200 L_10:
 201     lodsd                       ; eax = [ds:esi]; esi += 4
 202     mov    edx,[ebp+16]         ; edx = b
 203     mul    edx                  ; edx:eax = Phi:Plo = a_i * b
 204
 205     add    eax,ebx              ; add carry (ebx) to edx:eax
 206     adc    edx,0
 207     mov    ebx,[edi]            ; add in current word from *c
 208     add    eax,ebx
 209     adc    edx,0
 210     mov    ebx,edx              ; high half of product becomes next carry
 211
 212     stosd                       ; [es:edi] = ax; edi += 4;
 213     dec    ecx                  ; --a_len
 214     jnz    L_10                 ; jmp if a_len != 0
 215 L_11:
 216     mov    [edi],ebx            ; *c = carry
 217     pop    ebx
 218     pop    esi
 219     pop    edi
 220     leave
 221     ret
 222     nop
 223 s_mpv_mul_d_add_sse2:
 224     push   ebp
 225     mov    ebp, esp
 226     push   edi
 227     push   esi
 228     psubq  mm2, mm2             ; carry = 0
 229     mov    ecx, [ebp+12]        ; ecx = a_len
 230     movd   mm1, [ebp+16]        ; mm1 = b
 231     mov    edi, [ebp+20]
 232     cmp    ecx, 0
 233     je     L_16                 ; jmp if a_len == 0
 234     mov    esi, [ebp+8]         ; esi = a
 235     cld
 236 L_15:
 237     movd   mm0, [esi]           ; mm0 = *a++
 238     add    esi, 4
 239     pmuludq mm0, mm1            ; mm0 = b * *a++
 240     paddq  mm2, mm0             ; add the carry
 241     movd   mm0, [edi]
 242     paddq  mm2, mm0             ; add the carry
 243     movd   [edi], mm2           ; store the 32bit result
 244     add    edi, 4
 245     psrlq  mm2, 32              ; save the carry
 246     dec    ecx                  ; --a_len
 247     jnz    L_15                 ; jmp if a_len != 0
 248 L_16:
 249     movd   [edi], mm2           ; *c = carry
 250     emms
 251     pop    esi
 252     pop    edi
 253     leave
 254     ret
 255     nop
 256   }
 257 }
 258
 259 /*
 260  *   s_mpv_mul_d_add_prop: c[0..a_len-1] += a * b, digit by digit; the
 261  *   final carry is then added into c[a_len] and propagated through
 262  *   the following words of c until an add produces no carry.
 263  *   That propagation loop has no length bound of its own.
 264  *
 265  *   Dispatch on is_sse: 0 runs the plain x86 loop, > 0 the SSE2 loop;
 266  *   < 0 first calls s_mpi_is_sse2() and caches its result in is_sse.
 267  *
 268  *   x86 path frame (the SSE2 path reserves no locals and saves
 269  *   edi, esi, ebx at ebp - 4, ebp - 8, ebp - 12):
 270  *   ebp - 40:  caller's ebx
 271  *   ebp - 36:  caller's esi
 272  *   ebp - 32:  caller's edi
 273  *   ebp - 28 .. ebp - 4:  reserved by "sub esp,28", never referenced
 274  *   ebp + 0:   caller's ebp
 275  *   ebp + 4:   return address
 276  *   ebp + 8:   a, ebp + 12: a_len, ebp + 16: b, ebp + 20: c  (arguments)
 277  *   x86 regs:  eax = sum low word, edx = b then sum high word,
 278  *      ebx = carry (also scratch for c[i]), ecx = remaining a_len,
 279  *      esi = a ptr, edi = c ptr
 280  *   SSE2 regs: mm0 = a[i] * b, mm1 = b, mm2 = 64-bit running sum/carry,
 281  *      mm3 = c[i]; ebx receives the final carry for the propagate loop
 282  */
 283 __declspec(naked) void
 284 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 285 {
 286   __asm {
 287     mov    eax, is_sse
 288     cmp    eax, 0
 289     je     s_mpv_mul_d_add_prop_x86
 290     jg     s_mpv_mul_d_add_prop_sse2
 291     call   s_mpi_is_sse2
 292     mov    is_sse, eax
 293     cmp    eax, 0
 294     jg     s_mpv_mul_d_add_prop_sse2
 295 s_mpv_mul_d_add_prop_x86:
 296     push   ebp
 297     mov    ebp,esp
 298     sub    esp,28
 299     push   edi
 300     push   esi
 301     push   ebx
 302     mov    ebx,0                ; carry = 0
 303     mov    ecx,[ebp+12]         ; ecx = a_len
 304     mov    edi,[ebp+20]
 305     cmp    ecx,0
 306     je     L_21                 ; jmp if a_len == 0
 307     cld
 308     mov    esi,[ebp+8]          ; esi = a
 309 L_20:
 310     lodsd                       ; eax = [ds:esi]; esi += 4
 311     mov    edx,[ebp+16]         ; edx = b
 312     mul    edx                  ; edx:eax = Phi:Plo = a_i * b
 313
 314     add    eax,ebx              ; add carry (ebx) to edx:eax
 315     adc    edx,0
 316     mov    ebx,[edi]            ; add in current word from *c
 317     add    eax,ebx
 318     adc    edx,0
 319     mov    ebx,edx              ; high half of product becomes next carry
 320
 321     stosd                       ; [es:edi] = ax; edi += 4;
 322     dec    ecx                  ; --a_len
 323     jnz    L_20                 ; jmp if a_len != 0
 324 L_21:
 325     cmp    ebx,0                ; is carry zero?
 326     jz     L_23
 327     mov    eax,[edi]            ; add in current word from *c
 328     add    eax,ebx
 329     stosd                       ; [es:edi] = ax; edi += 4;
 330     jnc    L_23
 331 L_22:
 332     mov    eax,[edi]            ; add in current word from *c
 333     adc    eax,0
 334     stosd                       ; [es:edi] = ax; edi += 4;
 335     jc     L_22
 336 L_23:
 337     pop    ebx
 338     pop    esi
 339     pop    edi
 340     leave
 341     ret
 342     nop
 343 s_mpv_mul_d_add_prop_sse2:
 344     push   ebp
 345     mov    ebp, esp
 346     push   edi
 347     push   esi
 348     push   ebx
 349     psubq  mm2, mm2             ; carry = 0
 350     mov    ecx, [ebp+12]        ; ecx = a_len
 351     movd   mm1, [ebp+16]        ; mm1 = b
 352     mov    edi, [ebp+20]
 353     cmp    ecx, 0
 354     je     L_26                 ; jmp if a_len == 0
 355     mov    esi, [ebp+8]         ; esi = a
 356     cld
 357 L_25:
 358     movd   mm0, [esi]           ; mm0 = *a++
 359     movd   mm3, [edi]           ; fetch the sum
 360     add    esi, 4
 361     pmuludq mm0, mm1            ; mm0 = b * *a++
 362     paddq  mm2, mm0             ; add the carry
 363     paddq  mm2, mm3             ; add *c++
 364     movd   [edi], mm2           ; store the 32bit result
 365     add    edi, 4
 366     psrlq  mm2, 32              ; save the carry
 367     dec    ecx                  ; --a_len
 368     jnz    L_25                 ; jmp if a_len != 0
 369 L_26:
 370     movd   ebx, mm2
 371     cmp    ebx, 0               ; is carry zero?
 372     jz     L_28
 373     mov    eax, [edi]
 374     add    eax, ebx
 375     stosd
 376     jnc    L_28
 377 L_27:
 378     mov    eax, [edi]           ; add in current word from *c
 379     adc    eax, 0
 380     stosd                       ; [es:edi] = ax; edi += 4;
 381     jc     L_27
 382 L_28:
 383     emms
 384     pop    ebx
 385     pop    esi
 386     pop    edi
 387     leave
 388     ret
 389     nop
 390   }
 391 }
 392
 393 /*
 394  *   s_mpv_sqr_add_prop: for each digit a[i], add the two-word square
 395  *   a[i] * a[i] into sqrs[2i], sqrs[2i+1], carrying between steps; the
 396  *   final carry is added into the following words of sqrs until an
 397  *   add produces no carry (that loop has no length bound of its own).
 398  *   Dispatch on is_sse: 0 = x86 loop, > 0 = SSE2 loop, < 0 = call
 399  *   s_mpi_is_sse2() first and cache its result in is_sse.
 400  *
 401  *   x86 path frame (SSE2 path: edi, esi, ebx at ebp - 4, - 8, - 12):
 402  *   ebp - 24:  caller's ebx
 403  *   ebp - 20:  caller's esi
 404  *   ebp - 16:  caller's edi
 405  *   ebp - 12 .. ebp - 4:  reserved by "sub esp,12", never referenced
 406  *   ebp + 0:   caller's ebp
 407  *   ebp + 4:   return address
 408  *   ebp + 8:   a (pa), ebp + 12: a_len, ebp + 16: sqrs (ps)
 409  *   x86 regs:  ebx = carry, ecx = remaining a_len, esi = a ptr,
 410  *      edi = sqrs ptr; stosd leaves flags intact, so CF reaches adc
 411  *   SSE2 regs: mm0 = a[i]^2, mm2 = running sum/carry, mm3 = sqrs word
 412  */
 413 __declspec(naked) void
 414 s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
 415 {
 416   __asm {
 417      mov    eax, is_sse
 418      cmp    eax, 0
 419      je     s_mpv_sqr_add_prop_x86
 420      jg     s_mpv_sqr_add_prop_sse2
 421      call   s_mpi_is_sse2
 422      mov    is_sse, eax
 423      cmp    eax, 0
 424      jg     s_mpv_sqr_add_prop_sse2
 425 s_mpv_sqr_add_prop_x86:
 426      push   ebp
 427      mov    ebp,esp
 428      sub    esp,12
 429      push   edi
 430      push   esi
 431      push   ebx
 432      mov    ebx,0               ; carry = 0
 433      mov    ecx,[ebp+12]        ; a_len
 434      mov    edi,[ebp+16]        ; edi = ps
 435      cmp    ecx,0
 436      je     L_31                ; jump if a_len == 0
 437      cld
 438      mov    esi,[ebp+8]         ; esi = pa
 439 L_30:
 440      lodsd                      ; eax = [ds:si]; si += 4;
 441      mul    eax
 442
 443      add    eax,ebx             ; add "carry"
 444      adc    edx,0
 445      mov    ebx,[edi]
 446      add    eax,ebx             ; add low word from result
 447      mov    ebx,[edi+4]
 448      stosd                      ; [es:di] = eax; di += 4;
 449      adc    edx,ebx             ; add high word from result
 450      mov    ebx,0
 451      mov    eax,edx
 452      adc    ebx,0
 453      stosd                      ; [es:di] = eax; di += 4;
 454      dec    ecx                 ; --a_len
 455      jnz    L_30                ; jmp if a_len != 0
 456 L_31:
 457     cmp    ebx,0                ; is carry zero?
 458     jz     L_34
 459     mov    eax,[edi]            ; add in current word from *c
 460     add    eax,ebx
 461     stosd                       ; [es:edi] = ax; edi += 4;
 462     jnc    L_34
 463 L_32:
 464     mov    eax,[edi]            ; add in current word from *c
 465     adc    eax,0
 466     stosd                       ; [es:edi] = ax; edi += 4;
 467     jc     L_32
 468 L_34:
 469     pop    ebx
 470     pop    esi
 471     pop    edi
 472     leave
 473     ret
 474     nop
 475 s_mpv_sqr_add_prop_sse2:
 476     push   ebp
 477     mov    ebp, esp
 478     push   edi
 479     push   esi
 480     push   ebx
 481     psubq  mm2, mm2             ; carry = 0
 482     mov    ecx, [ebp+12]        ; ecx = a_len
 483     mov    edi, [ebp+16]
 484     cmp    ecx, 0
 485     je     L_36         ; jmp if a_len == 0
 486     mov    esi, [ebp+8]         ; esi = a
 487     cld
 488 L_35:
 489     movd   mm0, [esi]           ; mm0 = *a
 490     movd   mm3, [edi]           ; fetch the sum
 491     add    esi, 4
 492     pmuludq mm0, mm0            ; mm0 = sqr(a)
 493     paddq  mm2, mm0             ; add the carry
 494     paddq  mm2, mm3             ; add the low word
 495     movd   mm3, [edi+4]
 496     movd   [edi], mm2           ; store the 32bit result
 497     psrlq  mm2, 32
 498     paddq  mm2, mm3             ; add the high word
 499     movd   [edi+4], mm2         ; store the 32bit result
 500     psrlq  mm2, 32              ; save the carry.
 501     add    edi, 8
 502     dec    ecx                  ; --a_len
 503     jnz    L_35                 ; jmp if a_len != 0
 504 L_36:
 505     movd   ebx, mm2
 506     cmp    ebx, 0               ; is carry zero?
 507     jz     L_38
 508     mov    eax, [edi]
 509     add    eax, ebx
 510     stosd
 511     jnc    L_38
 512 L_37:
 513     mov    eax, [edi]           ; add in current word from *c
 514     adc    eax, 0
 515     stosd                       ; [es:edi] = ax; edi += 4;
 516     jc     L_37
 517 L_38:
 518     emms
 519     pop    ebx
 520     pop    esi
 521     pop    edi
 522     leave
 523     ret
 524     nop
 525   }
 526 }
 527
 528 /*
 529  *  Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
 530  *  so its high bit is 1.   This code is from NSPR.
 531  *
 532  *  Stores the quotient in *qp and the remainder in *rp, and always
 533  *  returns zero.  No check is made before "div": NOTE(review) per the
 534  *  x86 ISA a quotient that does not fit in 32 bits (Nhi >= divisor)
 535  *  or a zero divisor raises a divide error -- callers must avoid both.
 536  *
 537  *  Stack after "push ebx" (no frame pointer is set up):
 538  *   esp +  0:  caller's ebx
 539  *   esp +  4:  return address
 540  *   esp +  8:  Nhi     argument
 541  *   esp + 12:  Nlo     argument
 542  *   esp + 16:  divisor argument
 543  *   esp + 20:  qp      argument
 544  *   esp + 24:  rp      argument
 545  *   registers:
 546  *      edx:eax: dividend (Nhi:Nlo), then remainder:quotient
 547  *      ebx:     divisor, then the qp and rp pointers in turn
 548  */
 549 __declspec(naked) mp_err
 550 s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
 551                 mp_digit *qp, mp_digit *rp)
 552 {
 553   __asm {
 554        push   ebx
 555        mov    edx,[esp+8]
 556        mov    eax,[esp+12]
 557        mov    ebx,[esp+16]
 558        div    ebx
 559        mov    ebx,[esp+20]
 560        mov    [ebx],eax
 561        mov    ebx,[esp+24]
 562        mov    [ebx],edx
 563        xor    eax,eax           ; return zero
 564        pop    ebx
 565        ret
 566        nop
 567   }
 568 }