1 dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
3 dnl Copyright 2002, 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
28 C 7400,7410 (G4): 1 simple load-use scheduling results in 0.75
29 C 744x,745x (G4+): 0.75
35 C * Either start using the low-end masking constants, or remove them.
36 C * Merge multiple feed-in cases into a parameterized code block.
37 C * Reduce register usage. It should be possible to almost halve it.
60 PROLOGUE(mpn_mod_34lsub1)
C Entry point.  The instructions visible here compare n with a tuned cutoff,
C clear the accumulator r9, and compute a count of 3-limb groups in r10.
C NOTE(review): the branches that consume cr0 are not visible in this
C excerpt; presumably the cutoff selects between a scalar and a vector path.
61 cmpwi cr0, n, 20 C tuned cutoff point; cr0 = compare(n, 20)
64 li r9, 0 C result accumulator
65 mulli r10, n, 0xb C r10 = 11*n; 0xb = ceil(32/3)
66 srwi. r10, r10, 5 C r10 = (11*n)>>5 = floor(n/3), n < 32; record form, overwrites cr0
C L(los): fold limbs into the accumulator r9 as 24-bit pieces.  Per the
C comments below, r6/r7/r8 hold limbs u0/u1/u2; the 3-limb sequence appears
C twice.  Digit diagrams such as --111100 show, for each hex digit of r0,
C which limb supplies it (- = none).
C NOTE(review): loads and some shifts are not visible in this excerpt.
78 L(los): rlwinm r0, r6, 0,8,31 C r0 = low 24 bits of r6 (u0)
79 add r9, r9, r0 C add 24b from u0
82 rlwimi r0, r7, 8, 0x00ffff00 C --111100
83 add r9, r9, r0 C add 8b from u0 and 16b from u1
86 rlwimi r0, r8, 16, 0x00ff0000 C --221111
87 add r9, r9, r0 C add 16b from u1 and 8b from u2
88 srwi r0, r8, 8 C --222222
90 add r9, r9, r0 C add 24b from u2
94 add r9, r9, r0 C add 24b from u0
96 rlwimi r0, r7, 8, 0x00ffff00 C --111100
97 add r9, r9, r0 C add 8b from u0 and 16b from u1
99 rlwimi r0, r8, 16, 0x00ff0000 C --221111
100 add r9, r9, r0 C add 16b from u1 and 8b from u2
101 srwi r0, r8, 8 C --222222
102 add r9, r9, r0 C add 24b from u2
105 rlwinm r0, r9, 0,8,31 C r0 = low 24 bits of accumulator r9
114 rlwinm r0, r6, 0,8,31 C r0 = low 24 bits of r6
122 rlwinm r0, r6, 8,8,23 C r0 = (low 16 bits of r6) << 8
133 oris r0, r10, 0xffff C Set VRSAVE bits 0-15: r0 = r10 | 0xffff0000
162 srwi r0, r0, 3 C r0 = floor(n/12)
169 L(na4): bne cr7, L(na8) C cr7 "not equal" -> go to L(na8); compare not visible here
178 srwi r0, r0, 3 C r0 = floor(n/12)
192 srwi r0, r0, 3 C r0 = floor(n/12)
201 srwi r0, r0, 3 C r0 = floor(n/12)
258 rlwinm r3, n ,4,26,27 C r3 = (n mod 4) * 16
276 rlwinm r3, n ,4,26,27 C r3 = (n mod 4) * 16
289 rlwinm r3, n ,4,26,27 C r3 = (n mod 4) * 16
C L(sum): regroup the accumulator vectors a0-a2 and the carry vectors c0-c2
C into vectors of 24-bit fields using vperm.  Each carry vector is permuted
C with the same pv as the accumulator vector on the preceding line.
C NOTE(review): only the first pv load is visible in this excerpt; r11
C presumably addresses the permutation table, and z is presumably all-zero.
297 L(sum): lvx pv, 0, r11 C pv = permutation vector at address r11
298 vperm x0, a0, z, pv C extract four 24-bit fields from a0
301 vperm x1, a1, z, pv C extract four 24-bit fields from a1
302 vperm y1, c0, z, pv C extract four 24-bit fields from c0
304 vperm x2, a2, z, pv C extract four 24-bit fields from a2
305 vperm y2, c1, z, pv C extract four 24-bit fields from c1
308 vperm x3, a0, z, pv C extract remaining/partial a0 fields
309 vperm y3, c2, z, pv C extract remaining/partial c2 fields
312 vperm x3, a1, x3, pv C insert remaining/partial a1 fields
313 vperm y3, c0, y3, pv C insert remaining/partial c0 fields
316 vperm x3, a2, x3, pv C insert remaining/partial a2 fields
317 vperm y3, c1, y3, pv C insert remaining/partial c1 fields
319 C We now have 4 128-bit accumulators to sum
330 C Reduce 32-bit fields
333 li r7, -16 C FIXME: do all ppc32 ABIs...
334 stvx x0, r7, r1 C FIXME: ...support storing below sp?  (stores x0 at r1-16)
341 C load  |      v0       |      v1       |      v2       |
342 C acc   |      a0       |      a1       |      a2       |
343 C carry |      c0       |      c1       |      c2       |
344 C       | 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |  128
345 C       |---|---|---|---|---|---|---|---|---|---|---|---|   32
346 C       |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |   24
347 C       |     |     |     |     |     |     |     |     |   48
349 C $---------------$---------------$---------------$---------------$
350 C |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
351 C |_______________________________________________________________|
353 C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
357 C Permutation vectors in the order they are used above
C Each byte is a vperm selector: 0x00-0x0f pick a byte from the first source
C operand, 0x10-0x1f pick a byte from the second source operand.  The leading
C 0x10 of every word therefore takes byte 0 of the second operand, leaving
C 24 data bits per 32-bit word.
358 C # 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
359 .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
360 .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
361 .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
362 .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
363 .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
364 .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
365 C Masks for high end of number
C Four 16-byte masks keeping 16, 4, 8 and 12 leading bytes respectively.
C NOTE(review): presumably selected by a byte offset of (n mod 4)*16 -- confirm.
366 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
367 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
368 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
369 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
370 C Masks for low end of number
C These are commented out and so emit no data; see the TODO item about the
C low-end masking constants.
371 C .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
372 C .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
373 C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
374 C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff