+Tue Jun 25 20:59:56 2002  Jörn Rennecke  <joern.rennecke@superh.com>
+
+ * config/sh/lib1funcs.asm (udivdi3): Make first divide step
+ produce a 32 bit result before normalization, then normalize with a
+ left shift. Compute approximate error of 2nd reciprocal
+ approximation in 2's complement. Fix mask generation from upper
+ longword of second divide stage result.
+ For large divisor, fix shift count used to truncate first stage
+ divide result; decide whether to adjust upwards based on comparison
+ of higher parts of normalized values.
+ (udivdi): Likewise. Undo normalization of result for large divisor
+ case.
+
2002-06-25  David S. Miller  <davem@redhat.com>

	* config/sparc/sparc.md: Change \\{t,n} to \{t,n}.
shlri r6,32,r7
bgt/u r9,r63,tr0 // large_divisor
mmulfx.w r5,r4,r4
- shlri r2,32,r19
- addi r20,14-1,r0
+ shlri r2,32+14,r19
+ addi r22,-31,r0
msub.w r1,r4,r1
mulu.l r1,r7,r4
addi r1,-3,r5
mulu.l r5,r19,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */
muls.l r1,r4,r4 /* leaving at least one sign bit. */
- shlrd r5,r0,r8
- mulu.l r8,r3,r5
+ mulu.l r5,r3,r8
mshalds.l r1,r21,r1
shari r4,26,r4
- shlli r5,32,r5
- sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
- sub r2,r5,r2
+ shlld r8,r0,r8
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r2,r8,r2
/* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
shlri r2,22,r21
mulu.l r21,r1,r21
+ shlld r5,r0,r8
addi r20,30-22,r0
- shlli r8,32,r8
shlrd r21,r0,r21
mulu.l r21,r3,r5
add r8,r21,r8
- mcmpeq.l r21,r63,r21 // See Note 1
+ mcmpgt.l r21,r63,r21 // See Note 1
addi r20,30,r0
mshfhi.l r63,r21,r21
sub r2,r5,r2
mulu.l r1,r7,r4
addi r1,-3,r5
mulu.l r5,r8,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */
muls.l r1,r4,r4 /* leaving at least one sign bit. */
- shlri r5,14-1+32,r8
+ shlri r5,14-1,r8
mulu.l r8,r7,r5
mshalds.l r1,r21,r1
shari r4,26,r4
- sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
sub r25,r5,r25
/* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
add r8,r21,r8
shlld r2,r0,r2
sub r25,r5,r25
- mextr4 r2,r25,r2
- bgtu/u r6,r2,tr0 // no_lo_adj
+ bgtu/u r7,r25,tr0 // no_lo_adj
addi r8,1,r8
- sub r2,r6,r2
+ sub r25,r7,r25
LOCAL(no_lo_adj):
+ mextr4 r2,r25,r2
/* large_divisor: only needs a few adjustments. */
mulu.l r8,r6,r5
shlri r6,32,r7
bgt/u r9,r63,tr0 // large_divisor
mmulfx.w r5,r4,r4
- shlri r2,32,r19
- addi r20,14-1,r0
+ shlri r2,32+14,r19
+ addi r22,-31,r0
msub.w r1,r4,r1
mulu.l r1,r7,r4
addi r1,-3,r5
mulu.l r5,r19,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */
muls.l r1,r4,r4 /* leaving at least one sign bit. */
- shlrd r5,r0,r8
- mulu.l r8,r3,r5
+ mulu.l r5,r3,r5
mshalds.l r1,r21,r1
shari r4,26,r4
- shlli r5,32,r5
- sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ shlld r5,r0,r5
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
sub r2,r5,r2
/* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
/* bubble */ /* could test r3 here to check for divide by zero. */
shlrd r21,r0,r21
mulu.l r21,r3,r5
- mcmpeq.l r21,r63,r21 // See Note 1
+ mcmpgt.l r21,r63,r21 // See Note 1
addi r20,30,r0
mshfhi.l r63,r21,r21
sub r2,r5,r2
mulu.l r1,r7,r4
addi r1,-3,r5
mulu.l r5,r8,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */
muls.l r1,r4,r4 /* leaving at least one sign bit. */
- shlri r5,14-1+32,r8
+ shlri r5,14-1,r8
mulu.l r8,r7,r5
mshalds.l r1,r21,r1
shari r4,26,r4
- sub r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
sub r25,r5,r25
/* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
add r8,r21,r8
shlld r2,r0,r2
sub r25,r5,r25
- mextr4 r2,r25,r2
- bgtu/u r6,r2,tr0 // no_lo_adj
+ bgtu/u r7,r25,tr0 // no_lo_adj
addi r8,1,r8
- sub r2,r6,r2
+ sub r25,r7,r25
LOCAL(no_lo_adj):
+ mextr4 r2,r25,r2
/* large_divisor: only needs a few adjustments. */
mulu.l r8,r6,r5
ptabs r18,tr0
- add r2,r3,r7
+ add r2,r6,r7
cmpgtu r5,r2,r8
cmvne r8,r7,r2
sub r2,r5,r2
+ shlrd r2,r22,r2
blink tr0,r63
/* Note 1: To shift the result of the second divide stage so that the result
always fits into 32 bits, yet we still reduce the rest sufficiently