From: Oleg Endo Date: Sun, 9 Jun 2013 21:32:37 +0000 (+0000) Subject: re PR target/6526 ([SH4] sdivsi3_i4 can clobber xd0/xd2) X-Git-Tag: upstream/12.2.0~69288 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0d00888247bcfe39791153f9aa1fd8734f58bc74;p=platform%2Fupstream%2Fgcc.git re PR target/6526 ([SH4] sdivsi3_i4 can clobber xd0/xd2) PR target/6526 * config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation. PR target/6526 * gcc.target/sh/pr6526.c: New. From-SVN: r199873 --- diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 0ba3af6..cae5502 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-06-09 Oleg Endo + + PR target/6526 + * gcc.target/sh/pr6526.c: New. + 2013-06-09 Jakub Jelinek PR target/57568 diff --git a/gcc/testsuite/gcc.target/sh/pr6526.c b/gcc/testsuite/gcc.target/sh/pr6526.c new file mode 100644 index 0000000..a7dd6d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/sh/pr6526.c @@ -0,0 +1,64 @@ +/* Check that the XF registers are not clobbered by an integer division + that is done using double precision FPU division. */ +/* { dg-do run { target "sh*-*-*" } } */ +/* { dg-options "-O1 -mdiv=call-fp" } */ +/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */ + +#include +#include + +extern void __set_fpscr (int); + +void +write_xf0 (float* f) +{ + __asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory"); +} + +void +read_xf0 (float* f) +{ + __asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory"); +} + +int __attribute__ ((noinline)) +test_00 (int a, int b) +{ + return a / b; +} + +unsigned int __attribute__ ((noinline)) +test_01 (unsigned a, unsigned b) +{ + return a / b; +} + +int __attribute__ ((noinline)) +test_02 (int x) +{ + return x & 0; +} + +int +main (void) +{ + float test_value; + int r = 0; + + /* Set FPSCR.FR to 1. 
*/ + __set_fpscr (0x200000); + + test_value = 123; + write_xf0 (&test_value); + r += test_00 (40, 4); + read_xf0 (&test_value); + assert (test_value == 123); + + test_value = 321; + write_xf0 (&test_value); + r += test_01 (50, 5); + read_xf0 (&test_value); + assert (test_value == 321); + + return test_02 (r); +} diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index 085432b..832b425 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,3 +1,9 @@ +2013-06-09 Oleg Endo + + PR target/6526 + * config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits + other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation. + 2013-06-08 Walter Lee * config/tilepro/atomic.h: Don't include stdint.h or features.h. diff --git a/libgcc/config/sh/lib1funcs.S b/libgcc/config/sh/lib1funcs.S index 5f0bbff..51addf3 100644 --- a/libgcc/config/sh/lib1funcs.S +++ b/libgcc/config/sh/lib1funcs.S @@ -1003,11 +1003,17 @@ hiset: sts macl,r0 ! r0 = bb*dd ENDFUNC(GLOBAL(mulsi3)) #endif #endif /* ! __SH5__ */ + +/*------------------------------------------------------------------------------ + 32 bit signed integer division that uses FPU double precision division. */ + #ifdef L_sdivsi3_i4 .title "SH DIVIDE" -!! 4 byte integer Divide code for the Renesas SH + #if defined (__SH4__) || defined (__SH2A__) -!! args in r4 and r5, result in fpul, clobber dr0, dr2 +/* This variant is used when FPSCR.PR = 1 (double precision) is the default + setting. + Args in r4 and r5, result in fpul, clobber dr0, dr2. */ .global GLOBAL(sdivsi3_i4) HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) @@ -1021,8 +1027,13 @@ GLOBAL(sdivsi3_i4): ftrc dr0,fpul ENDFUNC(GLOBAL(sdivsi3_i4)) + #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__) -!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2 +/* This variant is used when FPSCR.PR = 0 (single precision) is the default + setting. 
+ Args in r4 and r5, result in fpul, clobber r2, dr0, dr2. + For this to work, we must temporarily switch the FPU to double precision, + but we had better not touch FPSCR.FR. See PR 6526. */ #if ! __SH5__ || __SH5__ == 32 #if __SH5__ @@ -1031,24 +1042,43 @@ GLOBAL(sdivsi3_i4): .global GLOBAL(sdivsi3_i4) HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) GLOBAL(sdivsi3_i4): - sts.l fpscr,@-r15 - mov #8,r2 - swap.w r2,r2 - lds r2,fpscr - lds r4,fpul - float fpul,dr0 - lds r5,fpul - float fpul,dr2 - fdiv dr2,dr0 - ftrc dr0,fpul + +#ifndef __SH4A__ + mov.l r3,@-r15 + sts fpscr,r2 + mov #8,r3 + swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit) + or r2,r3 + lds r3,fpscr // Set FPSCR.PR = 1. + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + lds r2,fpscr rts - lds.l @r15+,fpscr + mov.l @r15+,r3 +#else +/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */ + fpchg + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + fpchg + +#endif /* __SH4A__ */ ENDFUNC(GLOBAL(sdivsi3_i4)) #endif /* ! __SH5__ || __SH5__ == 32 */ #endif /* ! __SH4__ || __SH2A__ */ -#endif +#endif /* L_sdivsi3_i4 */ +//------------------------------------------------------------------------------ #ifdef L_sdivsi3 /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with sh2e/sh3e code. */ @@ -1367,54 +1397,60 @@ div0: rts mov #0,r0 ENDFUNC(GLOBAL(sdivsi3)) -#endif /* ! __SHMEDIA__ */ -#endif -#ifdef L_udivsi3_i4 +#endif /* ! __SHMEDIA__ */ +#endif /* L_sdivsi3 */ + +/*------------------------------------------------------------------------------ + 32 bit unsigned integer division that uses FPU double precision division. */ +#ifdef L_udivsi3_i4 .title "SH DIVIDE" -!! 4 byte integer Divide code for the Renesas SH + #if defined (__SH4__) || defined (__SH2A__) -!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4, -!! 
and t bit +/* This variant is used when FPSCR.PR = 1 (double precision) is the default + setting. + Args in r4 and r5, result in fpul, + clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */ .global GLOBAL(udivsi3_i4) HIDDEN_FUNC(GLOBAL(udivsi3_i4)) GLOBAL(udivsi3_i4): - mov #1,r1 - cmp/hi r1,r5 - bf trivial - rotr r1 - xor r1,r4 - lds r4,fpul - mova L1,r0 + mov #1,r1 + cmp/hi r1,r5 + bf/s trivial + rotr r1 + xor r1,r4 + lds r4,fpul + mova L1,r0 #ifdef FMOVD_WORKS - fmov.d @r0+,dr4 + fmov.d @r0+,dr4 #else - fmov.s @r0+,DR40 - fmov.s @r0,DR41 + fmov.s @r0+,DR40 + fmov.s @r0,DR41 #endif - float fpul,dr0 - xor r1,r5 - lds r5,fpul - float fpul,dr2 - fadd dr4,dr0 - fadd dr4,dr2 - fdiv dr2,dr0 + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 rts - ftrc dr0,fpul + ftrc dr0,fpul trivial: rts - lds r4,fpul + lds r4,fpul .align 2 #ifdef FMOVD_WORKS - .align 3 ! make double below 8 byte aligned. + .align 3 // Make the double below 8 byte aligned. #endif L1: .double 2147483648 ENDFUNC(GLOBAL(udivsi3_i4)) + #elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__) #if ! __SH5__ || __SH5__ == 32 !! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33 @@ -1436,57 +1472,106 @@ GLOBAL(udivsi3_i4): ENDFUNC(GLOBAL(udivsi3_i4)) #endif /* ! __SH5__ || __SH5__ == 32 */ + #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) -!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 +/* This variant is used when FPSCR.PR = 0 (single precision) is the default + setting. + Args in r4 and r5, result in fpul, + clobber r0, r1, r4, r5, dr0, dr2, dr4. + For this to work, we must temporarily switch the FPU to double precision, + but we had better not touch FPSCR.FR. See PR 6526. 
*/ .global GLOBAL(udivsi3_i4) HIDDEN_FUNC(GLOBAL(udivsi3_i4)) GLOBAL(udivsi3_i4): - mov #1,r1 - cmp/hi r1,r5 - bf trivial - sts.l fpscr,@-r15 - mova L1,r0 - lds.l @r0+,fpscr - rotr r1 - xor r1,r4 - lds r4,fpul + +#ifndef __SH4A__ + mov #1,r1 + cmp/hi r1,r5 + bf/s trivial + rotr r1 // r1 = 1 << 31 + sts.l fpscr,@-r15 + xor r1,r4 + mov.l @(0,r15),r0 + xor r1,r5 + mov.l L2,r1 + lds r4,fpul + or r0,r1 + mova L1,r0 + lds r1,fpscr #ifdef FMOVD_WORKS - fmov.d @r0+,dr4 + fmov.d @r0+,dr4 #else - fmov.s @r0+,DR40 - fmov.s @r0,DR41 + fmov.s @r0+,DR40 + fmov.s @r0,DR41 #endif - float fpul,dr0 - xor r1,r5 - lds r5,fpul - float fpul,dr2 - fadd dr4,dr0 - fadd dr4,dr2 - fdiv dr2,dr0 - ftrc dr0,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul rts - lds.l @r15+,fpscr + lds.l @r15+,fpscr #ifdef FMOVD_WORKS - .align 3 ! make double below 8 byte aligned. + .align 3 // Make the double below 8 byte aligned. #endif trivial: rts - lds r4,fpul + lds r4,fpul .align 2 -L1: -#ifndef FMOVD_WORKS - .long 0x80000 +L2: +#ifdef FMOVD_WORKS + .long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1 #else - .long 0x180000 + .long 0x80000 // FPSCR.PR = 1 #endif +L1: + .double 2147483648 + +#else +/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. + Although on SH4A fmovd usually works, it would require either additional + two fschg instructions or an FPSCR push + pop. It's not worth the effort + for loading only one double constant. */ + mov #1,r1 + cmp/hi r1,r5 + bf/s trivial + rotr r1 // r1 = 1 << 31 + fpchg + mova L1,r0 + xor r1,r4 + fmov.s @r0+,DR40 + lds r4,fpul + fmov.s @r0,DR41 + xor r1,r5 + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + fpchg + +trivial: + rts + lds r4,fpul + + .align 2 +L1: .double 2147483648 +#endif /* __SH4A__ */ + + ENDFUNC(GLOBAL(udivsi3_i4)) #endif /* ! 
__SH4__ */ -#endif +#endif /* L_udivsi3_i4 */ #ifdef L_udivsi3 /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with