1 /* The gcc-provided loongson intrinsic functions are way too fucking broken
2 * to be of any use, otherwise I'd use them.
4 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
5 * close enough that they could have implemented the _mm_*-style intrinsic
6 * interface and had a ton of optimized code available to them. Instead they
7 * implemented something much, much worse.
9 * - pshuf takes a dead first argument, causing extra instructions to be
12 * - There are no 64-bit shift or logical intrinsics, which means you have
13 * to implement them with inline assembly, but this is a nightmare because
14 * gcc doesn't understand that the integer vector datatypes are actually in
15 * floating-point registers, so you end up with braindead code like
17 * punpcklwd $f9,$f9,$f5
19 * punpcklwd $f19,$f19,$f5
24 * punpcklbh $f20,$f20,$f2
26 * where crap just gets copied back and forth between integer and floating-
 27 * point registers ad nauseam.
29 * Instead of trying to workaround the problems from these crap intrinsics, I
30 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
36 /* vectors are stored in 64-bit floating-point registers */
38 /* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
41 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
42 _mm_setzero_si64 (void)
47 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
48 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
51 asm("paddush %0, %1, %2\n\t"
53 : "f" (__m1), "f" (__m2)
58 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
59 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
62 asm("paddusb %0, %1, %2\n\t"
64 : "f" (__m1), "f" (__m2)
69 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_and_si64 (__m64 __m1, __m64 __m2)
73 asm("and %0, %1, %2\n\t"
75 : "f" (__m1), "f" (__m2)
80 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
87 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
90 asm("pmulhuh %0, %1, %2\n\t"
92 : "f" (__m1), "f" (__m2)
97 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
101 asm("pmullh %0, %1, %2\n\t"
103 : "f" (__m1), "f" (__m2)
108 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
109 _mm_or_si64 (__m64 __m1, __m64 __m2)
112 asm("or %0, %1, %2\n\t"
114 : "f" (__m1), "f" (__m2)
119 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
123 asm("packushb %0, %1, %2\n\t"
125 : "f" (__m1), "f" (__m2)
130 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
131 _mm_shuffle_pi16 (__m64 __m, int64_t __n)
134 asm("pshufh %0, %1, %2\n\t"
136 : "f" (__m), "f" (*(__m64 *)&__n)
141 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_slli_si64 (__m64 __m, int64_t __count)
145 asm("dsll %0, %1, %2\n\t"
147 : "f" (__m), "f" (*(__m64 *)&__count)
152 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_srli_pi16 (__m64 __m, int64_t __count)
156 asm("psrlh %0, %1, %2\n\t"
158 : "f" (__m), "f" (*(__m64 *)&__count)
163 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
164 _mm_srli_si64 (__m64 __m, int64_t __count)
167 asm("dsrl %0, %1, %2\n\t"
169 : "f" (__m), "f" (*(__m64 *)&__count)
174 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
178 asm("punpckhbh %0, %1, %2\n\t"
180 : "f" (__m1), "f" (__m2)
185 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
189 asm("punpcklbh %0, %1, %2\n\t"
191 : "f" (__m1), "f" (__m2)
196 /* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
197 * allows load8888 to use 32-bit loads */
198 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199 _mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
202 asm("punpcklbh %0, %1, %2\n\t"
204 : "f" (__m1), "f" (__m2)
209 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210 _mm_xor_si64 (__m64 __m1, __m64 __m2)
213 asm("xor %0, %1, %2\n\t"
215 : "f" (__m1), "f" (__m2)