1 /* The gcc-provided loongson intrinsic functions are way too fucking broken
2 * to be of any use, otherwise I'd use them.
4 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
5 * close enough that they could have implemented the _mm_*-style intrinsic
6 * interface and had a ton of optimized code available to them. Instead they
7 * implemented something much, much worse.
9 * - pshuf takes a dead first argument, causing extra instructions to be
12 * - There are no 64-bit shift or logical intrinsics, which means you have
13 * to implement them with inline assembly, but this is a nightmare because
14 * gcc doesn't understand that the integer vector datatypes are actually in
15 * floating-point registers, so you end up with braindead code like
17 * punpcklwd $f9,$f9,$f5
19 * punpcklwd $f19,$f19,$f5
24 * punpcklbh $f20,$f20,$f2
26 * where crap just gets copied back and forth between integer and floating-
 27 * point registers ad nauseam.
29 * Instead of trying to workaround the problems from these crap intrinsics, I
30 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
36 /* vectors are stored in 64-bit floating-point registers */
38 /* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
41 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
42 _mm_setzero_si64 (void)
47 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
48 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
51 asm("paddush %0, %1, %2\n\t"
53 : "f" (__m1), "f" (__m2)
58 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
59 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
62 asm("paddusb %0, %1, %2\n\t"
64 : "f" (__m1), "f" (__m2)
69 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_and_si64 (__m64 __m1, __m64 __m2)
73 asm("and %0, %1, %2\n\t"
75 : "f" (__m1), "f" (__m2)
80 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
87 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
90 asm("pmulhuh %0, %1, %2\n\t"
92 : "f" (__m1), "f" (__m2)
97 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
101 asm("pmullh %0, %1, %2\n\t"
103 : "f" (__m1), "f" (__m2)
108 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
109 _mm_or_si64 (__m64 __m1, __m64 __m2)
112 asm("or %0, %1, %2\n\t"
114 : "f" (__m1), "f" (__m2)
119 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
123 asm("packushb %0, %1, %2\n\t"
125 : "f" (__m1), "f" (__m2)
130 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
131 _mm_shuffle_pi16 (__m64 __m, int64_t __n)
134 asm("pshufh %0, %1, %2\n\t"
136 : "f" (__m), "f" (*(__m64 *)&__n)
141 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_slli_si64 (__m64 __m, int64_t __count)
145 asm("dsll %0, %1, %2\n\t"
147 : "f" (__m), "f" (*(__m64 *)&__count)
152 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_srli_pi16 (__m64 __m, int64_t __count)
156 asm("psrlh %0, %1, %2\n\t"
158 : "f" (__m), "f" (*(__m64 *)&__count)
163 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
164 _mm_srli_si64 (__m64 __m, int64_t __count)
167 asm("dsrl %0, %1, %2\n\t"
169 : "f" (__m), "f" (*(__m64 *)&__count)
174 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
178 asm("punpckhbh %0, %1, %2\n\t"
180 : "f" (__m1), "f" (__m2)
185 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
189 asm("punpcklbh %0, %1, %2\n\t"
191 : "f" (__m1), "f" (__m2)
196 /* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
197 * allows load8888 to use 32-bit loads */
198 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
199 _mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
202 asm("punpcklbh %0, %1, %2\n\t"
204 : "f" (__m1), "f" (__m2)
209 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210 _mm_xor_si64 (__m64 __m1, __m64 __m2)
213 asm("xor %0, %1, %2\n\t"
215 : "f" (__m1), "f" (__m2)