2 * Loongson MMI optimizations for libjpeg-turbo
4 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
15 * 1. The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * 2. Altered source versions must be plainly marked as such, and must not be
20 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
24 #ifndef __LOONGSON_MMINTRIN_H__
25 #define __LOONGSON_MMINTRIN_H__
/* Attribute list attached to every intrinsic wrapper declared in this header:
   GNU89 inline semantics, forced inlining, and the "artificial" marking. */
30 #define FUNCTION_ATTRIBS \
31 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
34 /* Vectors are stored in 64-bit floating-point registers. */
37 /* Having a 32-bit datatype allows us to use 32-bit loads in places like
42 /********** Set Operations **********/
45 _mm_setzero_si64(void)
/* _mm_set_pi8: builds a vector from eight byte arguments (__b7 .. __b0).
   The even-numbered bytes (__b6, __b4, __b2, ...) are packed into `lo` and
   the odd-numbered bytes (__b7, __b5, __b3, ...) into `hi`, highest byte in
   the top 8 bits of each 32-bit word.  The asm then issues `punpcklbh` with
   $f0 as the second source.
   NOTE(review): presumably this interleaves `lo` and `hi` into the final
   byte order; the rest of the asm and its operand lists are not visible in
   this excerpt -- confirm. */
50 extern __inline __m64 FUNCTION_ATTRIBS
51 _mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
52 uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
55 uint32_t lo = ((uint32_t)__b6 << 24) |
56 ((uint32_t)__b4 << 16) |
57 ((uint32_t)__b2 << 8) |
59 uint32_t hi = ((uint32_t)__b7 << 24) |
60 ((uint32_t)__b5 << 16) |
61 ((uint32_t)__b3 << 8) |
66 "punpcklbh %0, %0, $f0\n\t"
/* _mm_set_pi16: builds a vector from four 16-bit arguments (__h3 .. __h0).
   `lo` holds __h2 (upper 16 bits) and __h0 (lower 16 bits); `hi` holds __h3
   (upper) and __h1 (lower).  The asm issues `punpcklhw` with $f0 as the
   second source.
   NOTE(review): presumably interleaves the two words into h3:h2:h1:h0 order;
   the remaining asm lines and operands are not visible here -- confirm. */
75 extern __inline __m64 FUNCTION_ATTRIBS
76 _mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
79 uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
80 uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
84 "punpcklhw %0, %0, $f0\n\t"
/* _MM_SHUFFLE: packs four selectors into one integer: fp3 is shifted to
   bit 6, fp2 to bit 4, fp1 to bit 2, and fp0 is OR-ed in unshifted.
   Arguments are not masked, so each is expected to fit in 2 bits. */
93 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
94 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
/* _mm_set_pi32: builds a vector with __i1 in the upper 32 bits and __i0 in
   the lower 32 bits.  Three paths are visible:
     1. both arguments are compile-time constants: compose a uint64_t and
        reinterpret its storage as __m64;
     2. __i1 == __i0: issue `pshufh` on __i1 with the selector
        _MM_SHUFFLE(1, 0, 1, 0);
     3. the final path composes the uint64_t exactly as in path 1.
   NOTE(review): the reinterpretations use pointer casts (uint64_t* -> __m64*,
   uint32_t* -> __m32*), which rely on the compiler tolerating this kind of
   type punning -- confirm build flags.  The asm output operand is not
   visible in this excerpt. */
96 extern __inline __m64 FUNCTION_ATTRIBS
97 _mm_set_pi32(uint32_t __i1, uint32_t __i0)
/* Constant-folding path. */
99 if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
100 uint64_t val = ((uint64_t)__i1 << 32) |
101 ((uint64_t)__i0 << 0);
103 return *(__m64 *)&val;
/* Equal halves: the selector is passed to pshufh through an FP register. */
104 } else if (__i1 == __i0) {
105 uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
108 asm("pshufh %0, %1, %2\n\t"
110 : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
/* General case: same 64-bit composition as the constant path. */
115 uint64_t val = ((uint64_t)__i1 << 32) |
116 ((uint64_t)__i0 << 0);
118 return *(__m64 *)&val;
/* _mm_set1_pi8: asm sequence that starts with `sll $8, %1, 8` (shifts
   operand 1 left by 8 into general register $8) and ends with
   `pshufh %0, %0, $f0`.
   NOTE(review): presumably replicates __b0 into every byte of the result;
   the intermediate instructions, operand lists and clobber list are not
   visible here -- confirm that $8 is declared as clobbered. */
122 extern __inline __m64 FUNCTION_ATTRIBS
123 _mm_set1_pi8(uint8_t __b0)
127 asm("sll $8, %1, 8\n\t"
131 "pshufh %0, %0, $f0\n\t"
/* _mm_set1_pi16: asm sequence that moves operand 1 into the result FP
   register with `mtc1` and then applies `pshufh %0, %0, $f0`.
   NOTE(review): presumably replicates __h0 into all four 16-bit lanes; the
   instruction that prepares $f0 and the operand lists are not visible in
   this excerpt -- confirm. */
140 extern __inline __m64 FUNCTION_ATTRIBS
141 _mm_set1_pi16(uint16_t __h0)
145 asm("mtc1 %1, %0\n\t"
147 "pshufh %0, %0, $f0\n\t"
/* _mm_set1_pi32: returns _mm_set_pi32(__i0, __i0), i.e. the same 32-bit
   value supplied for both halves of the vector. */
156 extern __inline __m64 FUNCTION_ATTRIBS
157 _mm_set1_pi32(unsigned __i0)
159 return _mm_set_pi32(__i0, __i0);
/* _mm_setr_pi8: reversed-argument form of _mm_set_pi8.  The first argument
   here (__h0) is forwarded as the last argument of _mm_set_pi8 and the last
   (__h7) as the first. */
162 extern __inline __m64 FUNCTION_ATTRIBS
163 _mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
164 uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
166 return _mm_set_pi8(__h7, __h6, __h5, __h4,
167 __h3, __h2, __h1, __h0);
/* _mm_setr_pi16: reversed-argument form of _mm_set_pi16; forwards
   (__w3, __w2, __w1, __w0) so that __w0 becomes the last argument. */
170 extern __inline __m64 FUNCTION_ATTRIBS
171 _mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
173 return _mm_set_pi16(__w3, __w2, __w1, __w0);
/* _mm_setr_pi32: reversed-argument form of _mm_set_pi32; forwards
   (__i1, __i0), so the first argument here ends up as _mm_set_pi32's
   second (low) argument. */
176 extern __inline __m64 FUNCTION_ATTRIBS
177 _mm_setr_pi32(uint32_t __i0, uint32_t __i1)
179 return _mm_set_pi32(__i1, __i0);
183 /********** Arithmetic Operations **********/
/* _mm_add_pi8: issues `paddb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte add; the output operand and
   return are not visible in this excerpt -- confirm. */
185 extern __inline __m64 FUNCTION_ATTRIBS
186 _mm_add_pi8(__m64 __m1, __m64 __m2)
190 asm("paddb %0, %1, %2\n\t"
192 : "f" (__m1), "f" (__m2)
/* _mm_add_pi16: issues `paddh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-16-bit-lane add; the output
   operand and return are not visible in this excerpt -- confirm. */
198 extern __inline __m64 FUNCTION_ATTRIBS
199 _mm_add_pi16(__m64 __m1, __m64 __m2)
203 asm("paddh %0, %1, %2\n\t"
205 : "f" (__m1), "f" (__m2)
/* _mm_add_pi32: issues `paddw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-32-bit-lane add; the output
   operand and return are not visible in this excerpt -- confirm. */
211 extern __inline __m64 FUNCTION_ATTRIBS
212 _mm_add_pi32(__m64 __m1, __m64 __m2)
216 asm("paddw %0, %1, %2\n\t"
218 : "f" (__m1), "f" (__m2)
/* _mm_add_si64: issues `paddd` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a single 64-bit add; the output operand
   and return are not visible in this excerpt -- confirm. */
224 extern __inline __m64 FUNCTION_ATTRIBS
225 _mm_add_si64(__m64 __m1, __m64 __m2)
229 asm("paddd %0, %1, %2\n\t"
231 : "f" (__m1), "f" (__m2)
/* _mm_adds_pi8: issues `paddsb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a signed saturating per-byte add; the
   output operand and return are not visible in this excerpt -- confirm. */
237 extern __inline __m64 FUNCTION_ATTRIBS
238 _mm_adds_pi8(__m64 __m1, __m64 __m2)
242 asm("paddsb %0, %1, %2\n\t"
244 : "f" (__m1), "f" (__m2)
/* _mm_adds_pi16: issues `paddsh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a signed saturating 16-bit-lane add;
   the output operand and return are not visible in this excerpt -- confirm. */
250 extern __inline __m64 FUNCTION_ATTRIBS
251 _mm_adds_pi16(__m64 __m1, __m64 __m2)
255 asm("paddsh %0, %1, %2\n\t"
257 : "f" (__m1), "f" (__m2)
/* _mm_adds_pu8: issues `paddusb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned saturating per-byte add;
   the output operand and return are not visible in this excerpt -- confirm. */
264 extern __inline __m64 FUNCTION_ATTRIBS
265 _mm_adds_pu8(__m64 __m1, __m64 __m2)
269 asm("paddusb %0, %1, %2\n\t"
271 : "f" (__m1), "f" (__m2)
/* _mm_adds_pu16: issues `paddush` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned saturating 16-bit-lane add;
   the output operand and return are not visible in this excerpt -- confirm. */
277 extern __inline __m64 FUNCTION_ATTRIBS
278 _mm_adds_pu16(__m64 __m1, __m64 __m2)
282 asm("paddush %0, %1, %2\n\t"
284 : "f" (__m1), "f" (__m2)
/* _mm_avg_pu8: issues `pavgb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte unsigned average; rounding
   behaviour and the output operand are not visible here -- confirm. */
290 extern __inline __m64 FUNCTION_ATTRIBS
291 _mm_avg_pu8(__m64 __m1, __m64 __m2)
295 asm("pavgb %0, %1, %2\n\t"
297 : "f" (__m1), "f" (__m2)
/* _mm_avg_pu16: issues `pavgh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-16-bit-lane unsigned average;
   rounding behaviour and the output operand are not visible here -- confirm. */
303 extern __inline __m64 FUNCTION_ATTRIBS
304 _mm_avg_pu16(__m64 __m1, __m64 __m2)
308 asm("pavgh %0, %1, %2\n\t"
310 : "f" (__m1), "f" (__m2)
/* _mm_madd_pi16: issues `pmaddhw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably multiplies 16-bit lanes and sums
   adjacent products into 32-bit lanes; the output operand and return are
   not visible in this excerpt -- confirm. */
316 extern __inline __m64 FUNCTION_ATTRIBS
317 _mm_madd_pi16(__m64 __m1, __m64 __m2)
321 asm("pmaddhw %0, %1, %2\n\t"
323 : "f" (__m1), "f" (__m2)
/* _mm_max_pi16: issues `pmaxsh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a signed 16-bit-lane maximum; the
   output operand and return are not visible in this excerpt -- confirm. */
329 extern __inline __m64 FUNCTION_ATTRIBS
330 _mm_max_pi16(__m64 __m1, __m64 __m2)
334 asm("pmaxsh %0, %1, %2\n\t"
336 : "f" (__m1), "f" (__m2)
/* _mm_max_pu8: issues `pmaxub` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned per-byte maximum; the
   output operand and return are not visible in this excerpt -- confirm. */
342 extern __inline __m64 FUNCTION_ATTRIBS
343 _mm_max_pu8(__m64 __m1, __m64 __m2)
347 asm("pmaxub %0, %1, %2\n\t"
349 : "f" (__m1), "f" (__m2)
/* _mm_min_pi16: issues `pminsh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a signed 16-bit-lane minimum; the
   output operand and return are not visible in this excerpt -- confirm. */
355 extern __inline __m64 FUNCTION_ATTRIBS
356 _mm_min_pi16(__m64 __m1, __m64 __m2)
360 asm("pminsh %0, %1, %2\n\t"
362 : "f" (__m1), "f" (__m2)
/* _mm_min_pu8: issues `pminub` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned per-byte minimum; the
   output operand and return are not visible in this excerpt -- confirm. */
368 extern __inline __m64 FUNCTION_ATTRIBS
369 _mm_min_pu8(__m64 __m1, __m64 __m2)
373 asm("pminub %0, %1, %2\n\t"
375 : "f" (__m1), "f" (__m2)
/* _mm_movemask_pi8: issues the two-operand `pmovmskb %0, %1` and is declared
   to return int.  NOTE(review): presumably gathers one bit per byte of __m1
   into a mask; the operand constraints and the conversion of the result to
   int are not visible in this excerpt -- confirm. */
381 extern __inline int FUNCTION_ATTRIBS
382 _mm_movemask_pi8(__m64 __m1)
386 asm("pmovmskb %0, %1\n\t"
/* _mm_mulhi_pi16: issues `pmulhh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably keeps the high 16 bits of each signed
   16x16 product; the output operand is not visible here -- confirm. */
394 extern __inline __m64 FUNCTION_ATTRIBS
395 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
399 asm("pmulhh %0, %1, %2\n\t"
401 : "f" (__m1), "f" (__m2)
/* _mm_mulhi_pu16: issues `pmulhuh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably keeps the high 16 bits of each unsigned
   16x16 product; the output operand is not visible here -- confirm. */
407 extern __inline __m64 FUNCTION_ATTRIBS
408 _mm_mulhi_pu16(__m64 __m1, __m64 __m2)
412 asm("pmulhuh %0, %1, %2\n\t"
414 : "f" (__m1), "f" (__m2)
/* _mm_mullo_pi16: issues `pmullh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably keeps the low 16 bits of each 16x16
   product; the output operand is not visible here -- confirm. */
420 extern __inline __m64 FUNCTION_ATTRIBS
421 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
425 asm("pmullh %0, %1, %2\n\t"
427 : "f" (__m1), "f" (__m2)
/* _mm_mul_pu32: issues `pmuluw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned 32x32 -> 64-bit multiply of
   the low words; the output operand is not visible here -- confirm. */
433 extern __inline __m64 FUNCTION_ATTRIBS
434 _mm_mul_pu32(__m64 __m1, __m64 __m2)
438 asm("pmuluw %0, %1, %2\n\t"
440 : "f" (__m1), "f" (__m2)
/* _mm_sad_pu8: issues `psadbh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a sum of absolute byte differences; the
   output operand and return are not visible in this excerpt -- confirm. */
446 extern __inline __m64 FUNCTION_ATTRIBS
447 _mm_sad_pu8(__m64 __m1, __m64 __m2)
451 asm("psadbh %0, %1, %2\n\t"
453 : "f" (__m1), "f" (__m2)
/* _mm_asub_pu8: issues `pasubub` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte unsigned absolute
   difference; the output operand is not visible here -- confirm. */
460 extern __inline __m64 FUNCTION_ATTRIBS
461 _mm_asub_pu8(__m64 __m1, __m64 __m2)
465 asm("pasubub %0, %1, %2\n\t"
467 : "f" (__m1), "f" (__m2)
/* _mm_biadd_pu8: issues `biadd` written with three operands, passing __m1
   and __m2 as "f" (FP-register) inputs.
   NOTE(review): presumably a horizontal sum of bytes; confirm against the
   ISA manual that `biadd` really accepts a second source operand.  The
   output operand is not visible in this excerpt. */
473 extern __inline __m64 FUNCTION_ATTRIBS
474 _mm_biadd_pu8(__m64 __m1, __m64 __m2)
478 asm("biadd %0, %1, %2\n\t"
480 : "f" (__m1), "f" (__m2)
/* _mm_sub_pi8: issues `psubb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte subtract (__m1 - __m2); the
   output operand and return are not visible in this excerpt -- confirm. */
486 extern __inline __m64 FUNCTION_ATTRIBS
487 _mm_sub_pi8(__m64 __m1, __m64 __m2)
491 asm("psubb %0, %1, %2\n\t"
493 : "f" (__m1), "f" (__m2)
/* _mm_sub_pi16: issues `psubh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-16-bit-lane subtract; the output
   operand and return are not visible in this excerpt -- confirm. */
499 extern __inline __m64 FUNCTION_ATTRIBS
500 _mm_sub_pi16(__m64 __m1, __m64 __m2)
504 asm("psubh %0, %1, %2\n\t"
506 : "f" (__m1), "f" (__m2)
/* _mm_sub_pi32: issues `psubw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-32-bit-lane subtract; the output
   operand and return are not visible in this excerpt -- confirm. */
512 extern __inline __m64 FUNCTION_ATTRIBS
513 _mm_sub_pi32(__m64 __m1, __m64 __m2)
517 asm("psubw %0, %1, %2\n\t"
519 : "f" (__m1), "f" (__m2)
/* _mm_sub_si64: issues `psubd` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a single 64-bit subtract; the output
   operand and return are not visible in this excerpt -- confirm. */
525 extern __inline __m64 FUNCTION_ATTRIBS
526 _mm_sub_si64(__m64 __m1, __m64 __m2)
530 asm("psubd %0, %1, %2\n\t"
532 : "f" (__m1), "f" (__m2)
/* _mm_subs_pi8: issues `psubsb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a signed saturating per-byte subtract;
   the output operand and return are not visible in this excerpt -- confirm. */
538 extern __inline __m64 FUNCTION_ATTRIBS
539 _mm_subs_pi8(__m64 __m1, __m64 __m2)
543 asm("psubsb %0, %1, %2\n\t"
545 : "f" (__m1), "f" (__m2)
/* _mm_subs_pi16: issues `psubsh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a signed saturating 16-bit-lane
   subtract; the output operand is not visible here -- confirm. */
551 extern __inline __m64 FUNCTION_ATTRIBS
552 _mm_subs_pi16(__m64 __m1, __m64 __m2)
556 asm("psubsh %0, %1, %2\n\t"
558 : "f" (__m1), "f" (__m2)
/* _mm_subs_pu8: issues `psubusb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned saturating per-byte
   subtract; the output operand is not visible here -- confirm. */
565 extern __inline __m64 FUNCTION_ATTRIBS
566 _mm_subs_pu8(__m64 __m1, __m64 __m2)
570 asm("psubusb %0, %1, %2\n\t"
572 : "f" (__m1), "f" (__m2)
/* _mm_subs_pu16: issues `psubush` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an unsigned saturating 16-bit-lane
   subtract; the output operand is not visible here -- confirm. */
578 extern __inline __m64 FUNCTION_ATTRIBS
579 _mm_subs_pu16(__m64 __m1, __m64 __m2)
583 asm("psubush %0, %1, %2\n\t"
585 : "f" (__m1), "f" (__m2)
592 /********** Logical Operations **********/
/* _mm_and_si64: issues `and` with __m1 and __m2 as "f" (FP-register)
   inputs, i.e. the FP-register form of the instruction rather than the
   integer one.  NOTE(review): presumably a 64-bit bitwise AND; the output
   operand and return are not visible in this excerpt -- confirm. */
594 extern __inline __m64 FUNCTION_ATTRIBS
595 _mm_and_si64(__m64 __m1, __m64 __m2)
599 asm("and %0, %1, %2\n\t"
601 : "f" (__m1), "f" (__m2)
/* _mm_andnot_si64: issues `andn` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably an AND with one input complemented;
   which of the two operands is inverted cannot be determined from this
   excerpt -- confirm against the ISA manual before relying on the order. */
607 extern __inline __m64 FUNCTION_ATTRIBS
608 _mm_andnot_si64(__m64 __m1, __m64 __m2)
612 asm("andn %0, %1, %2\n\t"
614 : "f" (__m1), "f" (__m2)
/* _mm_or_si32: issues `or` on two __m32 inputs held in FP registers ("f")
   and is declared to return __m64.  NOTE(review): the contents of the
   upper 32 bits of the result are not established by the visible lines;
   the output operand and return are not shown -- confirm. */
621 extern __inline __m64 FUNCTION_ATTRIBS
622 _mm_or_si32(__m32 __m1, __m32 __m2)
626 asm("or %0, %1, %2\n\t"
628 : "f" (__m1), "f" (__m2)
/* _mm_or_si64: issues `or` with __m1 and __m2 as "f" (FP-register) inputs.
   NOTE(review): presumably a 64-bit bitwise OR; the output operand and
   return are not visible in this excerpt -- confirm. */
634 extern __inline __m64 FUNCTION_ATTRIBS
635 _mm_or_si64(__m64 __m1, __m64 __m2)
639 asm("or %0, %1, %2\n\t"
641 : "f" (__m1), "f" (__m2)
/* _mm_xor_si64: issues `xor` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 64-bit bitwise XOR; the output
   operand and return are not visible in this excerpt -- confirm. */
647 extern __inline __m64 FUNCTION_ATTRIBS
648 _mm_xor_si64(__m64 __m1, __m64 __m2)
652 asm("xor %0, %1, %2\n\t"
654 : "f" (__m1), "f" (__m2)
661 /********** Shift Operations **********/
/* _mm_slli_pi16: issues `psllh` on __m with the shift count supplied in an
   FP register: the int64_t __count is reinterpreted as __m64 through a
   pointer cast.  NOTE(review): presumably a logical left shift of each
   16-bit lane; the pointer-cast punning and the unseen output operand
   should be confirmed. */
663 extern __inline __m64 FUNCTION_ATTRIBS
664 _mm_slli_pi16(__m64 __m, int64_t __count)
668 asm("psllh %0, %1, %2\n\t"
670 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_slli_pi32: issues `psllw` on __m with the shift count supplied in an
   FP register: the int64_t __count is reinterpreted as __m64 through a
   pointer cast.  NOTE(review): presumably a logical left shift of each
   32-bit lane; the output operand is not visible here -- confirm. */
676 extern __inline __m64 FUNCTION_ATTRIBS
677 _mm_slli_pi32(__m64 __m, int64_t __count)
681 asm("psllw %0, %1, %2\n\t"
683 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_slli_si64: issues `dsll` with FP-register operands: __m and the
   int64_t __count reinterpreted as __m64 through a pointer cast.
   NOTE(review): presumably a logical left shift of the whole 64-bit value;
   the output operand is not visible here -- confirm. */
689 extern __inline __m64 FUNCTION_ATTRIBS
690 _mm_slli_si64(__m64 __m, int64_t __count)
694 asm("dsll %0, %1, %2\n\t"
696 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_srli_pi16: issues `psrlh` on __m with the shift count supplied in an
   FP register: the int64_t __count is reinterpreted as __m64 through a
   pointer cast.  NOTE(review): presumably a logical right shift of each
   16-bit lane; the output operand is not visible here -- confirm. */
702 extern __inline __m64 FUNCTION_ATTRIBS
703 _mm_srli_pi16(__m64 __m, int64_t __count)
707 asm("psrlh %0, %1, %2\n\t"
709 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_srli_pi32: issues `psrlw` on __m with the shift count supplied in an
   FP register: the int64_t __count is reinterpreted as __m64 through a
   pointer cast.  NOTE(review): presumably a logical right shift of each
   32-bit lane; the output operand is not visible here -- confirm. */
715 extern __inline __m64 FUNCTION_ATTRIBS
716 _mm_srli_pi32(__m64 __m, int64_t __count)
720 asm("psrlw %0, %1, %2\n\t"
722 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_srli_si64: issues `dsrl` with FP-register operands: __m and the
   int64_t __count reinterpreted as __m64 through a pointer cast.
   NOTE(review): presumably a logical right shift of the whole 64-bit
   value; the output operand is not visible here -- confirm. */
728 extern __inline __m64 FUNCTION_ATTRIBS
729 _mm_srli_si64(__m64 __m, int64_t __count)
733 asm("dsrl %0, %1, %2\n\t"
735 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_srai_pi16: issues `psrah` on __m with the shift count supplied in an
   FP register: the int64_t __count is reinterpreted as __m64 through a
   pointer cast.  NOTE(review): presumably an arithmetic right shift of
   each 16-bit lane; the output operand is not visible here -- confirm. */
741 extern __inline __m64 FUNCTION_ATTRIBS
742 _mm_srai_pi16(__m64 __m, int64_t __count)
746 asm("psrah %0, %1, %2\n\t"
748 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_srai_pi32: issues `psraw` on __m with the shift count supplied in an
   FP register: the int64_t __count is reinterpreted as __m64 through a
   pointer cast.  NOTE(review): presumably an arithmetic right shift of
   each 32-bit lane; the output operand is not visible here -- confirm. */
754 extern __inline __m64 FUNCTION_ATTRIBS
755 _mm_srai_pi32(__m64 __m, int64_t __count)
759 asm("psraw %0, %1, %2\n\t"
761 : "f" (__m), "f" (*(__m64 *)&__count)
/* _mm_srai_si64: issues `dsra` with FP-register operands: __m and the
   int64_t __count reinterpreted as __m64 through a pointer cast.
   NOTE(review): presumably an arithmetic right shift of the whole 64-bit
   value; the output operand is not visible here -- confirm. */
767 extern __inline __m64 FUNCTION_ATTRIBS
768 _mm_srai_si64(__m64 __m, int64_t __count)
772 asm("dsra %0, %1, %2\n\t"
774 : "f" (__m), "f" (*(__m64 *)&__count)
781 /********** Conversion Intrinsics **********/
783 extern __inline __m64 FUNCTION_ATTRIBS
789 extern __inline uint64_t FUNCTION_ATTRIBS
792 return *(uint64_t *)&x;
796 /********** Comparison Intrinsics **********/
/* _mm_cmpeq_pi8: issues `pcmpeqb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte equality compare producing a
   lane mask; the output operand is not visible here -- confirm. */
798 extern __inline __m64 FUNCTION_ATTRIBS
799 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
803 asm("pcmpeqb %0, %1, %2\n\t"
805 : "f" (__m1), "f" (__m2)
/* _mm_cmpeq_pi16: issues `pcmpeqh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 16-bit-lane equality compare
   producing a lane mask; the output operand is not visible here -- confirm. */
811 extern __inline __m64 FUNCTION_ATTRIBS
812 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
816 asm("pcmpeqh %0, %1, %2\n\t"
818 : "f" (__m1), "f" (__m2)
/* _mm_cmpeq_pi32: issues `pcmpeqw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 32-bit-lane equality compare
   producing a lane mask; the output operand is not visible here -- confirm. */
824 extern __inline __m64 FUNCTION_ATTRIBS
825 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
829 asm("pcmpeqw %0, %1, %2\n\t"
831 : "f" (__m1), "f" (__m2)
/* _mm_cmpgt_pi8: issues `pcmpgtb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte signed greater-than compare
   (__m1 > __m2); the output operand is not visible here -- confirm. */
837 extern __inline __m64 FUNCTION_ATTRIBS
838 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
842 asm("pcmpgtb %0, %1, %2\n\t"
844 : "f" (__m1), "f" (__m2)
/* _mm_cmpgt_pi16: issues `pcmpgth` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 16-bit-lane signed greater-than
   compare; the output operand is not visible here -- confirm. */
850 extern __inline __m64 FUNCTION_ATTRIBS
851 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
855 asm("pcmpgth %0, %1, %2\n\t"
857 : "f" (__m1), "f" (__m2)
/* _mm_cmpgt_pi32: issues `pcmpgtw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 32-bit-lane signed greater-than
   compare; the output operand is not visible here -- confirm. */
863 extern __inline __m64 FUNCTION_ATTRIBS
864 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
868 asm("pcmpgtw %0, %1, %2\n\t"
870 : "f" (__m1), "f" (__m2)
/* _mm_cmplt_pi8: issues `pcmpltb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a per-byte signed less-than compare
   (__m1 < __m2); the output operand is not visible here -- confirm. */
876 extern __inline __m64 FUNCTION_ATTRIBS
877 _mm_cmplt_pi8(__m64 __m1, __m64 __m2)
881 asm("pcmpltb %0, %1, %2\n\t"
883 : "f" (__m1), "f" (__m2)
/* _mm_cmplt_pi16: issues `pcmplth` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 16-bit-lane signed less-than compare;
   the output operand is not visible here -- confirm. */
889 extern __inline __m64 FUNCTION_ATTRIBS
890 _mm_cmplt_pi16(__m64 __m1, __m64 __m2)
894 asm("pcmplth %0, %1, %2\n\t"
896 : "f" (__m1), "f" (__m2)
/* _mm_cmplt_pi32: issues `pcmpltw` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably a 32-bit-lane signed less-than compare;
   the output operand is not visible here -- confirm. */
902 extern __inline __m64 FUNCTION_ATTRIBS
903 _mm_cmplt_pi32(__m64 __m1, __m64 __m2)
907 asm("pcmpltw %0, %1, %2\n\t"
909 : "f" (__m1), "f" (__m2)
916 /********** Miscellaneous Operations **********/
/* _mm_packs_pi16: issues `packsshb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably narrows 16-bit lanes to bytes with
   signed saturation; lane ordering of the two sources and the output
   operand are not visible here -- confirm. */
918 extern __inline __m64 FUNCTION_ATTRIBS
919 _mm_packs_pi16(__m64 __m1, __m64 __m2)
923 asm("packsshb %0, %1, %2\n\t"
925 : "f" (__m1), "f" (__m2)
/* _mm_packs_pi32: issues `packsswh` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably narrows 32-bit lanes to 16-bit lanes
   with signed saturation; the output operand is not visible here --
   confirm. */
931 extern __inline __m64 FUNCTION_ATTRIBS
932 _mm_packs_pi32(__m64 __m1, __m64 __m2)
936 asm("packsswh %0, %1, %2\n\t"
938 : "f" (__m1), "f" (__m2)
/* _mm_packs_pi32_f: issues `packsswh` with __m1 and __m2 as "f"
   (FP-register) inputs.  The visible lines are identical to those of
   _mm_packs_pi32.  NOTE(review): any difference between the two variants
   must lie in lines outside this excerpt -- confirm. */
944 extern __inline __m64 FUNCTION_ATTRIBS
945 _mm_packs_pi32_f(__m64 __m1, __m64 __m2)
949 asm("packsswh %0, %1, %2\n\t"
951 : "f" (__m1), "f" (__m2)
/* _mm_packs_pu16: issues `packushb` with __m1 and __m2 as "f" (FP-register)
   inputs.  NOTE(review): presumably narrows 16-bit lanes to bytes with
   unsigned saturation; the output operand is not visible here -- confirm. */
957 extern __inline __m64 FUNCTION_ATTRIBS
958 _mm_packs_pu16(__m64 __m1, __m64 __m2)
962 asm("packushb %0, %1, %2\n\t"
964 : "f" (__m1), "f" (__m2)
/* _mm_extract_pi16: issues `pextrh` on __m with the lane index supplied in
   an FP register: the int64_t __pos is reinterpreted as __m64 through a
   pointer cast.  The result is returned as __m64, not as an integer.
   NOTE(review): presumably selects one 16-bit lane; the output operand is
   not visible here -- confirm. */
970 extern __inline __m64 FUNCTION_ATTRIBS
971 _mm_extract_pi16(__m64 __m, int64_t __pos)
975 asm("pextrh %0, %1, %2\n\t"
977 : "f" (__m), "f" (*(__m64 *)&__pos)
/* _mm_insert_pi16: contains four asm statements, one per opcode
   `pinsrh_0` .. `pinsrh_3`, each taking __m1 and __m2 as "f" (FP-register)
   inputs.
   NOTE(review): presumably the dispatch (not visible in this excerpt)
   selects the opcode from __pos, so the lane index is encoded in the
   mnemonic.  __pos is also listed as an "i" (immediate) input, but the
   visible templates only reference %0..%2 -- confirm whether that operand
   is intentional and that __pos is always a compile-time constant. */
983 extern __inline __m64 FUNCTION_ATTRIBS
984 _mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
991 asm("pinsrh_0 %0, %1, %2\n\t"
993 : "f" (__m1), "f" (__m2), "i" (__pos)
1000 asm("pinsrh_1 %0, %1, %2\n\t"
1002 : "f" (__m1), "f" (__m2), "i" (__pos)
1008 asm("pinsrh_2 %0, %1, %2\n\t"
1010 : "f" (__m1), "f" (__m2), "i" (__pos)
1017 asm("pinsrh_3 %0, %1, %2\n\t"
1019 : "f" (__m1), "f" (__m2), "i" (__pos)
/* _mm_shuffle_pi16: issues `pshufh` on __m with the selector supplied in an
   FP register: the int64_t __n is reinterpreted as __m64 through a pointer
   cast.  Callers in this header build __n with _MM_SHUFFLE.
   NOTE(review): presumably permutes the four 16-bit lanes according to the
   2-bit fields of __n; the output operand is not visible here -- confirm. */
1028 extern __inline __m64 FUNCTION_ATTRIBS
1029 _mm_shuffle_pi16(__m64 __m, int64_t __n)
1033 asm("pshufh %0, %1, %2\n\t"
1035 : "f" (__m), "f" (*(__m64 *)&__n)
/* _mm_unpackhi_pi8: issues `punpckhbh` with __m1 and __m2 as "f"
   (FP-register) inputs.  NOTE(review): presumably interleaves the bytes of
   the high halves of the two inputs; the output operand is not visible
   here -- confirm. */
1041 extern __inline __m64 FUNCTION_ATTRIBS
1042 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
1046 asm("punpckhbh %0, %1, %2\n\t"
1048 : "f" (__m1), "f" (__m2)
/* _mm_unpackhi_pi8_f: issues `punpckhbh` with __m1 and __m2 as "f"
   (FP-register) inputs.  The visible lines are identical to those of
   _mm_unpackhi_pi8.  NOTE(review): any difference between the variants
   must lie in lines outside this excerpt -- confirm. */
1054 extern __inline __m64 FUNCTION_ATTRIBS
1055 _mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
1059 asm("punpckhbh %0, %1, %2\n\t"
1061 : "f" (__m1), "f" (__m2)
/* _mm_unpackhi_pi16: issues `punpckhhw` with __m1 and __m2 as "f"
   (FP-register) inputs.  NOTE(review): presumably interleaves the 16-bit
   lanes of the high halves of the two inputs; the output operand is not
   visible here -- confirm. */
1067 extern __inline __m64 FUNCTION_ATTRIBS
1068 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
1072 asm("punpckhhw %0, %1, %2\n\t"
1074 : "f" (__m1), "f" (__m2)
/* _mm_unpackhi_pi16_f: issues `punpckhhw` with __m1 and __m2 as "f"
   (FP-register) inputs.  The visible lines are identical to those of
   _mm_unpackhi_pi16.  NOTE(review): any difference between the variants
   must lie in lines outside this excerpt -- confirm. */
1080 extern __inline __m64 FUNCTION_ATTRIBS
1081 _mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
1085 asm("punpckhhw %0, %1, %2\n\t"
1087 : "f" (__m1), "f" (__m2)
/* _mm_unpackhi_pi32: issues `punpckhwd` with __m1 and __m2 as "f"
   (FP-register) inputs.  NOTE(review): presumably combines the high 32-bit
   words of the two inputs; the output operand is not visible here --
   confirm. */
1093 extern __inline __m64 FUNCTION_ATTRIBS
1094 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
1098 asm("punpckhwd %0, %1, %2\n\t"
1100 : "f" (__m1), "f" (__m2)
/* _mm_unpacklo_pi8: issues `punpcklbh` with __m1 and __m2 as "f"
   (FP-register) inputs.  NOTE(review): presumably interleaves the bytes of
   the low halves of the two inputs; the output operand is not visible
   here -- confirm. */
1106 extern __inline __m64 FUNCTION_ATTRIBS
1107 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
1111 asm("punpcklbh %0, %1, %2\n\t"
1113 : "f" (__m1), "f" (__m2)
1119 /* Since punpcklbh cares about the high 32 bits, we use the __m64 datatype,
1120 which preserves the data. */
/* _mm_unpacklo_pi8_f64: issues `punpcklbh` with both inputs typed __m64 and
   passed in FP registers ("f").  NOTE(review): the output operand and
   return are not visible in this excerpt -- confirm. */
1122 extern __inline __m64 FUNCTION_ATTRIBS
1123 _mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
1127 asm("punpcklbh %0, %1, %2\n\t"
1129 : "f" (__m1), "f" (__m2)
1135 /* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
1136 datatype, which allows load8888 to use 32-bit loads. */
/* _mm_unpacklo_pi8_f: issues `punpcklbh` with a 32-bit first input (__m32)
   and a 64-bit second input (__m64), both passed in FP registers ("f").
   NOTE(review): the output operand and return are not visible in this
   excerpt -- confirm. */
1138 extern __inline __m64 FUNCTION_ATTRIBS
1139 _mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
1143 asm("punpcklbh %0, %1, %2\n\t"
1145 : "f" (__m1), "f" (__m2)
/* _mm_unpacklo_pi16: issues `punpcklhw` with __m1 and __m2 as "f"
   (FP-register) inputs.  NOTE(review): presumably interleaves the 16-bit
   lanes of the low halves of the two inputs; the output operand is not
   visible here -- confirm. */
1151 extern __inline __m64 FUNCTION_ATTRIBS
1152 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
1156 asm("punpcklhw %0, %1, %2\n\t"
1158 : "f" (__m1), "f" (__m2)
/* _mm_unpacklo_pi16_f: issues `punpcklhw` with __m1 and __m2 as "f"
   (FP-register) inputs.  The visible lines are identical to those of
   _mm_unpacklo_pi16.  NOTE(review): any difference between the variants
   must lie in lines outside this excerpt -- confirm. */
1164 extern __inline __m64 FUNCTION_ATTRIBS
1165 _mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
1169 asm("punpcklhw %0, %1, %2\n\t"
1171 : "f" (__m1), "f" (__m2)
/* _mm_unpacklo_pi32: issues `punpcklwd` with __m1 and __m2 as "f"
   (FP-register) inputs.  NOTE(review): presumably combines the low 32-bit
   words of the two inputs; the output operand is not visible here --
   confirm. */
1177 extern __inline __m64 FUNCTION_ATTRIBS
1178 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
1182 asm("punpcklwd %0, %1, %2\n\t"
1184 : "f" (__m1), "f" (__m2)
/* _mm_unpacklo_pi32_f: issues `punpcklwd` with __m1 and __m2 as "f"
   (FP-register) inputs.  The visible lines are identical to those of
   _mm_unpacklo_pi32.  NOTE(review): any difference between the variants
   must lie in lines outside this excerpt -- confirm. */
1191 extern __inline __m64 FUNCTION_ATTRIBS
1192 _mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
1196 asm("punpcklwd %0, %1, %2\n\t"
1198 : "f" (__m1), "f" (__m2)
/* _mm_store_pi32: first narrows `src` with _mm_packs_pu16(src, zero), then
   stores through `dest` with `swc1` (operand 0 is the destination, operand
   1 the value).  Note that this is therefore not a plain 32-bit store: the
   value is packed before being written.
   NOTE(review): the asm operand constraints are not visible in this
   excerpt -- confirm the memory operand and any clobbers. */
1204 extern __inline void FUNCTION_ATTRIBS
1205 _mm_store_pi32(__m32 *dest, __m64 src)
1207 src = _mm_packs_pu16(src, _mm_setzero_si64());
1209 asm("swc1 %1, %0\n\t"
/* _mm_store_si64: stores `src` through `dest` with the instruction pair
   `gssdlc1` (at offset 7 from operand 0) and `gssdrc1` (at operand 0).
   NOTE(review): presumably a left/right pair that permits an unaligned
   64-bit store; the operand constraints are not visible in this excerpt --
   confirm. */
1216 extern __inline void FUNCTION_ATTRIBS
1217 _mm_store_si64(__m64 *dest, __m64 src)
1219 asm("gssdlc1 %1, 7+%0\n\t"
1220 "gssdrc1 %1, %0\n\t"
/* _mm_load_si32: loads from `src` into an FP register with `lwc1` and is
   declared to return __m64.  NOTE(review): the contents of the upper 32
   bits of the result, and the operand constraints, are not established by
   the visible lines -- confirm. */
1227 extern __inline __m64 FUNCTION_ATTRIBS
1228 _mm_load_si32(const __m32 *src)
1232 asm("lwc1 %0, %1\n\t"
/* _mm_load_si64: loads from `src` into an FP register with `ldc1`.
   NOTE(review): presumably `src` must be suitably aligned for `ldc1` (the
   store counterpart uses an unaligned-capable pair instead); the operand
   constraints are not visible in this excerpt -- confirm. */
1240 extern __inline __m64 FUNCTION_ATTRIBS
1241 _mm_load_si64(const __m64 *src)
1245 asm("ldc1 %0, %1\n\t"
/* _mm_loadlo_pi8: reads one __m32 from `src` and returns
   _mm_unpacklo_pi8_f(value, zero).
   NOTE(review): the cast `(__m32 *)src` drops the `const` qualifier and
   reinterprets a uint32_t object as __m32; harmless for a read, but a
   `const __m32 *` cast would be cleaner -- confirm aliasing assumptions. */
1253 extern __inline __m64 FUNCTION_ATTRIBS
1254 _mm_loadlo_pi8(const uint32_t *src)
1256 return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
/* _mm_loadlo_pi8_f: returns _mm_unpacklo_pi8_f64(src, zero), i.e. `src` is
   the first operand and an all-zero vector the second. */
1259 extern __inline __m64 FUNCTION_ATTRIBS
1260 _mm_loadlo_pi8_f(__m64 src)
1262 return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
/* _mm_loadhi_pi8_f: returns _mm_unpackhi_pi8_f(src, zero), i.e. `src` is
   the first operand and an all-zero vector the second. */
1265 extern __inline __m64 FUNCTION_ATTRIBS
1266 _mm_loadhi_pi8_f(__m64 src)
1268 return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
/* _mm_loadlo_pi16: returns _mm_unpacklo_pi16(src, zero), i.e. `src` is the
   first operand and an all-zero vector the second. */
1271 extern __inline __m64 FUNCTION_ATTRIBS
1272 _mm_loadlo_pi16(__m64 src)
1274 return _mm_unpacklo_pi16(src, _mm_setzero_si64());
/* _mm_loadlo_pi16_f: returns _mm_unpacklo_pi16_f(zero, src).  Unlike
   _mm_loadlo_pi16, the all-zero vector is the FIRST operand and `src` the
   second.  NOTE(review): presumably this places `src` lanes in the other
   half of each widened element -- confirm against callers. */
1277 extern __inline __m64 FUNCTION_ATTRIBS
1278 _mm_loadlo_pi16_f(__m64 src)
1280 return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
/* _mm_loadhi_pi16: returns _mm_unpackhi_pi16(src, zero), i.e. `src` is the
   first operand and an all-zero vector the second. */
1283 extern __inline __m64 FUNCTION_ATTRIBS
1284 _mm_loadhi_pi16(__m64 src)
1286 return _mm_unpackhi_pi16(src, _mm_setzero_si64());
/* _mm_loadhi_pi16_f: returns _mm_unpackhi_pi16_f(zero, src).  Unlike
   _mm_loadhi_pi16, the all-zero vector is the FIRST operand and `src` the
   second.  NOTE(review): presumably this places `src` lanes in the other
   half of each widened element -- confirm against callers. */
1289 extern __inline __m64 FUNCTION_ATTRIBS
1290 _mm_loadhi_pi16_f(__m64 src)
1292 return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
/* _mm_expand_alpha: returns _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3)),
   i.e. a shuffle whose four selectors all name lane 3.
   NOTE(review): presumably lane 3 holds the alpha channel of a pixel that
   has been widened to 16 bits per channel -- confirm against callers. */
1295 extern __inline __m64 FUNCTION_ATTRIBS
1296 _mm_expand_alpha(__m64 pixel)
1298 return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
/* _mm_expand_alpha_rev: returns
   _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0)), i.e. a shuffle whose
   four selectors all name lane 0.
   NOTE(review): presumably for pixel layouts where alpha sits in lane 0 --
   confirm against callers. */
1301 extern __inline __m64 FUNCTION_ATTRIBS
1302 _mm_expand_alpha_rev(__m64 pixel)
1304 return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
1307 #endif /* __LOONGSON_MMINTRIN_H__ */