2 * Loongson Multimedia Instruction emulation helpers for QEMU.
4 * Copyright (c) 2011 Richard Henderson <rth@twiddle.net>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
/* If the byte ordering doesn't matter, i.e. all columns are treated
   identically, then this union can be used directly.  If byte ordering
   does matter, we generally avoid going through memory (the union) and
   operate on the 64-bit value with shifts and masks instead.  */
36 /* Some byte ordering issues can be mitigated by XORing the following
   into a lane index.  Under HOST_WORDS_BIGENDIAN the macro yields N, so
   "i ^ BYTE_ORDER_XOR(3)" maps 0,1,2,3 to 3,2,1,0 and
   "i ^ BYTE_ORDER_XOR(7)" maps 0..7 to 7..0.
   NOTE(review): the second, 0-valued definition presumably sits in an
   #else branch that is not visible in this view -- confirm.  */
37 #ifdef HOST_WORDS_BIGENDIAN
38 # define BYTE_ORDER_XOR(N) N
40 # define BYTE_ORDER_XOR(N) 0
/* Saturation helpers.
   SATSB clamps X to [-0x80, 0x7f]; SATSH clamps X to [-0x8000, 0x7fff].
   SATUB and SATUH clamp from above only (to 0xff and 0xffff): a negative
   X is returned unchanged, so callers that can produce a negative value
   must deal with the lower bound themselves.

   Every use of X is parenthesized so that an argument containing a
   low-precedence operator (e.g. "a & b") is compared as a whole.
   X is evaluated more than once; do not pass expressions with side
   effects.  */
#define SATSB(x)  ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
#define SATUB(x)  ((x) > 0xff ? 0xff : (x))

#define SATSH(x)  ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
#define SATUH(x)  ((x) > 0xffff ? 0xffff : (x))
50 (x < -0x80000000ll ? -0x80000000ll : x > 0x7fffffff ? 0x7fffffff : x)
/* Clamp X from above to 0xffffffff.  The comparison is made against an
   unsigned long long constant.  X is parenthesized at every use so that
   arguments such as "a & b" are compared as a whole; X is evaluated
   more than once, so it must be free of side effects.  */
#define SATUW(x)  ((x) > 0xffffffffull ? 0xffffffffull : (x))
/* helper_paddsb: lane-wise addition over the eight sb[] lanes of vs and
   vt.  Each sum is formed in an int temporary r.
   NOTE(review): the set-up of vs/vt from fs/ft and the statement that
   consumes r are not visible here; presumably r is clamped with SATSB
   before being stored -- confirm.  */
53 uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
60 for (i = 0; i < 8; ++i) {
61 int r = vs.sb[i] + vt.sb[i];
/* helper_paddusb: lane-wise addition over the eight ub[] lanes of vs and
   vt.  Each sum is formed in an int temporary r.
   NOTE(review): the set-up of vs/vt and the use of r are not visible;
   presumably r is clamped with SATUB before being stored -- confirm.  */
67 uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
74 for (i = 0; i < 8; ++i) {
75 int r = vs.ub[i] + vt.ub[i];
/* helper_paddsh: lane-wise addition over the four sh[] lanes of vs and
   vt.  Each sum is formed in an int temporary r.
   NOTE(review): the set-up of vs/vt and the use of r are not visible;
   presumably r is clamped with SATSH before being stored -- confirm.  */
81 uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
88 for (i = 0; i < 4; ++i) {
89 int r = vs.sh[i] + vt.sh[i];
/* helper_paddush: lane-wise addition over the four uh[] lanes of vs and
   vt.  Each sum is formed in an int temporary r.
   NOTE(review): the set-up of vs/vt and the use of r are not visible;
   presumably r is clamped with SATUH before being stored -- confirm.  */
95 uint64_t helper_paddush(uint64_t fs, uint64_t ft)
102 for (i = 0; i < 4; ++i) {
103 int r = vs.uh[i] + vt.uh[i];
/* helper_paddb: adds each of the eight ub[] lanes of vt into the matching
   lane of vs.  The sum is stored straight back into the lane, so it is
   narrowed to the lane's type with no clamping step.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
109 uint64_t helper_paddb(uint64_t fs, uint64_t ft)
116 for (i = 0; i < 8; ++i) {
117 vs.ub[i] += vt.ub[i];
/* helper_paddh: adds each of the four uh[] lanes of vt into the matching
   lane of vs.  The sum is stored straight back into the lane, so it is
   narrowed to the lane's type with no clamping step.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
122 uint64_t helper_paddh(uint64_t fs, uint64_t ft)
129 for (i = 0; i < 4; ++i) {
130 vs.uh[i] += vt.uh[i];
/* helper_paddw: adds each of the two uw[] lanes of vt into the matching
   lane of vs, with no clamping step on the visible line.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
135 uint64_t helper_paddw(uint64_t fs, uint64_t ft)
142 for (i = 0; i < 2; ++i) {
143 vs.uw[i] += vt.uw[i];
/* helper_psubsb: lane-wise subtraction (vs minus vt) over the eight sb[]
   lanes.  Each difference is formed in an int temporary r.
   NOTE(review): the set-up of vs/vt and the use of r are not visible;
   presumably r is clamped with SATSB before being stored -- confirm.  */
148 uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
155 for (i = 0; i < 8; ++i) {
156 int r = vs.sb[i] - vt.sb[i];
/* helper_psubusb: lane-wise subtraction (vs minus vt) over the eight ub[]
   lanes.  Each difference is formed in an int temporary r.
   NOTE(review): the use of r is not visible.  If ub[] lanes are narrower
   than int, r is negative whenever the vt lane is larger; SATUB clamps
   from above only, so if it is what is applied to r the negative case
   is not clamped to 0 -- confirm the intended result.  */
162 uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
169 for (i = 0; i < 8; ++i) {
170 int r = vs.ub[i] - vt.ub[i];
/* helper_psubsh: lane-wise subtraction (vs minus vt) over the four sh[]
   lanes.  Each difference is formed in an int temporary r.
   NOTE(review): the set-up of vs/vt and the use of r are not visible;
   presumably r is clamped with SATSH before being stored -- confirm.  */
176 uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
183 for (i = 0; i < 4; ++i) {
184 int r = vs.sh[i] - vt.sh[i];
/* helper_psubush: lane-wise subtraction (vs minus vt) over the four uh[]
   lanes.  Each difference is formed in an int temporary r.
   NOTE(review): the use of r is not visible.  If uh[] lanes are narrower
   than int, r is negative whenever the vt lane is larger; SATUH clamps
   from above only, so if it is what is applied to r the negative case
   is not clamped to 0 -- confirm the intended result.  */
190 uint64_t helper_psubush(uint64_t fs, uint64_t ft)
197 for (i = 0; i < 4; ++i) {
198 int r = vs.uh[i] - vt.uh[i];
/* helper_psubb: subtracts each of the eight ub[] lanes of vt from the
   matching lane of vs.  The difference is stored straight back into the
   lane, so it is narrowed to the lane's type with no clamping step.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
204 uint64_t helper_psubb(uint64_t fs, uint64_t ft)
211 for (i = 0; i < 8; ++i) {
212 vs.ub[i] -= vt.ub[i];
/* helper_psubh: subtracts each of the four uh[] lanes of vt from the
   matching lane of vs.  The difference is stored straight back into the
   lane, so it is narrowed to the lane's type with no clamping step.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
217 uint64_t helper_psubh(uint64_t fs, uint64_t ft)
224 for (i = 0; i < 4; ++i) {
225 vs.uh[i] -= vt.uh[i];
/* helper_psubw: subtracts each of the two uw[] lanes of vt from the
   matching lane of vs, with no clamping step on the visible line.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
230 uint64_t helper_psubw(uint64_t fs, uint64_t ft)
237 for (i = 0; i < 2; ++i) {
238 vs.uw[i] -= vt.uw[i];
/* helper_pshufh: shuffle of the four uh[] lanes of vs into vd.  ft is
   consumed two bits at a time (shifted right by 2 after each iteration);
   the current low two bits of ft select which lane of vs is copied into
   destination lane i.  */
243 uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
/* Both lane indices below are XORed with host so that lane numbering is
   mirrored whenever BYTE_ORDER_XOR(3) is non-zero.  */
245 unsigned host = BYTE_ORDER_XOR(3);
251 for (i = 0; i < 4; i++, ft >>= 2) {
252 vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host];
/* helper_packsswh: assembles fd from four 32-bit words, in order the low
   and high words of fs, then the low and high words of ft.  Each word is
   converted to int32_t and assigned to tmp; the low 16 bits of tmp are
   then OR-ed into fd at bit offsets 0, 16, 32 and 48.
   NOTE(review): the statement between each load and its OR is not
   visible; presumably tmp is clamped with SATSH there -- confirm.  The
   declaration of tmp is also not visible: it needs a 64-bit type for
   the shifts by 32 and 48 below to be well defined -- confirm.  */
257 uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
262 tmp = (int32_t)(fs >> 0);
264 fd |= (tmp & 0xffff) << 0;
266 tmp = (int32_t)(fs >> 32);
268 fd |= (tmp & 0xffff) << 16;
270 tmp = (int32_t)(ft >> 0);
272 fd |= (tmp & 0xffff) << 32;
274 tmp = (int32_t)(ft >> 32);
276 fd |= (tmp & 0xffff) << 48;
/* helper_packsshb: narrows eight 16-bit fields to bytes.  The four
   halfwords of fs supply result bytes 0-3; the four halfwords of ft
   supply result bytes 4-7 (note the "+ 32" in the second shift).  Each
   halfword is loaded as int16_t and its low 8 bits are OR-ed into fd.
   NOTE(review): the statement between each load and its OR is not
   visible; presumably tmp is clamped with SATSB there -- confirm.  */
281 uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
286 for (i = 0; i < 4; ++i) {
287 int16_t tmp = fs >> (i * 16);
289 fd |= (uint64_t)(tmp & 0xff) << (i * 8);
291 for (i = 0; i < 4; ++i) {
292 int16_t tmp = ft >> (i * 16);
294 fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
/* helper_packushb: narrows eight 16-bit fields to bytes.  The four
   halfwords of fs supply result bytes 0-3; the four halfwords of ft
   supply result bytes 4-7 (note the "+ 32" in the second shift).  Each
   halfword is loaded as a signed int16_t and its low 8 bits are OR-ed
   into fd.
   NOTE(review): the statement between each load and its OR is not
   visible.  If it applies SATUB, note that SATUB clamps from above only,
   so a negative tmp would pass through and only its low 8 bits would be
   kept -- confirm whether negative inputs should produce 0.  */
300 uint64_t helper_packushb(uint64_t fs, uint64_t ft)
305 for (i = 0; i < 4; ++i) {
306 int16_t tmp = fs >> (i * 16);
308 fd |= (uint64_t)(tmp & 0xff) << (i * 8);
310 for (i = 0; i < 4; ++i) {
311 int16_t tmp = ft >> (i * 16);
313 fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
/* Combine the low 32-bit words of the two operands: the low word of fs
   becomes the low word of the result and the low word of ft becomes the
   high word.  */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    const uint64_t low_word = (uint32_t)fs;
    const uint64_t high_word = ft << 32;

    return high_word | low_word;
}
/* Combine the high 32-bit words of the two operands: the high word of fs
   becomes the low word of the result and the high word of ft stays in
   the high word.  */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    uint64_t fd = ft >> 32;

    fd = (fd << 32) | (fs >> 32);
    return fd;
}
/* helper_punpcklhw: interleaves uh[] lanes 0 and 1 of vs and vt into vd.
   Before the host XOR is applied, destination lanes 0..3 receive
   vs[0], vt[0], vs[1], vt[1].
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
329 uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
/* XORed into every lane index to mirror lane numbering when non-zero.  */
331 unsigned host = BYTE_ORDER_XOR(3);
336 vd.uh[0 ^ host] = vs.uh[0 ^ host];
337 vd.uh[1 ^ host] = vt.uh[0 ^ host];
338 vd.uh[2 ^ host] = vs.uh[1 ^ host];
339 vd.uh[3 ^ host] = vt.uh[1 ^ host];
/* helper_punpckhhw: interleaves uh[] lanes 2 and 3 of vs and vt into vd.
   Before the host XOR is applied, destination lanes 0..3 receive
   vs[2], vt[2], vs[3], vt[3].
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
344 uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
/* XORed into every lane index to mirror lane numbering when non-zero.  */
346 unsigned host = BYTE_ORDER_XOR(3);
351 vd.uh[0 ^ host] = vs.uh[2 ^ host];
352 vd.uh[1 ^ host] = vt.uh[2 ^ host];
353 vd.uh[2 ^ host] = vs.uh[3 ^ host];
354 vd.uh[3 ^ host] = vt.uh[3 ^ host];
/* helper_punpcklbh: interleaves ub[] lanes 0..3 of vs and vt into vd.
   Before the host XOR is applied, even destination lanes take
   vs[0..3] and odd destination lanes take vt[0..3].
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
359 uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
/* XORed into every lane index to mirror lane numbering when non-zero.  */
361 unsigned host = BYTE_ORDER_XOR(7);
366 vd.ub[0 ^ host] = vs.ub[0 ^ host];
367 vd.ub[1 ^ host] = vt.ub[0 ^ host];
368 vd.ub[2 ^ host] = vs.ub[1 ^ host];
369 vd.ub[3 ^ host] = vt.ub[1 ^ host];
370 vd.ub[4 ^ host] = vs.ub[2 ^ host];
371 vd.ub[5 ^ host] = vt.ub[2 ^ host];
372 vd.ub[6 ^ host] = vs.ub[3 ^ host];
373 vd.ub[7 ^ host] = vt.ub[3 ^ host];
/* helper_punpckhbh: interleaves ub[] lanes 4..7 of vs and vt into vd.
   Before the host XOR is applied, even destination lanes take
   vs[4..7] and odd destination lanes take vt[4..7].
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
378 uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
/* XORed into every lane index to mirror lane numbering when non-zero.  */
380 unsigned host = BYTE_ORDER_XOR(7);
385 vd.ub[0 ^ host] = vs.ub[4 ^ host];
386 vd.ub[1 ^ host] = vt.ub[4 ^ host];
387 vd.ub[2 ^ host] = vs.ub[5 ^ host];
388 vd.ub[3 ^ host] = vt.ub[5 ^ host];
389 vd.ub[4 ^ host] = vs.ub[6 ^ host];
390 vd.ub[5 ^ host] = vt.ub[6 ^ host];
391 vd.ub[6 ^ host] = vs.ub[7 ^ host];
392 vd.ub[7 ^ host] = vt.ub[7 ^ host];
/* helper_pavgh: for each of the four uh[] lanes, stores
   (vs lane + vt lane + 1) >> 1 back into the vs lane, i.e. the average
   of the two lanes with halves rounded up.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
397 uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
404 for (i = 0; i < 4; i++) {
405 vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1;
/* helper_pavgb: for each of the eight ub[] lanes, stores
   (vs lane + vt lane + 1) >> 1 back into the vs lane, i.e. the average
   of the two lanes with halves rounded up.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
410 uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
417 for (i = 0; i < 8; i++) {
418 vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1;
/* helper_pmaxsh: for each of the four sh[] lanes, keeps the larger of the
   vs and vt lane in vs (the vs lane is kept when they are equal).
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
423 uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
430 for (i = 0; i < 4; i++) {
431 vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
/* helper_pminsh: for each of the four sh[] lanes, keeps the smaller of
   the vs and vt lane in vs (the vs lane is kept when they are equal).
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
436 uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
443 for (i = 0; i < 4; i++) {
444 vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
/*
 * PMAXUB: lane-wise unsigned maximum over the eight bytes of fs and ft.
 *
 * Each result byte is the larger of the two source bytes at the same
 * position, compared as unsigned 8-bit values.  Lanes are extracted
 * with shifts, so the result does not depend on host byte order.
 *
 * All eight byte lanes are processed: a 64-bit operand holds eight
 * bytes, and stopping after four lanes would pass half of fs through
 * unmodified.
 */
uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 8; i++) {
        unsigned shift = i * 8;
        uint8_t a = (uint8_t)(fs >> shift);
        uint8_t b = (uint8_t)(ft >> shift);

        fd |= (uint64_t)(a >= b ? a : b) << shift;
    }
    return fd;
}
/*
 * PMINUB: lane-wise unsigned minimum over the eight bytes of fs and ft.
 *
 * Each result byte is the smaller of the two source bytes at the same
 * position, compared as unsigned 8-bit values.  Lanes are extracted
 * with shifts, so the result does not depend on host byte order.
 *
 * All eight byte lanes are processed: a 64-bit operand holds eight
 * bytes, and stopping after four lanes would pass half of fs through
 * unmodified.
 */
uint64_t helper_pminub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 8; i++) {
        unsigned shift = i * 8;
        uint8_t a = (uint8_t)(fs >> shift);
        uint8_t b = (uint8_t)(ft >> shift);

        fd |= (uint64_t)(a <= b ? a : b) << shift;
    }
    return fd;
}
/* helper_pcmpeqw: equality test over the two uw[] lanes.  -(a == b) is
   -1 when the lanes match and 0 otherwise; that value is stored back
   into the vs lane as the lane's mask.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
475 uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
482 for (i = 0; i < 2; i++) {
483 vs.uw[i] = -(vs.uw[i] == vt.uw[i]);
/* helper_pcmpgtw: greater-than test over the two uw[] lanes.  -(a > b)
   is -1 when the vs lane is greater and 0 otherwise; that value is
   stored back into the vs lane as the lane's mask.
   NOTE(review): the comparison is made on the uw[] lanes; if those are
   unsigned this is an unsigned compare -- confirm that matches the
   instruction's definition (signed vs. unsigned).  */
488 uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
495 for (i = 0; i < 2; i++) {
496 vs.uw[i] = -(vs.uw[i] > vt.uw[i]);
/* helper_pcmpeqh: equality test over the four uh[] lanes.  -(a == b) is
   -1 when the lanes match and 0 otherwise; that value is stored back
   into the vs lane as the lane's mask.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
501 uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
508 for (i = 0; i < 4; i++) {
509 vs.uh[i] = -(vs.uh[i] == vt.uh[i]);
/* helper_pcmpgth: greater-than test over the four uh[] lanes.  -(a > b)
   is -1 when the vs lane is greater and 0 otherwise; that value is
   stored back into the vs lane as the lane's mask.
   NOTE(review): the comparison is made on the uh[] lanes; if those are
   unsigned this is an unsigned compare -- confirm that matches the
   instruction's definition (signed vs. unsigned).  */
514 uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
521 for (i = 0; i < 4; i++) {
522 vs.uh[i] = -(vs.uh[i] > vt.uh[i]);
/* helper_pcmpeqb: equality test over the eight ub[] lanes.  -(a == b) is
   -1 when the lanes match and 0 otherwise; that value is stored back
   into the vs lane as the lane's mask.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
527 uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
534 for (i = 0; i < 8; i++) {
535 vs.ub[i] = -(vs.ub[i] == vt.ub[i]);
/* helper_pcmpgtb: greater-than test over the eight ub[] lanes.  -(a > b)
   is -1 when the vs lane is greater and 0 otherwise; that value is
   stored back into the vs lane as the lane's mask.
   NOTE(review): the comparison is made on the ub[] lanes; if those are
   unsigned this is an unsigned compare -- confirm that matches the
   instruction's definition (signed vs. unsigned).  */
540 uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
547 for (i = 0; i < 8; i++) {
548 vs.ub[i] = -(vs.ub[i] > vt.ub[i]);
/* helper_psllw: only the signature and the header of a two-iteration
   loop are visible in this view.
   NOTE(review): presumably one iteration per 32-bit lane, with ft
   supplying a shift count -- the loop body is not visible, confirm.  */
553 uint64_t helper_psllw(uint64_t fs, uint64_t ft)
563 for (i = 0; i < 2; ++i) {
/* helper_psrlw: only the signature and the header of a two-iteration
   loop are visible in this view.
   NOTE(review): presumably one iteration per 32-bit lane, with ft
   supplying a shift count -- the loop body is not visible, confirm.  */
569 uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
579 for (i = 0; i < 2; ++i) {
/* helper_psraw: only the signature and the header of a two-iteration
   loop are visible in this view.
   NOTE(review): presumably one iteration per 32-bit lane, with ft
   supplying a shift count -- the loop body is not visible, confirm.  */
585 uint64_t helper_psraw(uint64_t fs, uint64_t ft)
595 for (i = 0; i < 2; ++i) {
/* helper_psllh: only the signature and the header of a four-iteration
   loop are visible in this view.
   NOTE(review): presumably one iteration per 16-bit lane, with ft
   supplying a shift count -- the loop body is not visible, confirm.  */
601 uint64_t helper_psllh(uint64_t fs, uint64_t ft)
611 for (i = 0; i < 4; ++i) {
/* helper_psrlh: only the signature and the header of a four-iteration
   loop are visible in this view.
   NOTE(review): presumably one iteration per 16-bit lane, with ft
   supplying a shift count -- the loop body is not visible, confirm.  */
617 uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
627 for (i = 0; i < 4; ++i) {
/* helper_psrah: only the signature and the header of a four-iteration
   loop are visible in this view.
   NOTE(review): presumably one iteration per 16-bit lane, with ft
   supplying a shift count -- the loop body is not visible, confirm.  */
633 uint64_t helper_psrah(uint64_t fs, uint64_t ft)
643 for (i = 0; i < 4; ++i) {
/* helper_pmullh: multiplies each of the four sh[] lanes of vs by the
   matching lane of vt.  The product is stored straight back into the
   lane, so only the part that fits the lane's type is kept.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
649 uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
656 for (i = 0; i < 4; ++i) {
657 vs.sh[i] *= vt.sh[i];
/* helper_pmulhh: multiplies the four sh[] lanes of vs and vt pairwise,
   forming each product in an int32_t temporary r.
   NOTE(review): the statement consuming r is not visible; presumably
   the upper half of r is what gets stored -- confirm.  */
662 uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
669 for (i = 0; i < 4; ++i) {
670 int32_t r = vs.sh[i] * vt.sh[i];
/* helper_pmulhuh: multiplies the four uh[] lanes of vs and vt pairwise,
   assigning each product to a uint32_t temporary r.
   NOTE(review): the statement consuming r is not visible; presumably
   the upper half of r is what gets stored -- confirm.
   NOTE(review): if uh[] is a 16-bit unsigned type, both operands are
   promoted to int before the multiply, and the product of two large
   lanes (e.g. 0xffff * 0xffff) exceeds INT_MAX, which is signed
   overflow.  Casting one operand to uint32_t would avoid that --
   confirm the lane type.  */
676 uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
683 for (i = 0; i < 4; ++i) {
684 uint32_t r = vs.uh[i] * vt.uh[i];
/* helper_pmaddhw: multiply-accumulate over the sh[] lanes.  p0 is the sum
   of the products of lanes 0 and 1 of vs and vt; p1 is the sum of the
   products of lanes 2 and 3 (lane numbers before the host XOR).  The
   result places p1 in the upper 32 bits and p0 in the lower bits.
   NOTE(review): the declarations of p0 and p1 are not visible.  p0 must
   be an unsigned 32-bit type for "| p0" not to sign-extend into the
   upper half of the result -- confirm.  */
690 uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
/* XORed into every lane index to mirror lane numbering when non-zero.  */
692 unsigned host = BYTE_ORDER_XOR(3);
698 p0 = vs.sh[0 ^ host] * vt.sh[0 ^ host];
699 p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
700 p1 = vs.sh[2 ^ host] * vt.sh[2 ^ host];
701 p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
703 return ((uint64_t)p1 << 32) | p0;
/* helper_pasubub: absolute difference over the eight ub[] lanes.  The
   difference vs lane minus vt lane is formed in an int, and its
   magnitude is stored back into the vs lane.
   NOTE(review): set-up of vs/vt and the return are not visible here.  */
706 uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
713 for (i = 0; i < 8; ++i) {
714 int r = vs.ub[i] - vt.ub[i];
/* Negate a negative difference so that the magnitude is stored.  */
715 vs.ub[i] = (r < 0 ? -r : r);
/* helper_biadd: sums the eight bytes of fs.  fd and i both start at 0;
   each iteration extracts byte i of fs with a shift and a 0xff mask and
   adds it to fd, so the total is at most 8 * 0xff.
   NOTE(review): the return statement is not visible in this view.  */
720 uint64_t helper_biadd(uint64_t fs)
724 for (i = fd = 0; i < 8; ++i) {
725 fd += (fs >> (i * 8)) & 0xff;
/* helper_pmovmskb: gathers the top bit of each byte of fs into an 8-bit
   mask.  Bit 7 of byte k of fs (bit 8*k + 7) is moved to bit k of fd,
   for k = 0..7.
   NOTE(review): the initialisation of fd and the return statement are
   not visible in this view.  */
730 uint64_t helper_pmovmskb(uint64_t fs)
734 fd |= ((fs >> 7) & 1) << 0;
735 fd |= ((fs >> 15) & 1) << 1;
736 fd |= ((fs >> 23) & 1) << 2;
737 fd |= ((fs >> 31) & 1) << 3;
738 fd |= ((fs >> 39) & 1) << 4;
739 fd |= ((fs >> 47) & 1) << 5;
740 fd |= ((fs >> 55) & 1) << 6;
741 fd |= ((fs >> 63) & 1) << 7;