_mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
(int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+// octa_set_epi16(a, ..., h): builds a __m128i from eight values by casting
+// each argument to int16_t and forwarding them, in the order given, to
+// _mm_setr_epi16. Each argument is parenthesized and evaluated exactly once.
+// NOTE(review): the explicit casts presumably exist to make the narrowing
+// from wider integer constants (e.g. the cospi_N_64 values) intentional and
+// warning-free -- confirm against the constants' declared type.
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
-#define WRAPLOW(x, bd) (x)
+#define WRAPLOW(x, bd) ((int32_t)(x))
#endif // CONFIG_EMULATE_HARDWARE
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
// These are the coefficients used for the multiplies.
// In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
// where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
cospi_8_64, cospi_24_64,
cospi_24_64, -cospi_8_64,
cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
cospi_24_64, -cospi_8_64,
cospi_8_64, cospi_24_64,
cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
cospi_8_64, cospi_24_64,
-cospi_8_64, -cospi_24_64,
-cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
cospi_24_64, -cospi_8_64,
-cospi_24_64, cospi_8_64,
-cospi_24_64, cospi_8_64);
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
- int64_t sum_long = 0;
+ int32_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
- int64_t sum_long = 0;
+ int32_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
}
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}