if (UNLIKELY(local_best_sad == 0xffff)) {
__m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
+ // Re-arrange the values of v_sad_d into v_loval_d as follows:
+ //   v_loval_d[0] = v_sad_d[0], v_loval_d[1] = v_sad_d[2]
+ //   v_loval_d[2] = v_sad_d[1], v_loval_d[3] = v_sad_d[3]
+ // v_loidx_d stores the corresponding indices 0, 2, 1, 3.
+ // This re-arrangement ensures that, when more than one element holds
+ // the minimum, the one with the lowest index is selected.
v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8);
v_loidx_d = _mm_set_epi32(3, 1, 2, 0);
v_hival_d = _mm_srli_si128(v_loval_d, 8);
v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
+ // Compare if v_sad_d[1] < v_sad_d[0], v_sad_d[3] < v_sad_d[2]
v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
v_hival_d = _mm_srli_si128(v_loval_d, 4);
v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
+ // Compare if min(v_sad_d[2], v_sad_d[3]) < min(v_sad_d[0], v_sad_d[1])
v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);