// Computes the four horizontal and the four vertical cubic interpolation
// weights for a sample located at the fractional offset (dx, dy).
//
// For a distance d the weight w(d) is evaluated as
//   1 - 2*d^2 + d^3          for the distances dx, 1-dx, dy, 1-dy
//   4 - 8*d + 5*d^2 - d^3    for the distances 1+dx, 2-dx, 1+dy, 2-dy
// which are the two pieces of the cubic convolution kernel with a = -1.
//
// dx, dy : fractional offsets. The choice of polynomial piece per lane
//          presumably assumes 0 <= dx, dy <= 1; this is not checked here.
// fx     : receives { w(1+dx), w(dx), w(1-dx), w(2-dx) }.
// fy     : receives { w(1+dy), w(dy), w(1-dy), w(2-dy) }.
//
// fx and fy must each point to 4 writable floats. They do not have to be
// 16-byte aligned (unaligned stores are used). Only baseline SSE is needed.
void gen_fx_fy_sse(const float &dx, const float &dy, float *fx, float *fy) {
    const __m128 k1 = _mm_set1_ps(1.0f);
    const __m128 k2 = _mm_set1_ps(2.0f);
    const __m128 k4 = _mm_set1_ps(4.0f);
    const __m128 k5 = _mm_set1_ps(5.0f);
    const __m128 k8 = _mm_set1_ps(8.0f);

    // Lanes: (-dx, +dx, -dy, +dy). Adding this to a base vector subtracts in
    // the even lanes and adds in the odd lanes; a + (-b) is exactly a - b in
    // IEEE arithmetic, so no SSE3 add/sub instruction is required.
    const __m128 offs = _mm_setr_ps(-dx, dx, -dy, dy);

    // Inner taps, distance in [0, 1]: d = (1-dx, dx, 1-dy, dy).
    const __m128 d_in = _mm_add_ps(_mm_setr_ps(1.0f, 0.0f, 1.0f, 0.0f), offs);
    const __m128 d_in2 = _mm_mul_ps(d_in, d_in);
    const __m128 d_in3 = _mm_mul_ps(d_in2, d_in);
    __m128 w_in = _mm_sub_ps(k1, _mm_mul_ps(k2, d_in2));  // 1 - 2 d^2
    w_in = _mm_add_ps(w_in, d_in3);                       //   + d^3

    // Outer taps, distance in [1, 2]: d = (2-dx, 1+dx, 2-dy, 1+dy).
    const __m128 d_out = _mm_add_ps(_mm_setr_ps(2.0f, 1.0f, 2.0f, 1.0f), offs);
    const __m128 d_out2 = _mm_mul_ps(d_out, d_out);
    const __m128 d_out3 = _mm_mul_ps(d_out, d_out2);
    __m128 w_out = _mm_sub_ps(k4, _mm_mul_ps(k8, d_out));   // 4 - 8 d
    w_out = _mm_add_ps(w_out, _mm_mul_ps(k5, d_out2));      //   + 5 d^2
    w_out = _mm_sub_ps(w_out, d_out3);                      //   - d^3

    // Gather the x lanes and the y lanes of both results:
    //   wx = (w(1-dx), w(dx),   w(2-dx), w(1+dx))
    //   wy = (w(2-dy), w(1+dy), w(1-dy), w(dy))
    const __m128 wx = _mm_movelh_ps(w_in, w_out);
    const __m128 wy = _mm_movehl_ps(w_in, w_out);

    // Reorder each vector into tap order (1+d, d, 1-d, 2-d) and store.
    _mm_storeu_ps(fx, _mm_shuffle_ps(wx, wx, _MM_SHUFFLE(2, 0, 1, 3)));
    _mm_storeu_ps(fy, _mm_shuffle_ps(wy, wy, _MM_SHUFFLE(0, 2, 3, 1)));
}
[0回]