template <precision P>

struct compute_dot

Defined at line 64 of file ../../third_party/glm/glm/gtc/quaternion_simd.inl

template

<precision

struct compute_quat_mul

<float

, P, true>

{

static tquat

<float

, P> call(tquat

<float

, P> const

q1, tquat

<float

, P> const

q2)

{

// SSE2 STATS: 11 shuffle, 8 mul, 8 add

// SSE4 STATS: 3 shuffle, 4 mul, 4 dpps

__m128 const mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));

__m128 const mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));

__m128 const mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));

__m128 const mul3 = _mm_mul_ps(q1.Data, q2.Data);

# if GLM_ARCH

GLM_ARCH_SSE41_BIT

__m128 const add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);

__m128 const add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);

__m128 const add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);

__m128 const add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);

# else

__m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));

__m128 const add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul4, mul4));

__m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

__m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));

__m128 const add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul5, mul5));

__m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

__m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));

__m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));

__m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

__m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));

__m128 const add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul7, mul7));

__m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));

#endif

// This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than

// the final code below. I'll keep this here for reference - maybe somebody else can do something better...

//__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));

//__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));

//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));

tquat

<float

, P> Result(uninitialize);

_mm_store_ss(

&Result

.x, add4);

_mm_store_ss(

&Result

.y, add5);

_mm_store_ss(

&Result

.z, add6);

_mm_store_ss(

&Result

.w, add7);

return Result;

}

};

Public Methods

float call (const tquat<float, P> & x, const tquat<float, P> & y)

Defined at line 67 of file ../../third_party/glm/glm/gtc/quaternion_simd.inl