Optimization
From TDN
This is a stub page for TGE optimization tips and tricks. I'll get things started by posting some SSE-optimized math functions.
// Multiplies a 16-float matrix by a 4-float point using SSE:
//    presult[i] = m[4*i+0]*p[0] + m[4*i+1]*p[1] + m[4*i+2]*p[2] + m[4*i+3]*p[3]   (i = 0..3)
//
// m       - 16 floats, read as four groups of four at byte offsets 0, 16, 32, 48
// p       - 4 floats
// presult - receives 4 floats
//
// All loads and the store use movups, so none of the pointers has to be
// 16-byte aligned. Every load happens before the single store, so presult may
// point at the same memory as p.
//
// Notation used in the lane comments below (lanes are listed low -> high):
//    a_j = m[j]*p[j]     b_j = m[4+j]*p[j]     c_j = m[8+j]*p[j]     d_j = m[12+j]*p[j]
//    Xjk = x_j + x_k     (e.g. A02 = a0 + a2),  A = a0+a1+a2+a3 = presult[0], etc.
void SSE_matF_x_point4F(const F32 *m, const F32 *p, F32 *presult)
{
PROFILE_START(SSE_matF_x_point4F);
__asm
{
mov edx, p
mov ecx, m
mov eax, presult
movups xmm0, [edx] // xmm0 = [p0  p1  p2  p3]
movups xmm1, [ecx]
mulps xmm1, xmm0 // xmm1 = [a0  a1  a2  a3]
movups xmm2, [ecx + 16]
mulps xmm2, xmm0 // xmm2 = [b0  b1  b2  b3]
movups xmm3, [ecx + 32]
mulps xmm3, xmm0 // xmm3 = [c0  c1  c2  c3]
movups xmm4, [ecx + 48]
mulps xmm4, xmm0 // xmm4 = [d0  d1  d2  d3]
// Interleave the halves of the product vectors so one addps sums two of them at once.
movaps xmm0, xmm1 // keep a copy of a (xmm1 is overwritten next)
movlhps xmm1, xmm2 // xmm1 = [a0  a1  b0  b1]
movhlps xmm2, xmm0 // xmm2 = [a2  a3  b2  b3]
movaps xmm0, xmm3 // keep a copy of c
movlhps xmm3, xmm4 // xmm3 = [c0  c1  d0  d1]
movhlps xmm4, xmm0 // xmm4 = [c2  c3  d2  d3]
addps xmm1, xmm2 // xmm1 = [A02 A13 B02 B13]
addps xmm3, xmm4 // xmm3 = [C02 C13 D02 D13]
movaps xmm2, xmm1 // xmm2 = [A02 A13 B02 B13]
movaps xmm4, xmm3 // xmm4 = [C02 C13 D02 D13]
// Separate the "13" partial sums (xmm2/xmm3) from the "02" partial sums (xmm1/xmm4);
// only the upper two lanes of xmm2/xmm1 and of xmm3/xmm4 matter after these shuffles.
shufps xmm2, xmm2, 0xD4 // xmm2 = [A02 A13 A13 B13]
shufps xmm3, xmm3, 0xD4 // xmm3 = [C02 C13 C13 D13]
shufps xmm4, xmm4, 0x80 // xmm4 = [C02 C02 C02 D02]
shufps xmm1, xmm1, 0x80 // xmm1 = [A02 A02 A02 B02]
movhlps xmm2, xmm3 // xmm2 = [C13 D13 A13 B13]
movhlps xmm1, xmm4 // xmm1 = [C02 D02 A02 B02]
addps xmm1, xmm2 // xmm1 = [C   D   A   B  ]
shufps xmm1, xmm1, 0x4E // swap 64-bit halves: xmm1 = [A   B   C   D  ]
movups [eax], xmm1 // presult[0..3] = A, B, C, D
}
PROFILE_END(SSE_matF_x_point4F);
}
// Transforms a 3-float point by a matrix using SSE, treating the point as
// (p[0], p[1], p[2], 1):
//    presult[i] = m[4*i+0]*p[0] + m[4*i+1]*p[1] + m[4*i+2]*p[2] + m[4*i+3]   (i = 0..2)
//
// m       - only the first 12 floats (byte offsets 0, 16, 32) are read
// p       - 3 floats
// presult - receives 3 floats
//
// The input is copied into a local 4-float buffer before anything is written
// to presult, and all vector loads use movups, so no pointer needs to be
// 16-byte aligned. (Assumes dMemcpy behaves like memcpy - confirm.)
//
// Notation used in the lane comments below (lanes are listed low -> high):
//    a_j = m[j]*p4[j]     b_j = m[4+j]*p4[j]     c_j = m[8+j]*p4[j]
//    Xjk = x_j + x_k      (e.g. A02 = a0 + a2),  A = a0+a1+a2+a3 = presult[0], etc.
void SSE_matF_x_point3F(const F32 *m, const F32 *p, F32 *presult)
{
   PROFILE_START(SSE_matF_x_point3F);

   // Pad the point out to four lanes; w = 1 so that m[4*i+3] is added in.
   F32 p4[4];
   U32 size = sizeof(F32) * 3;
   dMemcpy(p4, p, size);
   p4[3] = 1.f;

   __asm
   {
      mov     ecx, m
      movups  xmm0, p4                // xmm0 = [p0  p1  p2  1 ]
      movups  xmm1, [ecx]
      mulps   xmm1, xmm0              // xmm1 = [a0  a1  a2  a3]
      movups  xmm2, [ecx + 16]
      mulps   xmm2, xmm0              // xmm2 = [b0  b1  b2  b3]
      movups  xmm3, [ecx + 32]
      mulps   xmm3, xmm0              // xmm3 = [c0  c1  c2  c3]
      xorps   xmm4, xmm4              // xmm4 = [0   0   0   0 ]  (no fourth output lane is needed)

      // Interleave halves so one addps sums two product vectors at once.
      movaps  xmm0, xmm1              // keep a copy of a
      movlhps xmm1, xmm2              // xmm1 = [a0  a1  b0  b1]
      movhlps xmm2, xmm0              // xmm2 = [a2  a3  b2  b3]
      movaps  xmm0, xmm3              // keep a copy of c
      movlhps xmm3, xmm4              // xmm3 = [c0  c1  0   0 ]
      movhlps xmm4, xmm0              // xmm4 = [c2  c3  0   0 ]
      addps   xmm1, xmm2              // xmm1 = [A02 A13 B02 B13]
      addps   xmm3, xmm4              // xmm3 = [C02 C13 0   0  ]
      movaps  xmm2, xmm1              // xmm2 = [A02 A13 B02 B13]
      movaps  xmm4, xmm3              // xmm4 = [C02 C13 0   0  ]

      // Separate the "13" partial sums (xmm2/xmm3) from the "02" ones (xmm1/xmm4).
      shufps  xmm2, xmm2, 0xD4        // xmm2 = [A02 A13 A13 B13]
      shufps  xmm3, xmm3, 0xD4        // xmm3 = [C02 C13 C13 0  ]
      shufps  xmm4, xmm4, 0x80        // xmm4 = [C02 C02 C02 0  ]
      shufps  xmm1, xmm1, 0x80        // xmm1 = [A02 A02 A02 B02]
      movhlps xmm2, xmm3              // xmm2 = [C13 0   A13 B13]
      movhlps xmm1, xmm4              // xmm1 = [C02 0   A02 B02]
      addps   xmm1, xmm2              // xmm1 = [C   0   A   B  ]
      shufps  xmm1, xmm1, 0x4E        // swap 64-bit halves: xmm1 = [A   B   C   0  ]
      movups  p4, xmm1                // reuse the local buffer for the result
   }

   // Only the first three lanes are handed back to the caller.
   dMemcpy(presult, p4, size);
   PROFILE_END(SSE_matF_x_point3F);
}
// Transforms a 3-float direction vector by a matrix using SSE, treating the
// vector as (p[0], p[1], p[2], 0), so the m[4*i+3] column contributes nothing
// for finite matrix entries:
//    presult[i] = m[4*i+0]*p[0] + m[4*i+1]*p[1] + m[4*i+2]*p[2]   (i = 0..2)
//
// m       - only the first 12 floats (byte offsets 0, 16, 32) are read
// p       - 3 floats
// presult - receives 3 floats
//
// The input is copied into a local 4-float buffer before anything is written
// to presult, and all vector loads use movups, so no pointer needs to be
// 16-byte aligned. (Assumes dMemcpy behaves like memcpy - confirm.)
//
// Notation used in the lane comments below (lanes are listed low -> high):
//    a_j = m[j]*p4[j]     b_j = m[4+j]*p4[j]     c_j = m[8+j]*p4[j]
//    Xjk = x_j + x_k      (e.g. A02 = a0 + a2),  A = a0+a1+a2+a3 = presult[0], etc.
void SSE_matF_x_vectorF(const F32 *m, const F32 *p, F32 *presult)
{
   PROFILE_START(SSE_matF_x_vectorF);

   // Pad the vector out to four lanes; w = 0 cancels the fourth matrix column.
   F32 p4[4];
   U32 size = sizeof(F32) * 3;
   dMemcpy(p4, p, size);
   p4[3] = 0.f;

   __asm
   {
      mov     ecx, m
      movups  xmm0, p4                // xmm0 = [p0  p1  p2  0 ]
      movups  xmm1, [ecx]
      mulps   xmm1, xmm0              // xmm1 = [a0  a1  a2  a3]
      movups  xmm2, [ecx + 16]
      mulps   xmm2, xmm0              // xmm2 = [b0  b1  b2  b3]
      movups  xmm3, [ecx + 32]
      mulps   xmm3, xmm0              // xmm3 = [c0  c1  c2  c3]
      xorps   xmm4, xmm4              // xmm4 = [0   0   0   0 ]  (no fourth output lane is needed)

      // Interleave halves so one addps sums two product vectors at once.
      movaps  xmm0, xmm1              // keep a copy of a
      movlhps xmm1, xmm2              // xmm1 = [a0  a1  b0  b1]
      movhlps xmm2, xmm0              // xmm2 = [a2  a3  b2  b3]
      movaps  xmm0, xmm3              // keep a copy of c
      movlhps xmm3, xmm4              // xmm3 = [c0  c1  0   0 ]
      movhlps xmm4, xmm0              // xmm4 = [c2  c3  0   0 ]
      addps   xmm1, xmm2              // xmm1 = [A02 A13 B02 B13]
      addps   xmm3, xmm4              // xmm3 = [C02 C13 0   0  ]
      movaps  xmm2, xmm1              // xmm2 = [A02 A13 B02 B13]
      movaps  xmm4, xmm3              // xmm4 = [C02 C13 0   0  ]

      // Separate the "13" partial sums (xmm2/xmm3) from the "02" ones (xmm1/xmm4).
      shufps  xmm2, xmm2, 0xD4        // xmm2 = [A02 A13 A13 B13]
      shufps  xmm3, xmm3, 0xD4        // xmm3 = [C02 C13 C13 0  ]
      shufps  xmm4, xmm4, 0x80        // xmm4 = [C02 C02 C02 0  ]
      shufps  xmm1, xmm1, 0x80        // xmm1 = [A02 A02 A02 B02]
      movhlps xmm2, xmm3              // xmm2 = [C13 0   A13 B13]
      movhlps xmm1, xmm4              // xmm1 = [C02 0   A02 B02]
      addps   xmm1, xmm2              // xmm1 = [C   0   A   B  ]
      shufps  xmm1, xmm1, 0x4E        // swap 64-bit halves: xmm1 = [A   B   C   0  ]
      movups  p4, xmm1                // reuse the local buffer for the result
   }

   // Only the first three lanes are handed back to the caller.
   dMemcpy(presult, p4, size);
   PROFILE_END(SSE_matF_x_vectorF);
}
// Normalizes the 3-float vector at val in place using SSE.
//
// Each component is multiplied by rsqrtps(x*x + y*y + z*z). rsqrtps is the
// hardware's *approximate* reciprocal square root, so the result is only
// approximately unit length.
//
// Edge case: when the squared length is exactly zero (a zero vector, or
// components so small that their squares underflow), rsqrtps yields +inf and
// the multiply would produce NaN or inf. The scale factor is masked to 0 in
// that case, so the vector comes back as zeros instead.
//
// val - 3 floats, read and then overwritten; no alignment requirement
//       (the data is staged through a local buffer and accessed with movups).
void SSE_point3F_normalize(F32 *val)
{
   PROFILE_START(SSE_point3F_normalize);

   // Pad to four lanes; w = 0 so it adds nothing to the squared length.
   F32 p[4];
   U32 size = sizeof(F32) * 3;
   dMemcpy(p, val, size);
   p[3] = 0.f;

   // Lane comments list lanes low -> high; S = xx + yy + zz.
   __asm
   {
      movups  xmm0, p                 // xmm0 = [x    y    z    0   ]
      movaps  xmm2, xmm0              // xmm2 = original vector, scaled at the end
      mulps   xmm0, xmm0              // xmm0 = [xx   yy   zz   0   ]
      movaps  xmm1, xmm0
      shufps  xmm0, xmm1, 0x4E        // xmm0 = [zz   0    xx   yy  ]
      addps   xmm0, xmm1              // xmm0 = [xx+zz yy  xx+zz yy ]
      movaps  xmm1, xmm0
      shufps  xmm1, xmm1, 0x11        // xmm1 = [yy  xx+zz yy  xx+zz]
      addps   xmm0, xmm1              // xmm0 = [S    S    S    S   ]

      // Build an all-ones mask where S != 0 (predicate 4 = NEQ; a NaN S also
      // yields all-ones, so NaN inputs still propagate as before).
      xorps   xmm3, xmm3
      cmpps   xmm3, xmm0, 4

      rsqrtps xmm0, xmm0              // xmm0 ~= 1/sqrt(S) in every lane (+inf when S == 0)
      andps   xmm0, xmm3              // force the scale to 0.0 when S == 0
      mulps   xmm2, xmm0              // scale the original components
      movups  p, xmm2
   }

   // Only x, y, z go back to the caller.
   dMemcpy(val, p, size);
   PROFILE_END(SSE_point3F_normalize);
}
// Returns the Euclidean length sqrt(x*x + y*y + z*z) of the 3-float vector p,
// computed with scalar SSE adds and sqrtss.
//
// p - 3 floats; no alignment requirement (the data is staged through a local
//     4-float buffer and loaded with movups).
F32 SSE_point3F_len(const F32 *p)
{
   PROFILE_START(SSE_point3F_len);

   F32 length;

   // Pad to four lanes so a full 128-bit load is safe; the w lane is unused.
   F32 padded[4];
   dMemcpy(padded, p, sizeof(F32) * 3);
   padded[3] = 0.f;

   // Lane comments list lanes low -> high; only lane 0 of xmm0 carries the sum.
   __asm
   {
      movups  xmm0, padded            // xmm0 = [x     y   z   0 ]
      mulps   xmm0, xmm0              // xmm0 = [xx    yy  zz  0 ]
      movaps  xmm1, xmm0
      shufps  xmm1, xmm1, 0x55        // xmm1 = [yy    yy  yy  yy]
      addss   xmm0, xmm1              // xmm0 = [xx+yy yy  zz  0 ]
      movhlps xmm1, xmm0              // xmm1 = [zz    0   yy  yy]
      addss   xmm0, xmm1              // lane 0 = (xx + yy) + zz
      sqrtss  xmm0, xmm0              // lane 0 = sqrt(xx + yy + zz)
      movss   length, xmm0
   }

   PROFILE_END(SSE_point3F_len);
   return length;
}
// Returns the squared length x*x + y*y + z*z of the 3-float vector p,
// computed with scalar SSE adds (no square root is taken).
//
// p - 3 floats; no alignment requirement (the data is staged through a local
//     Point4F and loaded with movups).
F32 SSE_point3F_lenSquared(const F32 *p)
{
   PROFILE_START(SSE_point3F_lenSquared);

   F32 lengthSq;

   // Pad to four lanes so a full 128-bit load is safe; the w lane is unused.
   Point4F padded;
   dMemcpy(&padded, p, sizeof(F32) * 3);
   padded.w = 0.f;

   // Lane comments list lanes low -> high; only lane 0 of xmm0 carries the sum.
   __asm
   {
      movups  xmm0, padded            // xmm0 = [x     y   z   0 ]
      mulps   xmm0, xmm0              // xmm0 = [xx    yy  zz  0 ]
      movaps  xmm1, xmm0
      shufps  xmm1, xmm1, 0x55        // xmm1 = [yy    yy  yy  yy]
      addss   xmm0, xmm1              // xmm0 = [xx+yy yy  zz  0 ]
      movhlps xmm1, xmm0              // xmm1 = [zz    0   yy  yy]
      addss   xmm0, xmm1              // lane 0 = (xx + yy) + zz
      movss   lengthSq, xmm0
   }

   PROFILE_END(SSE_point3F_lenSquared);
   return lengthSq;
}



