Optimization
From TDN
This is a stub page for TGE optimization tips and tricks. I'll get things started by posting some SSE-optimized math functions.
void SSE_matF_x_point4F(const F32 *m, const F32 *p, F32 *presult) { PROFILE_START(SSE_matF_x_point4F); __asm { mov edx, p mov ecx, m mov eax, presult movups xmm0, [edx] movups xmm1, [ecx] mulps xmm1, xmm0 movups xmm2, [ecx + 16] mulps xmm2, xmm0 movups xmm3, [ecx + 32] mulps xmm3, xmm0 movups xmm4, [ecx + 48] mulps xmm4, xmm0 movaps xmm0, xmm1 movlhps xmm1, xmm2 movhlps xmm2, xmm0 movaps xmm0, xmm3 movlhps xmm3, xmm4 movhlps xmm4, xmm0 addps xmm1, xmm2 //6240 7351 1133 0022 addps xmm3, xmm4 //120142 155131 10280 11391 movaps xmm2, xmm1 //6240 7351 1133 0022 movaps xmm4, xmm3 //120142 155131 10280 11391 shufps xmm2, xmm2, 0xD4 //6240 1133 1133 0022 shufps xmm3, xmm3, 0xD4 //120142 10280 10280 11391 shufps xmm4, xmm4, 0x80 //155131 11391 11391 11391 shufps xmm1, xmm1, 0x80 //7351 0022 0022 0022 movhlps xmm2, xmm3 //6240 1133 120142 10280 movhlps xmm1, xmm4 //7351 0022 155131 11391 addps xmm1, xmm2 //73516240 00221133 155131120142 1139110280 shufps xmm1, xmm1, 0x4E //155131120142 1139110280 73516240 00221133 movups [eax], xmm1 } PROFILE_END(SSE_matF_x_point4F); } void SSE_matF_x_point3F(const F32 *m, const F32 *p, F32 *presult) { PROFILE_START(SSE_matF_x_point3F); F32 p4[4], presult4[4]; U32 size = sizeof(F32) * 3; dMemcpy(p4, p, size); p4[3] = 1.f; static U32 zero[4] = {0, 0, 0, 0}; __asm { mov ecx, m movups xmm0, p4 movups xmm1, [ecx] mulps xmm1, xmm0 movups xmm2, [ecx + 16] mulps xmm2, xmm0 movups xmm3, [ecx + 32] mulps xmm3, xmm0 movups xmm4, zero movaps xmm0, xmm1 movlhps xmm1, xmm2 movhlps xmm2, xmm0 movaps xmm0, xmm3 movlhps xmm3, xmm4 movhlps xmm4, xmm0 addps xmm1, xmm2 //6240 7351 1133 0022 addps xmm3, xmm4 //120142 155131 10280 11391 movaps xmm2, xmm1 //6240 7351 1133 0022 movaps xmm4, xmm3 //120142 155131 10280 11391 shufps xmm2, xmm2, 0xD4 //6240 1133 1133 0022 shufps xmm3, xmm3, 0xD4 //120142 10280 10280 11391 shufps xmm4, xmm4, 0x80 //155131 11391 11391 11391 shufps xmm1, xmm1, 0x80 //7351 0022 0022 0022 movhlps xmm2, xmm3 //6240 1133 120142 10280 
movhlps xmm1, xmm4 //7351 0022 155131 11391 addps xmm1, xmm2 //73516240 00221133 155131120142 1139110280 shufps xmm1, xmm1, 0x4E //155131120142 1139110280 73516240 00221133 movups p4, xmm1 } dMemcpy(presult, p4, size); PROFILE_END(SSE_matF_x_point3F); } void SSE_matF_x_vectorF(const F32 *m, const F32 *p, F32 *presult) { PROFILE_START(SSE_matF_x_vectorF); F32 p4[4], presult4[4]; U32 size = sizeof(F32) * 3; dMemcpy(p4, p, size); p4[3] = 0.f; static U32 zero[4] = {0, 0, 0, 0}; __asm { mov ecx, m movups xmm0, p4 movups xmm1, [ecx] mulps xmm1, xmm0 movups xmm2, [ecx + 16] mulps xmm2, xmm0 movups xmm3, [ecx + 32] mulps xmm3, xmm0 movups xmm4, zero movaps xmm0, xmm1 movlhps xmm1, xmm2 movhlps xmm2, xmm0 movaps xmm0, xmm3 movlhps xmm3, xmm4 movhlps xmm4, xmm0 addps xmm1, xmm2 //6240 7351 1133 0022 addps xmm3, xmm4 //120142 155131 10280 11391 movaps xmm2, xmm1 //6240 7351 1133 0022 movaps xmm4, xmm3 //120142 155131 10280 11391 shufps xmm2, xmm2, 0xD4 //6240 1133 1133 0022 shufps xmm3, xmm3, 0xD4 //120142 10280 10280 11391 shufps xmm4, xmm4, 0x80 //155131 11391 11391 11391 shufps xmm1, xmm1, 0x80 //7351 0022 0022 0022 movhlps xmm2, xmm3 //6240 1133 120142 10280 movhlps xmm1, xmm4 //7351 0022 155131 11391 addps xmm1, xmm2 //73516240 00221133 155131120142 1139110280 shufps xmm1, xmm1, 0x4E //155131120142 1139110280 73516240 00221133 movups p4, xmm1 } dMemcpy(presult, p4, size); PROFILE_END(SSE_matF_x_vectorF); } void SSE_point3F_normalize(F32 *val) { PROFILE_START(SSE_point3F_normalize); F32 p[4]; U32 size = sizeof(F32) * 3; dMemcpy(p, val, size); p[3] = 0.f; __asm { movups xmm0, p movaps xmm2, xmm0 mulps xmm0, xmm0 movaps xmm1, xmm0 shufps xmm0, xmm1, 0x4e addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, 0x11 addps xmm0, xmm1 rsqrtps xmm0, xmm0 mulps xmm2, xmm0 movups p, xmm2 } dMemcpy(val, p, size); PROFILE_END(SSE_point3F_normalize); } F32 SSE_point3F_len(const F32 *p) { PROFILE_START(SSE_point3F_len); F32 result; F32 p4[4]; dMemcpy(p4, p, sizeof(F32) * 3); p4[3] = 
0.f; __asm { movups xmm0, p4 mulps xmm0, xmm0 ;(w, z, y, x) movaps xmm1, xmm0 ;(w, z, y, x) shufps xmm1, xmm1, 0xE5 ;(w, z, y, y) addss xmm0, xmm1 shufps xmm1, xmm1, 0xE6 ;(w, z, y, z) addss xmm0, xmm1 sqrtss xmm0, xmm0 movss result, xmm0 } PROFILE_END(SSE_point3F_len); return result; } F32 SSE_point3F_lenSquared(const F32 *p) { PROFILE_START(SSE_point3F_lenSquared); F32 result; Point4F p4; dMemcpy(&p4, p, sizeof(F32) * 3); p4.w = 0.f; __asm { movups xmm0, p4 mulps xmm0, xmm0 ;(w, z, y, x) movaps xmm1, xmm0 ;(w, z, y, x) shufps xmm1, xmm1, 0xE5 ;(w, z, y, y) addss xmm0, xmm1 shufps xmm1, xmm1, 0xE6 ;(w, z, y, z) addss xmm0, xmm1 movss result, xmm0 } PROFILE_END(SSE_point3F_lenSquared); return result; }