Optimization

From TDN

This is a stub page for TGE optimization tips and tricks. I'll get things started by posting some SSE-optimized math functions.

// Transforms a 4-component point by a 4x4 matrix: presult = m * p.
//
//   m        16 floats, read as four consecutive rows of four (m[0..3] = row 0)
//   p        4 input floats
//   presult  receives 4 floats; it is written only after m and p have been
//            fully loaded, so it may alias either input
//
// Every memory access is a movups, so none of the pointers has to be 16-byte
// aligned. Each output component is accumulated as
//   (r[0]*p[0] + r[2]*p[2]) + (r[1]*p[1] + r[3]*p[3])
// for the matching row r.
//
// Lane comments list register contents from lane 0 (lowest) upward. a/b/c/d
// are the per-lane products of rows 0..3 with p, and "a02" means a0 + a2.
// Written for the MSVC x86 inline assembler; uses eax, ecx, edx, xmm0-xmm4.
void SSE_matF_x_point4F(const F32 *m, const F32 *p, F32 *presult)
{
	PROFILE_START(SSE_matF_x_point4F);

	__asm
	{
		mov    ecx, m
		mov    edx, p
		mov    eax, presult

		movups xmm4, [edx]          // (p0, p1, p2, p3)
		movups xmm0, [ecx]          // row 0
		movups xmm1, [ecx + 16]     // row 1
		movups xmm2, [ecx + 32]     // row 2
		movups xmm3, [ecx + 48]     // row 3
		mulps  xmm0, xmm4           // (a0, a1, a2, a3)
		mulps  xmm1, xmm4           // (b0, b1, b2, b3)
		mulps  xmm2, xmm4           // (c0, c1, c2, c3)
		mulps  xmm3, xmm4           // (d0, d1, d2, d3)

		// Fold the upper half of each row onto its lower half, two rows per register.
		movaps  xmm4, xmm0          // keep a copy of a; movlhps overwrites its top half
		movlhps xmm0, xmm1          // (a0, a1, b0, b1)
		movhlps xmm1, xmm4          // (a2, a3, b2, b3)
		addps   xmm0, xmm1          // (a02, a13, b02, b13)

		movaps  xmm4, xmm2          // keep a copy of c
		movlhps xmm2, xmm3          // (c0, c1, d0, d1)
		movhlps xmm3, xmm4          // (c2, c3, d2, d3)
		addps   xmm2, xmm3          // (c02, c13, d02, d13)

		// Gather the even-lane and odd-lane partial sums, then add them.
		movaps xmm1, xmm0
		shufps xmm0, xmm2, 0x88     // (a02, b02, c02, d02)
		shufps xmm1, xmm2, 0xDD     // (a13, b13, c13, d13)
		addps  xmm0, xmm1           // (row0.p, row1.p, row2.p, row3.p)

		movups [eax], xmm0
	}

	PROFILE_END(SSE_matF_x_point4F);
}

void SSE_matF_x_point3F(const F32 *m, const F32 *p, F32 *presult)
{
	PROFILE_START(SSE_matF_x_point3F);
	F32 p4[4], presult4[4];
	U32 size = sizeof(F32) * 3;
	dMemcpy(p4, p, size);
	p4[3] = 1.f;
	static U32 zero[4] = {0, 0, 0, 0};

	__asm
	{
		mov ecx, m
		movups xmm0, p4
		movups xmm1, [ecx]
		mulps  xmm1, xmm0
		movups xmm2, [ecx + 16]
		mulps xmm2, xmm0
		movups xmm3, [ecx + 32]
		mulps xmm3, xmm0
		movups xmm4, zero
		movaps xmm0, xmm1
		movlhps xmm1, xmm2
		movhlps xmm2, xmm0
		movaps xmm0, xmm3
		movlhps xmm3, xmm4
		movhlps xmm4, xmm0
		addps xmm1, xmm2  //6240 7351 1133 0022
		addps xmm3, xmm4  //120142 155131 10280 11391
		movaps xmm2, xmm1 //6240 7351 1133 0022
		movaps xmm4, xmm3 //120142 155131 10280 11391
		shufps xmm2, xmm2, 0xD4 //6240 1133 1133 0022
		shufps xmm3, xmm3, 0xD4 //120142 10280 10280 11391
		shufps xmm4, xmm4, 0x80 //155131 11391 11391 11391
		shufps xmm1, xmm1, 0x80 //7351 0022 0022 0022
		movhlps xmm2, xmm3 //6240 1133 120142 10280
		movhlps xmm1, xmm4 //7351 0022 155131 11391
		addps xmm1, xmm2 //73516240 00221133 155131120142 1139110280
		shufps xmm1, xmm1, 0x4E  //155131120142 1139110280 73516240 00221133  
		movups p4, xmm1
	}

	dMemcpy(presult, p4, size);

	PROFILE_END(SSE_matF_x_point3F);
}

void SSE_matF_x_vectorF(const F32 *m, const F32 *p, F32 *presult)
{
	PROFILE_START(SSE_matF_x_vectorF);
	F32 p4[4], presult4[4];
	U32 size = sizeof(F32) * 3;
	dMemcpy(p4, p, size);
	p4[3] = 0.f;
	static U32 zero[4] = {0, 0, 0, 0};

	__asm
	{
		mov ecx, m
		movups xmm0, p4
		movups xmm1, [ecx]
		mulps  xmm1, xmm0
		movups xmm2, [ecx + 16]
		mulps xmm2, xmm0
		movups xmm3, [ecx + 32]
		mulps xmm3, xmm0
		movups xmm4, zero
		movaps xmm0, xmm1
		movlhps xmm1, xmm2
		movhlps xmm2, xmm0
		movaps xmm0, xmm3
		movlhps xmm3, xmm4
		movhlps xmm4, xmm0
		addps xmm1, xmm2  //6240 7351 1133 0022
		addps xmm3, xmm4  //120142 155131 10280 11391
		movaps xmm2, xmm1 //6240 7351 1133 0022
		movaps xmm4, xmm3 //120142 155131 10280 11391
		shufps xmm2, xmm2, 0xD4 //6240 1133 1133 0022
		shufps xmm3, xmm3, 0xD4 //120142 10280 10280 11391
		shufps xmm4, xmm4, 0x80 //155131 11391 11391 11391
		shufps xmm1, xmm1, 0x80 //7351 0022 0022 0022
		movhlps xmm2, xmm3 //6240 1133 120142 10280
		movhlps xmm1, xmm4 //7351 0022 155131 11391
		addps xmm1, xmm2 //73516240 00221133 155131120142 1139110280
		shufps xmm1, xmm1, 0x4E  //155131120142 1139110280 73516240 00221133  
		movups p4, xmm1
	}

	dMemcpy(presult, p4, size);

	PROFILE_END(SSE_matF_x_vectorF);
}

// Scales the 3-component vector at val, in place, by the reciprocal of its
// length.
//
//   val  3 floats, read and then overwritten
//
// The reciprocal length comes from rsqrtps, which is a hardware approximation,
// so the result is only approximately unit length.
//
// A zero-length input is returned as (0, 0, 0). Without the mask below the
// scale would be rsqrtps(0) = +inf and every component would become
// 0 * inf = NaN.
//
// Lane comments list register contents from lane 0 (lowest) upward.
// Written for the MSVC x86 inline assembler; uses xmm0-xmm3.
void SSE_point3F_normalize(F32 *val)
{
	PROFILE_START(SSE_point3F_normalize);

	const U32 copyBytes = sizeof(F32) * 3;

	// Pad to four lanes with w = 0 so the fourth lane adds nothing to the sum.
	F32 vec4[4];
	dMemcpy(vec4, val, copyBytes);
	vec4[3] = 0.f;

	__asm {
		movups  xmm0, vec4
		movaps  xmm2, xmm0          // keep (x, y, z, 0) for the final scale
		mulps   xmm0, xmm0          // (xx, yy, zz, 0)

		// Horizontal add: leave xx + yy + zz in every lane.
		movaps  xmm1, xmm0
		shufps  xmm0, xmm1, 0x4E    // (zz, 0, xx, yy)
		addps   xmm0, xmm1          // (xx+zz, yy, xx+zz, yy)
		movaps  xmm1, xmm0
		shufps  xmm1, xmm1, 0x11    // (yy, xx+zz, yy, xx+zz)
		addps   xmm0, xmm1          // squared length in all four lanes

		// Build an all-ones mask where the squared length is non-zero.
		xorps   xmm3, xmm3
		cmpps   xmm3, xmm0, 4       // predicate 4 = not-equal

		rsqrtps xmm0, xmm0          // approximate 1 / length
		andps   xmm0, xmm3          // force the scale to 0 for a zero-length input
		mulps   xmm2, xmm0
		movups  vec4, xmm2
	}

	dMemcpy(val, vec4, copyBytes);

	PROFILE_END(SSE_point3F_normalize);
}

// Returns the Euclidean length of the 3-component vector at p.
//
//   p  3 input floats (not modified)
//
// The squares are summed as (x*x + y*y) + z*z and the square root is taken
// with sqrtss.
// Lane comments list register contents from lane 0 (lowest) upward.
// Written for the MSVC x86 inline assembler; uses xmm0-xmm2.
F32 SSE_point3F_len(const F32 *p)
{
	PROFILE_START(SSE_point3F_len);

	// Pad to four lanes so the vector can be loaded with a single movups.
	F32 vec4[4];
	dMemcpy(vec4, p, sizeof(F32) * 3);
	vec4[3] = 0.f;

	F32 length;

	__asm
	{
		movups xmm0, vec4
		mulps  xmm0, xmm0           // (xx, yy, zz, 0)

		movaps xmm1, xmm0
		movaps xmm2, xmm0
		shufps xmm1, xmm1, 0x55     // yy in every lane
		shufps xmm2, xmm2, 0xAA     // zz in every lane

		addss  xmm0, xmm1           // lane 0 = xx + yy
		addss  xmm0, xmm2           // lane 0 = xx + yy + zz
		sqrtss xmm0, xmm0
		movss  length, xmm0
	}

	PROFILE_END(SSE_point3F_len);

	return length;
}

// Returns the squared Euclidean length of the 3-component vector at p.
//
//   p  3 input floats (not modified)
//
// The squares are summed as (x*x + y*y) + z*z.
// The scratch buffer is a plain F32[4], as in SSE_point3F_len, so the routine
// does not depend on the memory layout of any point class.
// Lane comments list register contents from lane 0 (lowest) upward.
// Written for the MSVC x86 inline assembler; uses xmm0-xmm1.
F32 SSE_point3F_lenSquared(const F32 *p)
{
	PROFILE_START(SSE_point3F_lenSquared);
	F32 result;

	// Pad to four lanes so the vector can be loaded with a single movups.
	F32 p4[4];
	dMemcpy(p4, p, sizeof(F32) * 3);
	p4[3] = 0.f;

	__asm
	{
		movups xmm0, p4
		mulps  xmm0, xmm0           // (xx, yy, zz, 0)
		movaps xmm1, xmm0           // (xx, yy, zz, 0)
		shufps xmm1, xmm1, 0xE5     // (yy, yy, zz, 0)
		addss  xmm0, xmm1           // lane 0 = xx + yy
		shufps xmm1, xmm1, 0xE6     // (zz, yy, zz, 0)
		addss  xmm0, xmm1           // lane 0 = xx + yy + zz
		movss  result, xmm0
	}

	PROFILE_END(SSE_point3F_lenSquared);
	return result;
}