//-------------------------------------------------------------------------------------
//
// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies.  Intel makes no representations about the
// suitability of this software for any purpose.  THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//

/////////////////////////////////////////////////////////////////////////////
// Based upon:
//
// Approximate Math Library for SSE / SSE2
//  Header File
//  Version 2.0
//  Author Alex Klimovitski, Intel GmbH
/////////////////////////////////////////////////////////////////////////////
#include <emmintrin.h>

#include "AMaths.h"
#include "AMaths_internal.h"

#ifdef AMATHS_ASM

//-------------------------------------------------------------------------------------
// am_sincos_ps
//
// Computes an approximate sine and cosine for each of the four packed single-precision
// lanes of x in one pass. SSE does the floating-point work, MMX does the integer
// (quadrant) work.
//
// In:       xmm0        = x (read straight from the register, never from the stack)
//           [esp + 20]  = s, destination for the four sine results
//           [esp + 24]  = c, destination for the four cosine results
// Out:      *s and *c are written with movaps, so both must be 16-byte aligned.
// Clobbers: eax, ecx, edx, flags, xmm0-xmm7, mm0-mm5, mm7.
//           The MMX state is emptied (emms) before returning.
// Stack:    naked function, no frame. 48 bytes of scratch are used just below the
//           16-byte-aligned address (esp & ~15):
//             [ecx - 48 .. ecx - 33]  cosine sign bits, lanes 0-3
//             [ecx - 32 .. ecx - 17]  sine sign bits,   lanes 0-3
//             [ecx - 16 .. ecx -  1]  "(q & 1) == 0" select mask, lanes 0-3
//           NOTE(review): this scratch lives below esp and esp is never adjusted;
//           confirm nothing in the target environment writes below the stack pointer
//           while this routine runs.
//
// Method (the _ps_am_*, _pi32_* and _ps_sincos_p* constants are defined outside this
// block; their values are assumed to match their names - confirm there):
//   t = |x| * _ps_am_2_o_pi,  q = trunc(t),  f = min(t - q, _ps_am_1)
//   sine   sign = sign(x) XOR bit 1 of q
//   cosine sign = bit 1 of (q + 1)
//   (q & 1) == 0 :  a = f,      b = 1 - f
//   otherwise    :  a = 1 - f,  b = f
//   P(u)   = ((p3 * u + p2) * u + p1) * u + p0
//   *s     = P(a * a) * (a | sine sign)
//   *c     = P(b * b) * (b | cosine sign)
//-------------------------------------------------------------------------------------
void __declspec(naked) __stdcall am_sincos_ps(__m128 x, __m128* s, __m128* c)
{
	__asm
	{
		// --- split x into sign and magnitude, scale the magnitude ---------------
		movaps	xmm7, xmm0
		andps	xmm0, _ps_am_inv_sign_mask	// xmm0 = x with the sign mask inverted (|x|, presumably)
		mov		ecx, esp
		andps	xmm7, _ps_am_sign_mask		// xmm7 = sign bits of x
		mulps	xmm0, _ps_am_2_o_pi			// xmm0 = t
		and		ecx, ~15					// ecx = 16-byte aligned base of the scratch area

		// --- integer part q of t and the two sign masks (MMX, two lanes at a time)
		// mm0 / mm1 = q for lanes 0-1 / 2-3
		// mm2 / mm3 = sine sign bits,   from q     shifted so bit 1 lands in bit 31
		// mm4 / mm5 = cosine sign bits, from q + 1 shifted the same way
		movhlps	xmm2, xmm0					// upper two lanes of t -> low half of xmm2
		movq	mm4, _pi32_1
		cvttps2pi	mm0, xmm0				// q (lanes 0-1), truncating conversion
		movq	mm5, _pi32_1
		movq	mm2, mm0
		paddd	mm4, mm0					// q + _pi32_1 (lanes 0-1)
		movq	mm7, _pi32_0x80000000
		cvttps2pi	mm1, xmm2				// q (lanes 2-3)
		pslld	mm2, (31 - 1)				// shift left by 30: bit 1 -> bit 31
		pslld	mm4, (31 - 1)
		movq	mm3, mm1
		paddd	mm5, mm1					// q + _pi32_1 (lanes 2-3)
		pslld	mm3, (31 - 1)
		pand	mm2, mm7					// keep only what _pi32_0x80000000 selects
		pslld	mm5, (31 - 1)
		pand	mm4, mm7
		movq	[ecx - 32], mm2				// sine sign, lanes 0-1
		pand	mm3, mm7
		pand	mm5, mm7
		movq	[ecx - 32 + 8], mm3			// sine sign, lanes 2-3
		movq	[ecx - 48], mm4				// cosine sign, lanes 0-1
		movq	[ecx - 48 + 8], mm5			// cosine sign, lanes 2-3

		// --- fractional part f, 1 - f, and the select mask ------------------------
		cvtpi2ps	xmm3, mm1				// low half of xmm3 = (float)q, lanes 2-3
		pand	mm1, _pi32_1
		movaps	xmm4, _ps_am_1
		// Macro defined outside this block; presumably copies the low half of xmm3
		// into its high half so the next cvtpi2ps can fill the low half - confirm.
		ASM_MOVE_L2H(xmm3)
		pxor	mm7, mm7					// mm7 = 0, comparison operand below
		cvtpi2ps	xmm3, mm0				// low half of xmm3 = (float)q, lanes 0-1
		pand	mm0, _pi32_1
		pcmpeqd	mm1, mm7					// all ones where (q & _pi32_1) == 0, lanes 2-3
		subps	xmm0, xmm3					// xmm0 = t - (float)q
		movq	[ecx - 16 + 8], mm1
		minps	xmm0, xmm4					// f = min(t - q, _ps_am_1)
		pcmpeqd	mm0, mm7					// all ones where (q & _pi32_1) == 0, lanes 0-1
		subps	xmm4, xmm0					// xmm4 = _ps_am_1 - f
		movq	[ecx - 16], mm0				// last MMX instruction of the routine

		// --- choose the polynomial arguments per lane -----------------------------
		// mask set:   a = f,      b = 1 - f
		// mask clear: a = 1 - f,  b = f
		movaps	xmm3, [ecx - 16]			// xmm3 = select mask, all four lanes
		movaps	xmm6, xmm4					// xmm6 = 1 - f (kept for the second blend)
		andps	xmm4, xmm3
		movaps	xmm2, xmm3
		andnps	xmm3, xmm0
		andps	xmm0, xmm2
		orps	xmm4, xmm3					// xmm4 = b

		// Fetch the output pointers: 4 bytes of return address plus 16 further bytes
		// are skipped. NOTE(review): x itself is taken from xmm0, so the 16 bytes are
		// presumably a stack slot reserved for it - confirm against the caller.
		mov		eax, [esp + 4 + 16]			// eax = s
		mov		edx, [esp + 4 + 16 + 4]		// edx = c

		andnps	xmm2, xmm6
		orps	xmm0, xmm2					// xmm0 = a

		// --- evaluate P(a*a) and P(b*b) side by side, apply the signs --------------
		// xmm0 / xmm2 / xmm1 : running sine value   / a*a / signed a
		// xmm4 / xmm6 / xmm5 : running cosine value / b*b / signed b
		movaps	xmm3, _ps_sincos_p3
		movaps	xmm1, xmm0
		movaps	xmm5, xmm4
		xorps	xmm7, [ecx - 32]			// sine sign = sign(x) XOR stored sine sign bits
		mulps	xmm0, xmm0
		mulps	xmm4, xmm4
		movaps	xmm2, xmm0
		movaps	xmm6, xmm4
		orps	xmm1, xmm7					// xmm1 = a with the sine sign bits set
		movaps	xmm7, _ps_sincos_p2
		mulps	xmm0, xmm3
		mulps	xmm4, xmm3
		movaps	xmm3, _ps_sincos_p1
		addps	xmm0, xmm7
		addps	xmm4, xmm7
		movaps	xmm7, _ps_sincos_p0
		mulps	xmm0, xmm2
		mulps	xmm4, xmm6
		orps	xmm5, [ecx - 48]			// xmm5 = b with the cosine sign bits set
		addps	xmm0, xmm3
		addps	xmm4, xmm3
		mulps	xmm0, xmm2
		mulps	xmm4, xmm6
		addps	xmm0, xmm7
		addps	xmm4, xmm7
		mulps	xmm0, xmm1					// sine result
		mulps	xmm4, xmm5					// cosine result

		movaps	[eax], xmm0					// *s (aligned store)
		movaps	[edx], xmm4					// *c (aligned store)

		// The MMX registers alias the x87 register file. Without emms the x87 tag
		// word stays marked "in use" after return and any x87 floating-point code the
		// caller runs next can fault or produce NaNs. All MMX results were already
		// written to memory above, so emptying the state here loses nothing.
		emms

		// stdcall: the callee removes 32 bytes of arguments.
		// NOTE(review): presumably 16 for the x slot, 4 + 4 for the two pointers and
		// 8 bytes of padding - confirm against how the compiler lays out the call.
		ret		16 + 4 + 4 + 8
	}
}

#endif