/*
 * DMLib
 * -- Vector and matrix functions
 * Programmed and designed by Matti 'ccr' Hamalainen
 * (C) Copyright 2011-2012 Tecnic Software productions (TNSP)
 */
#include "dmvecmat.h"


void dm_vector_add_n(DMVector *dst, const DMVector *src, const int nlist)
{
    int i;
    for (i = 0; i < nlist; i++)
    {
#ifdef DM_USE_SIMD
    asm("movups      %2,     %%xmm1\n"
        "movups      %1,     %%xmm2\n"
        "addps       %%xmm2, %%xmm1\n"
        "movups      %%xmm1, %0\n"
        : "=m" (dst[i])
        : "m" (dst[i]), "m" (src[i])
        : "memory", "%xmm1", "%xmm2");
#else
        dm_vector_add(dst + i, src + i);
#endif
    }
}


void dm_vector_add_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist)
{
    int i;
    for (i = 0; i < nlist; i++)
    {
#ifdef DM_USE_SIMD
    asm("movups      %2,     %%xmm1\n"
        "movups      %1,     %%xmm2\n"
        "addps       %%xmm2, %%xmm1\n"
        "movups      %%xmm1, %0\n"
        : "=m" (dst[i])
        : "m" (src1[i]), "m" (src2[i])
        : "memory", "%xmm1", "%xmm2");
#else
        dm_vector_add_r(dst + i, src1 + i, src2 + i);
#endif
    }
}


void dm_vector_sub_n(DMVector *dst, const DMVector *src, const int nlist)
{
    int i;
    for (i = 0; i < nlist; i++)
    {
#ifdef DM_USE_SIMD
    asm("movups      %2,     %%xmm1\n"
        "movups      %1,     %%xmm2\n"
        "subps       %%xmm2, %%xmm1\n"
        "movups      %%xmm1, %0\n"
        : "=m" (dst[i])
        : "m" (dst[i]), "m" (src[i])
        : "memory", "%xmm1", "%xmm2");
#else
        dm_vector_add(dst + i, src + i);
#endif
    }
}


void dm_vector_sub_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist)
{
    int i;
    for (i = 0; i < nlist; i++)
    {
#ifdef DM_USE_SIMD
    asm("movups      %2,     %%xmm1\n"
        "movups      %1,     %%xmm2\n"
        "subps       %%xmm2, %%xmm1\n"
        "movups      %%xmm1, %0\n"
        : "=m" (dst[i])
        : "m" (src1[i]), "m" (src2[i])
        : "memory", "%xmm1", "%xmm2");
#else
        dm_vector_sub_r(dst + i, src1 + i, src2 + i);
#endif
    }
}


/* Multiply given vector with a matrix
 */
void dm_vector_mul_by_mat(DMVector *vd, const DMVector *vs, const DMMatrix *mat)
{
#ifdef DM_USE_SIMD
    asm volatile(
        "mov           %1,        %%edx\n"
        "movups        (%%edx),   %%xmm4\n"
        "movups      16(%%edx),   %%xmm5\n"
        "movups      32(%%edx),   %%xmm6\n"
        "movups      48(%%edx),   %%xmm7\n"

        // vector -> xmm0
        "movups      %2,     %%xmm0\n"
        
        // zero final result in xmm2
        "xorps       %%xmm2, %%xmm2\n"

        // perform shuffle and multiply and add whole "column" "X"
        "movups      %%xmm0, %%xmm1\n"
        "shufps      $0x00,  %%xmm1, %%xmm1\n"
        "mulps       %%xmm4, %%xmm1\n"
        "addps       %%xmm1, %%xmm2\n"

        // Y
        "movups      %%xmm0, %%xmm1\n"
        "shufps      $0x55,  %%xmm1, %%xmm1\n"
        "mulps       %%xmm5, %%xmm1\n"
        "addps       %%xmm1, %%xmm2\n"

        // Z
        "movups      %%xmm0, %%xmm1\n"
        "shufps      $0xAA,  %%xmm1, %%xmm1\n"
        "mulps       %%xmm6, %%xmm1\n"
        "addps       %%xmm1, %%xmm2\n"

        // W
        "movups      %%xmm0, %%xmm1\n"
        "shufps      $0xFF,  %%xmm1, %%xmm1\n"
        "mulps       %%xmm7, %%xmm1\n"
        "addps       %%xmm1, %%xmm2\n"

        // Result ->
        "movups      %%xmm2, %0\n"
        : "=m" (vd)
        : "m" (mat), "m" (vs)
        : "memory", "%edx", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
#else
    vd->x = (vs->x * mat->m[0][0]) + (vs->y * mat->m[1][0]) + (vs->z * mat->m[2][0]);
    vd->y = (vs->x * mat->m[0][1]) + (vs->y * mat->m[1][1]) + (vs->z * mat->m[2][1]);
    vd->z = (vs->x * mat->m[0][2]) + (vs->y * mat->m[1][2]) + (vs->z * mat->m[2][2]);
#endif
}


/* Multiply list of given vectors with given matrix.
 */
void dm_vector_mul_by_mat_n(DMVector *list, const int nlist, const DMMatrix *mat)
{
    int i;

#ifdef DM_USE_SIMD
    asm volatile(
        "mov           %0,        %%edx\n"
        "movups        (%%edx),   %%xmm4\n"
        "movups      16(%%edx),   %%xmm5\n"
        "movups      32(%%edx),   %%xmm6\n"
        "movups      48(%%edx),   %%xmm7\n"
        :
        : "m" (mat)
        : "%edx", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
#endif

    for (i = 0; i < nlist; i++)
    {
#ifdef DM_USE_SIMD
        asm volatile
            (
            // list[i] -> xmm0
            "movups      %1,     %%xmm0\n"
            
            // zero final result in xmm2
            "xorps       %%xmm2, %%xmm2\n"

            // perform shuffle and multiply and add whole "column" "X"
            "movups      %%xmm0, %%xmm1\n"
            "shufps      $0x00,  %%xmm1, %%xmm1\n"
            "mulps       %%xmm4, %%xmm1\n"
            "addps       %%xmm1, %%xmm2\n"

            // Y
            "movups      %%xmm0, %%xmm1\n"
            "shufps      $0x55,  %%xmm1, %%xmm1\n"
            "mulps       %%xmm5, %%xmm1\n"
            "addps       %%xmm1, %%xmm2\n"

            // Z
            "movups      %%xmm0, %%xmm1\n"
            "shufps      $0xAA,  %%xmm1, %%xmm1\n"
            "mulps       %%xmm6, %%xmm1\n"
            "addps       %%xmm1, %%xmm2\n"

            // W
            "movups      %%xmm0, %%xmm1\n"
            "shufps      $0xFF,  %%xmm1, %%xmm1\n"
            "mulps       %%xmm7, %%xmm1\n"
            "addps       %%xmm1, %%xmm2\n"

            // Result ->
            "movups      %%xmm2, %0\n"
            : "=m" (list[i])
            : "m" (list[i])
            : "memory", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7");
#else
        DMVector q;
        memcpy(&q, &list[i], sizeof(DMVector));

        list[i].x = (q.x * mat->m[0][0]) + (q.y * mat->m[1][0]) + (q.z * mat->m[2][0]);
        list[i].y = (q.x * mat->m[0][1]) + (q.y * mat->m[1][1]) + (q.z * mat->m[2][1]);
        list[i].z = (q.x * mat->m[0][2]) + (q.y * mat->m[1][2]) + (q.z * mat->m[2][2]);
#endif
    }
}


/* Set matrix to unit-matrix
 */
void dm_matrix_unit(DMMatrix *mat)
{
    memset(mat, 0, sizeof(DMMatrix));
    mat->m[0][0] = 1.0f;
    mat->m[1][1] = 1.0f;
    mat->m[2][2] = 1.0f;
    mat->m[3][3] = 1.0f;
}


/* Transpose the matrix mat2 to mat1
 */
void dm_matrix_transpose(DMMatrix *mat1, const DMMatrix *mat2)
{
    int i, j;

    for (i = 0; i < DM_MATRIX_SIZE; i++)
        for (j = 0; j < DM_MATRIX_SIZE; j++)
            mat1->m[i][j] = mat2->m[j][i];
}


/* Multiply matrices mat1 and mat2, putting result into mat1
 */
void dm_matrix_mul_r(DMMatrix *dst, const DMMatrix *mat1, const DMMatrix *mat2)
{
#ifdef DM_USE_SIMD
    asm volatile(
        "mov           %1,        %%ebx\n"
        "mov           %2,        %%edx\n"

        // --------------------------------------------------

        // 0
        "movups        (%%ebx),   %%xmm0\n" // mat1[0]
        "movups        (%%edx),   %%xmm1\n" // mat2[0]
        "shufps        $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
        "mulps         %%xmm0,    %%xmm1\n"
        "movups        %%xmm1,    %%xmm3\n"
        
        // 1
        "movups        16(%%ebx), %%xmm0\n" // mat1[0]
        "movups        (%%edx),   %%xmm1\n" // mat2[0]
        "shufps        $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 2
        "movups        32(%%ebx), %%xmm0\n" // mat1[0]
        "movups        (%%edx),   %%xmm1\n" // mat2[0]
        "shufps        $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 3
        "movups        48(%%ebx), %%xmm0\n" // mat1[0]
        "movups        (%%edx),   %%xmm1\n" // mat2[0]
        "shufps        $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        "mov           %0,        %%ebx\n"
        "movups        %%xmm3,    (%%ebx)\n"

        // --------------------------------------------------

        "mov           %1,        %%ebx\n"

        // 0
        "movups        (%%ebx),   %%xmm0\n" // mat1[0]
        "movups        16(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
        "mulps         %%xmm0,    %%xmm1\n"
        "movups        %%xmm1,    %%xmm3\n"
        
        // 1
        "movups        16(%%ebx), %%xmm0\n" // mat1[0]
        "movups        16(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 2
        "movups        32(%%ebx), %%xmm0\n" // mat1[0]
        "movups        16(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 3
        "movups        48(%%ebx), %%xmm0\n" // mat1[0]
        "movups        16(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        "mov           %0,        %%ebx\n"
        "movups        %%xmm3,    16(%%ebx)\n"

        // --------------------------------------------------

        "mov           %1,        %%ebx\n"

        // 0
        "movups        (%%ebx),   %%xmm0\n" // mat1[0]
        "movups        32(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
        "mulps         %%xmm0,    %%xmm1\n"
        "movups        %%xmm1,    %%xmm3\n"
        
        // 1
        "movups        16(%%ebx), %%xmm0\n" // mat1[0]
        "movups        32(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 2
        "movups        32(%%ebx), %%xmm0\n" // mat1[0]
        "movups        32(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 3
        "movups        48(%%ebx), %%xmm0\n" // mat1[0]
        "movups        32(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        "mov           %0,        %%ebx\n"
        "movups        %%xmm3,    32(%%ebx)\n"

        // --------------------------------------------------

        "mov           %1,        %%ebx\n"

        // 0
        "movups        (%%ebx),   %%xmm0\n" // mat1[0]
        "movups        48(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
        "mulps         %%xmm0,    %%xmm1\n"
        "movups        %%xmm1,    %%xmm3\n"
        
        // 1
        "movups        16(%%ebx), %%xmm0\n" // mat1[0]
        "movups        48(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 2
        "movups        32(%%ebx), %%xmm0\n" // mat1[0]
        "movups        48(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        // 3
        "movups        48(%%ebx), %%xmm0\n" // mat1[0]
        "movups        48(%%edx), %%xmm1\n" // mat2[1]
        "shufps        $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
        "mulps         %%xmm0,    %%xmm1\n"
        "addps         %%xmm1,    %%xmm3\n"

        "mov           %0,        %%ebx\n"
        "movups        %%xmm3,    48(%%ebx)\n"

        : "=m" (dst)
        : "m" (mat1), "m" (mat2)
        : "memory", "%edx", "%ebx", "%xmm0", "%xmm2", "%xmm3"
        );
#else
    int i, j;
    for (i = 0; i < DM_MATRIX_SIZE; i++)
        for (j = 0; j < DM_MATRIX_SIZE; j++)
            dst->m[i][j] =
                (mat1->m[i][0] * mat2->m[0][j]) +
                (mat1->m[i][1] * mat2->m[1][j]) +
                (mat1->m[i][2] * mat2->m[2][j]);
#endif
}


void dm_matrix_mul(DMMatrix *mat1, const DMMatrix *mat2)
{
    DMMatrix tmpM;
    dm_matrix_mul_r(&tmpM, mat1, mat2);
    memcpy(mat1, &tmpM, sizeof(DMMatrix));
}


/* Multiply given list of matrices (size of nMatrices units) with given matrix.
 */
void dm_matrix_mul_n(DMMatrix * list, const int nlist, const DMMatrix *mat)
{
    int i;
    for (i = 0; i < nlist; i++)
        dm_matrix_mul(&list[i], mat);
}


/* Optimized rotation matrix creation
 */
void dm_matrix_rot(DMMatrix *mat,
    const DMFloat sx, const DMFloat sy, const DMFloat sz,
    const DMFloat cx, const DMFloat cy, const DMFloat cz)
{
    const DMFloat
        q = cx * sz,
        l = cx * cz,
        i = sx * sz,
        j = sx * cz;

    memset(mat, 0, sizeof(DMMatrix));

    mat->m[0][0] = cy * cz;
    mat->m[0][1] = cy * sz;
    mat->m[0][2] = -sy;


    mat->m[1][0] = (sy * j) - q;
    mat->m[1][1] = (sy * i) + l;
    mat->m[1][2] = sx * cy;


    mat->m[2][0] = (sy * l) + i;
    mat->m[2][1] = (sy * q) - j;
    mat->m[2][2] = cx * cy;

    mat->m[3][3] = 1.0f;
}
