/***************************************************************************

    M.A.M.E.32  -  Multiple Arcade Machine Emulator for Win32
    Win32 Portions Copyright (C) 1997 Michael Soderstrom and Chris Kirmse
    
    This file is part of MAME32, and may only be used, modified and
    distributed under the terms of the MAME license, in "readme.txt".
    By continuing to use, modify or distribute this file you indicate
    that you have read the license and understand and accept it fully.

 ***************************************************************************/

/***************************************************************************

  RenderBitmap.c

    16bit VScanlines HScanlines   Double    
       0       0          0          0     RenderBitmap()
       0       0          0          1     RenderDoubleBitmap()
       0       0          1          0     -
       0       0          1          1     RenderDoubleHScanlinesBitmap()
       0       1          0          0     -
       0       1          0          1     RenderDoubleVScanlinesBitmap()
       0       1          1          0     -
       0       1          1          1     RenderDoubleHVScanlinesBitmap() (defined, not currently selected)
       1       0          0          0     RenderBitmap16()
       1       0          0          1     RenderDoubleBitmap16()
       1       0          1          0     -
       1       0          1          1     RenderDoubleHScanlinesBitmap16()
       1       1          0          0     -
       1       1          0          1     RenderDoubleVScanlinesBitmap16()
       1       1          1          0     -
       1       1          1          1     RenderDoubleHVScanlinesBitmap16() (defined, not currently selected)

 ***************************************************************************/

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <assert.h>
#include "osdepend.h"
#include "RenderBitmap.h"
#include "MAME32.h"
#include "Display.h"
#include "dirty.h"

#ifdef MAME_MMX
#include <mmintrin.h>

/***************************************************************************
    Function prototypes
 ***************************************************************************/
#define RENDERARGS struct osd_bitmap* pSrcBitmap, UINT nSrcStartLine, UINT nSrcStartColumn, UINT nNumLines, UINT nNumColumns, BYTE* pDst, UINT nDstWidth

static void RenderBitmap(RENDERARGS);
static void RenderDoubleBitmap(RENDERARGS);
static void RenderDoubleHScanlinesBitmap(RENDERARGS);
static void RenderDoubleVScanlinesBitmap(RENDERARGS); 
static void RenderDoubleHVScanlinesBitmap(RENDERARGS); 

static void RenderDirtyBitmap(RENDERARGS); 
static void RenderDirtyDoubleBitmap(RENDERARGS); 
static void RenderDirtyDoubleHScanlinesBitmap(RENDERARGS); 
static void RenderDirtyDoubleVScanlinesBitmap(RENDERARGS); 
static void RenderDirtyDoubleHVScanlinesBitmap(RENDERARGS); 

static void RenderBitmap16(RENDERARGS); 
static void RenderDoubleBitmap16(RENDERARGS); 
static void RenderDoubleHScanlinesBitmap16(RENDERARGS); 
static void RenderDoubleVScanlinesBitmap16(RENDERARGS); 
static void RenderDoubleHVScanlinesBitmap16(RENDERARGS); 

static void RenderDirtyBitmap16(RENDERARGS); 
static void RenderDirtyDoubleBitmap16(RENDERARGS); 
static void RenderDirtyDoubleHScanlinesBitmap16(RENDERARGS); 
static void RenderDirtyDoubleVScanlinesBitmap16(RENDERARGS); 
static void RenderDirtyDoubleHVScanlinesBitmap16(RENDERARGS); 

static __inline void    DoubleLine(BYTE* pSrc, UINT nSrcHalfWidth, BYTE* pDst);
static __inline void    DoubleDirtyLine(BYTE* pSrc, UINT x, UINT y, UINT nSrcXMax, BYTE* pDst);
static __inline void    ExpandLine(BYTE* pSrc, UINT nSrcHalfWidth, BYTE* pDst, __m64 bg);
static __inline void    ExpandDirtyLine(BYTE* pSrc, UINT x, UINT y, UINT nSrcXMax, BYTE* pDst, BYTE bg);
static __inline void    DoubleLine2(BYTE* pSrc, UINT nSrcHalfWidth, BYTE* pDst, UINT nDstWidth);
static __inline void    DoubleDirty2Lines(BYTE* pSrc, UINT x, UINT y, UINT nSrcXMax, BYTE* pDst, UINT nDstWidth);

static __inline void    DoubleLine16(WORD* pSrc, UINT nSrcWidth, BYTE* pDst);
static __inline void    DoubleDirtyLine16(BYTE* pSrc, UINT nSrcWidth, BYTE* pDst, UINT y);
static __inline void    ExpandLine16(WORD* pSrc, UINT nSrcWidth, BYTE* pDst, __m64 bg);
static __inline void    ExpandDirtyLine16(BYTE* pSrc, UINT nSrcWidth, BYTE* pDst, BYTE bg, UINT y);

/***************************************************************************
    External variables
 ***************************************************************************/


/***************************************************************************
    Internal structures
 ***************************************************************************/


/***************************************************************************
    Internal variables
 ***************************************************************************/


/***************************************************************************
    External function definitions
 ***************************************************************************/

/*
 * Pick the MMX blitter matching the requested display options.
 *
 *   bDouble      TRUE selects the pixel-doubling renderers.
 *   bHScanLines  TRUE selects the "HScanlines" doubled renderers
 *                (only consulted when bDouble is TRUE).
 *   bVScanLines  TRUE selects the "VScanlines" doubled renderers
 *                (only consulted when bDouble is TRUE and bHScanLines is
 *                not TRUE, i.e. HScanlines wins when both are set; the
 *                HV renderers in this file are never returned).
 *   eDirtyMode   USE_DIRTYRECT selects the dirty-aware renderer where one
 *                is enabled.
 *   b16bit       TRUE selects the 16-bit variants, otherwise 8-bit.
 *
 * Returns the renderer, or NULL when no MMX renderer is enabled for the
 * combination:
 *   - not doubled and eDirtyMode != USE_DIRTYRECT;
 *   - doubled with H or V scanlines and eDirtyMode == USE_DIRTYRECT.
 * NOTE(review): callers presumably fall back to a non-MMX renderer on
 * NULL -- confirm at the call site.
 *
 * The BOOL arguments are compared against TRUE exactly, so a non-zero
 * value other than TRUE is treated as "off".
 */
RenderMethod SelectRenderMethodMMX(BOOL bDouble, BOOL bHScanLines, BOOL bVScanLines,
                                enum DirtyMode eDirtyMode, BOOL b16bit)
{
    const BOOL bDirtyRect = (eDirtyMode == USE_DIRTYRECT);
    const BOOL bDeep      = (b16bit == TRUE);

    if (bDouble != TRUE)
    {
        /* 1:1 blits: only the dirty-rect renderers are enabled. */
        if (!bDirtyRect)
            return NULL;

        return bDeep ? RenderDirtyBitmap16 : RenderDirtyBitmap;
    }

    if (bHScanLines == TRUE)
    {
        /* The dirty HScanlines renderers are disabled. */
        if (bDirtyRect)
            return NULL;

        return bDeep ? RenderDoubleHScanlinesBitmap16 : RenderDoubleHScanlinesBitmap;
    }

    if (bVScanLines == TRUE)
    {
        /* The dirty VScanlines renderers are disabled. */
        if (bDirtyRect)
            return NULL;

        return bDeep ? RenderDoubleVScanlinesBitmap16 : RenderDoubleVScanlinesBitmap;
    }

    /*
     * Plain doubling. There is no dirty-aware 16-bit doubler, so the
     * full-frame 16-bit doubler is used in both dirty modes.
     */
    if (bDeep)
        return RenderDoubleBitmap16;

    return bDirtyRect ? RenderDirtyDoubleBitmap : RenderDoubleBitmap;
}

/***************************************************************************
    Internal functions
 ***************************************************************************/

static void RenderBitmap(RENDERARGS) /* not used */
{
    BYTE*   pSrc = pSrcBitmap->line[nSrcStartLine];

    while (nNumLines--)
    {
        memcpy(pDst, pSrc, pSrcBitmap->width);
        
        pSrc += pSrcBitmap->width;
        pDst += nDstWidth;
    }
}

static void RenderBitmap16(RENDERARGS) /* not used */
{
    unsigned short* pSrc = (unsigned short*)pSrcBitmap->line[nSrcStartLine];
    unsigned int    nLen = pSrcBitmap->width << 1;

    while (nNumLines--)
    {
        memcpy(pDst, pSrc, nLen);
        
        pSrc += pSrcBitmap->width;
        pDst += nDstWidth;
    }
}

/*
 * 8-bit, 1:1 blit of the dirty parts of a rectangle.
 *
 * The start column is rounded up to a multiple of 8 and the column count
 * is rounded down to a multiple of 8. For each source row y that
 * IsDirtyLine() reports dirty, the two dirty buffers are OR-ed together,
 * eight flag bytes (= 64 pixels) at a time; every non-zero flag byte
 * causes the matching group of 8 pixels to be copied with one 64-bit
 * move. A whole 32-pixel half is skipped when its four flag bytes are
 * all zero.
 *
 * pDst addresses the first destination row; rows are nDstWidth bytes
 * apart.
 * NOTE(review): pDst is not advanced when the start column is rounded
 * up, so callers presumably pass 8-aligned columns -- confirm.
 *
 * Changes from the previous version:
 *  - the row-exit label now labels a statement (a label directly before
 *    '}' is not valid C before C23);
 *  - the column count can no longer wrap around when the rectangle is
 *    narrower than the alignment skip;
 *  - the "skip 32 clean pixels" path now stops at nNumColumns like the
 *    copy path does, instead of possibly copying the following half;
 *  - the flag scratch area is a union rather than a char array accessed
 *    through __m64 / int pointers.
 */

/*
 * Copy one 8-pixel group if its dirty flag is set, then advance to the
 * next group and leave the current row once nNumColumns is reached.
 * Uses the locals of RenderDirtyBitmap by name.
 */
#define RDB_COPY8_IF(isDirty)                                                   \
    do {                                                                        \
        if (isDirty)                                                            \
            *pdwDst = *(__m64 *)(pSrcBitmap->line[y] + x + nSrcStartColumn);    \
        x += 8;                                                                 \
        pdwDst++;                                                               \
        if (x >= nNumColumns)                                                   \
            goto RDBEnd;                                                        \
    } while (0)

static void RenderDirtyBitmap(RENDERARGS)
{
    UINT    y;
    UINT    nMisalign = nSrcStartColumn & 0x7;

    if (nMisalign)
    {
        UINT nSkip = 8 - nMisalign;

        /* Clamp instead of letting the unsigned subtraction wrap. */
        nNumColumns      = (nNumColumns > nSkip) ? nNumColumns - nSkip : 0;
        nSrcStartColumn += nSkip;
    }
    nNumColumns -= nNumColumns & 0x7;

    for (y = nSrcStartLine; y < nNumLines + nSrcStartLine; y++)
    {
        if (IsDirtyLine(y))
        {
            /* One dirty flag byte per 8 pixels; read 8 flags at a time. */
            __m64 *dirty1 = (__m64 *) ((BYTE *)(dirty_buffer1+y*dirty_width)+(nSrcStartColumn>>3));
            __m64 *dirty2 = (__m64 *) ((BYTE *)(dirty_buffer2+y*dirty_width)+(nSrcStartColumn>>3));
            union
            {
                __m64   m64;
                int     half[2];    /* flags for pixels 0-31 / 32-63 */
                char    flag[8];    /* one flag per 8-pixel group    */
            } dirty;
            UINT   x      = 0;
            __m64 *pdwDst = (__m64 *)(pDst + (y - nSrcStartLine) * nDstWidth);

            while (x < nNumColumns)
            {
                dirty.m64 = _m_por(*dirty1, *dirty2);
                dirty1++;
                dirty2++;

                if (dirty.half[0])
                {
                    RDB_COPY8_IF(dirty.flag[0]);
                    RDB_COPY8_IF(dirty.flag[1]);
                    RDB_COPY8_IF(dirty.flag[2]);
                    RDB_COPY8_IF(dirty.flag[3]);
                }
                else
                {
                    x      += 32;
                    pdwDst += 4;
                    if (x >= nNumColumns)
                        goto RDBEnd;
                }

                if (dirty.half[1])
                {
                    RDB_COPY8_IF(dirty.flag[4]);
                    RDB_COPY8_IF(dirty.flag[5]);
                    RDB_COPY8_IF(dirty.flag[6]);
                    RDB_COPY8_IF(dirty.flag[7]);
                }
                else
                {
                    x      += 32;
                    pdwDst += 4;
                }
            }
        }
RDBEnd:
        ;
    }
    _m_empty();
}

#undef RDB_COPY8_IF

/*
 * 16-bit, 1:1 blit of the dirty parts of a rectangle.
 *
 * Same scheme as the 8-bit dirty blit: the start column is rounded up
 * and the column count rounded down to multiples of 8; for each row that
 * IsDirtyLine() reports dirty the two dirty buffers are OR-ed, eight
 * flag bytes (= 64 pixels) at a time. Each non-zero flag byte copies its
 * group of 8 pixels, which is 16 bytes here, i.e. two 64-bit moves.
 * A 32-pixel half whose four flags are all zero is skipped at once.
 *
 * pDst addresses the first destination row; rows are nDstWidth bytes
 * apart.
 * NOTE(review): pDst is not advanced when the start column is rounded
 * up, so callers presumably pass 8-aligned columns -- confirm.
 *
 * Changes from the previous version:
 *  - the row-exit label now labels a statement (a label directly before
 *    '}' is not valid C before C23);
 *  - the column count can no longer wrap around when the rectangle is
 *    narrower than the alignment skip;
 *  - the "skip 32 clean pixels" path now stops at nNumColumns like the
 *    copy path does;
 *  - the flag scratch area is a union rather than a char array accessed
 *    through __m64 / int pointers.
 */

/*
 * Copy one 8-pixel (two __m64) group if its dirty flag is set, then
 * advance source, destination and x, and leave the current row once
 * nNumColumns is reached. Uses the locals of RenderDirtyBitmap16 by name.
 */
#define RDB16_COPY8_IF(isDirty)         \
    do {                                \
        if (isDirty)                    \
        {                               \
            pdwDst[0] = pSrc[0];        \
            pdwDst[1] = pSrc[1];        \
        }                               \
        pdwDst += 2;                    \
        pSrc   += 2;                    \
        x      += 8;                    \
        if (x >= nNumColumns)           \
            goto RDBEnd16;              \
    } while (0)

static void RenderDirtyBitmap16(RENDERARGS)
{
    UINT    y;
    UINT    nMisalign = nSrcStartColumn & 0x7;

    if (nMisalign)
    {
        UINT nSkip = 8 - nMisalign;

        /* Clamp instead of letting the unsigned subtraction wrap. */
        nNumColumns      = (nNumColumns > nSkip) ? nNumColumns - nSkip : 0;
        nSrcStartColumn += nSkip;
    }
    nNumColumns -= nNumColumns & 0x7;

    for (y = nSrcStartLine; y < nNumLines + nSrcStartLine; y++)
    {
        if (IsDirtyLine(y))
        {
            /* One dirty flag byte per 8 pixels; read 8 flags at a time. */
            __m64 *dirty1 = (__m64 *) (((BYTE*)(dirty_buffer1+y*dirty_width))+(nSrcStartColumn>>3));
            __m64 *dirty2 = (__m64 *) (((BYTE*)(dirty_buffer2+y*dirty_width))+(nSrcStartColumn>>3));
            union
            {
                __m64   m64;
                int     half[2];    /* flags for pixels 0-31 / 32-63 */
                char    flag[8];    /* one flag per 8-pixel group    */
            } dirty;
            UINT   x      = 0;
            __m64 *pdwDst = (__m64 *)(pDst + (y - nSrcStartLine) * nDstWidth);
            /* line[] is a byte pointer: two bytes per 16-bit pixel. */
            __m64 *pSrc   = (__m64 *)(pSrcBitmap->line[y]+nSrcStartColumn*2);

            while (x < nNumColumns)
            {
                dirty.m64 = _m_por(*dirty1, *dirty2);
                dirty1++;
                dirty2++;

                if (dirty.half[0])
                {
                    RDB16_COPY8_IF(dirty.flag[0]);
                    RDB16_COPY8_IF(dirty.flag[1]);
                    RDB16_COPY8_IF(dirty.flag[2]);
                    RDB16_COPY8_IF(dirty.flag[3]);
                }
                else
                {
                    x      += 32;
                    pdwDst += 8;
                    pSrc   += 8;
                    if (x >= nNumColumns)
                        goto RDBEnd16;
                }

                if (dirty.half[1])
                {
                    RDB16_COPY8_IF(dirty.flag[4]);
                    RDB16_COPY8_IF(dirty.flag[5]);
                    RDB16_COPY8_IF(dirty.flag[6]);
                    RDB16_COPY8_IF(dirty.flag[7]);
                }
                else
                {
                    x      += 32;
                    pdwDst += 8;
                    pSrc   += 8;
                }
            }
        }
RDBEnd16:
        ;
    }
    _m_empty();
}

#undef RDB16_COPY8_IF

/*
 * 8-bit pixel doubling of a full rectangle.
 *
 * Each source row is handed to DoubleLine2(), which writes the
 * horizontally doubled row to pDst and to the row one destination pitch
 * below it. pDst then moves down two destination rows.
 */
static void RenderDoubleBitmap(RENDERARGS)
{
    int     nSrcGroups     = nNumColumns >> 3;  /* 8-pixel groups per source row   */
    UINT    nDstPitch64    = nDstWidth >> 3;    /* destination pitch in __m64 units */
    UINT    nDstTwoRows    = nDstWidth << 1;    /* bytes covered by two dst rows    */
    UINT    nRow;

    for (nRow = 0; nRow < nNumLines; nRow++)
    {
        DoubleLine2(pSrcBitmap->line[nSrcStartLine + nRow]+nSrcStartColumn,
                    nSrcGroups, pDst, nDstPitch64);

        pDst += nDstTwoRows;
    }
    _m_empty();
}

static void RenderDoubleBitmap16(RENDERARGS)
{
    int nQrtWidth = nNumColumns >> 2;

    while (nNumLines--)
    {
        DoubleLine16(pSrcBitmap->line[nSrcStartLine]+nSrcStartColumn, nQrtWidth, pDst);
        pDst += nDstWidth;

        DoubleLine16(pSrcBitmap->line[nSrcStartLine]+nSrcStartColumn, nQrtWidth, pDst);
        pDst += nDstWidth;

        nSrcStartLine++;
    }
    _m_empty();
}


static void RenderDirtyDoubleBitmap(RENDERARGS)
{
    UINT    nDstQrtWidth = nDstWidth >> 3;
    UINT    nDstDoubleWidth = nDstWidth << 1;

    while (nNumLines--)
    {
        if (IsDirtyLine(nSrcStartLine))
            DoubleDirty2Lines(pSrcBitmap->line[nSrcStartLine],
                          nSrcStartColumn, nSrcStartLine, 
                          nSrcStartColumn + nNumColumns,
                          pDst, nDstQrtWidth);
        nSrcStartLine++;
        pDst += nDstDoubleWidth;
    }
    _m_empty();
}

static void RenderDoubleHScanlinesBitmap(RENDERARGS)
{
    int     nDstDoubleWidth = nDstWidth << 1;
    int     nSrcHalfWidth   = nNumColumns >> 3;

    while (nNumLines--)
    {
        DoubleLine(pSrcBitmap->line[nSrcStartLine++]+nSrcStartColumn, nSrcHalfWidth, pDst);
        pDst += nDstDoubleWidth;
    }
    _m_empty();
}

static void RenderDoubleHScanlinesBitmap16(RENDERARGS)
{
    int     nDstDoubleWidth = nDstWidth << 1;
    int nQrtWidth = nNumColumns >> 2;

    while (nNumLines--)
    {
        DoubleLine16(pSrcBitmap->line[nSrcStartLine++]+nSrcStartColumn, nQrtWidth, pDst);
        pDst += nDstDoubleWidth;
    }
    _m_empty();
}

static void RenderDirtyDoubleHScanlinesBitmap(RENDERARGS)
{
    int     nDstDoubleWidth = nDstWidth << 1;

    while (nNumLines--)
    {
        DoubleDirtyLine(pSrcBitmap->line[nSrcStartLine],
                        nSrcStartColumn, nSrcStartLine,
                        nSrcStartColumn + nNumColumns, pDst);
        pDst += nDstDoubleWidth;
    }
    _m_empty();
}

static void RenderDoubleVScanlinesBitmap(RENDERARGS)
{
    int     nSrcHalfWidth   = nNumColumns >> 3;
    __m64   mBlackPen;

    {
        BYTE    nBlackPen       = MAME32App.m_pDisplay->GetBlackPen();
        DWORD  temp;

        temp = ((DWORD)nBlackPen) * 0x01010101;
        mBlackPen = _m_por(_m_psllq(_m_from_int(temp),32),_m_from_int(temp));
    }


        

    while (nNumLines--)
    {
        ExpandLine(pSrcBitmap->line[nSrcStartLine]+nSrcStartColumn, nSrcHalfWidth, pDst, mBlackPen);
        pDst += nDstWidth;

        ExpandLine(pSrcBitmap->line[nSrcStartLine]+nSrcStartColumn, nSrcHalfWidth, pDst, mBlackPen);
        pDst += nDstWidth;
        nSrcStartLine++;
    }
    _m_empty();
}

static void RenderDoubleVScanlinesBitmap16(RENDERARGS)
{
    __m64   mBlackPen;

    nNumColumns >>= 2;

    {
        WORD    nBlackPen       = MAME32App.m_pDisplay->GetBlackPen();
        DWORD  temp;

        temp = ((DWORD)nBlackPen)* 0x00010001;
        mBlackPen = _m_por(_m_psllq(_m_from_int(temp),32),_m_from_int(temp));
    }

    while (nNumLines--)
    {
        WORD*   pSrc            = ((WORD*)pSrcBitmap->line[nSrcStartLine])+nSrcStartColumn;
        ExpandLine16(pSrc, nNumColumns, pDst, mBlackPen);
        pDst += nDstWidth;

        ExpandLine16(pSrc, nNumColumns, pDst, mBlackPen);
        pDst += nDstWidth;

        nSrcStartLine++;
    }
    _m_empty();
}

static void RenderDirtyDoubleVScanlinesBitmap(RENDERARGS)
{
    BYTE    nBlackPen = MAME32App.m_pDisplay->GetBlackPen();

    while (nNumLines--)
    {
        ExpandDirtyLine(pSrcBitmap->line[nSrcStartLine],
                        nSrcStartColumn, nSrcStartLine,
                        nSrcStartColumn + nNumColumns,
                        pDst, nBlackPen);
        pDst += nDstWidth;

        ExpandDirtyLine(pSrcBitmap->line[nSrcStartLine],
                        nSrcStartColumn, nSrcStartLine,
                        nSrcStartColumn + nNumColumns,
                        pDst, nBlackPen);
        pDst += nDstWidth;

        nSrcStartLine++;
    }
    _m_empty();
}

static void RenderDoubleHVScanlinesBitmap(RENDERARGS)
{
    int     nDstDoubleWidth = nDstWidth << 1;
    int     nSrcHalfWidth   = nNumColumns >> 3;
    __m64   mBlackPen;

    {
        BYTE    nBlackPen       = MAME32App.m_pDisplay->GetBlackPen();
        DWORD  temp;

        temp = (DWORD)((nBlackPen << 24) | (nBlackPen << 8));
        mBlackPen = _m_por(_m_psllq(_m_from_int(temp),32),_m_from_int(temp));
    }

    while (nNumLines--)
    {
        ExpandLine(pSrcBitmap->line[nSrcStartLine++]+nSrcStartColumn, nSrcHalfWidth, pDst, mBlackPen);
        pDst += nDstDoubleWidth;
    }
    _m_empty();
}

static void RenderDoubleHVScanlinesBitmap16(RENDERARGS)
{
    int     nDstDoubleWidth = nDstWidth << 1;
    WORD    nBlackPen       = MAME32App.m_pDisplay->GetBlackPen();
    __m64   mBlackPen;
    
    nNumColumns >>= 2;

    {
        WORD    nBlackPen       = MAME32App.m_pDisplay->GetBlackPen();
        DWORD  temp;

        temp = ((DWORD)nBlackPen)* 0x00010001;
        mBlackPen = _m_por(_m_psllq(_m_from_int(temp),32),_m_from_int(temp));
    }

    while (nNumLines--)
    {
        ExpandLine16(((WORD*)pSrcBitmap->line[nSrcStartLine])+nSrcStartColumn, 
            nNumColumns, pDst, mBlackPen);
        pDst += nDstDoubleWidth;
        nSrcStartLine++;
    }
    _m_empty();
}

/*
 * 8-bit pixel doubling with horizontal and vertical scanlines, dirty
 * parts only. No prototype or caller in this file selects it.
 *
 * Each source row is expanded once by ExpandDirtyLine() (gap pixels use
 * the display's black pen); pDst then advances by two destination rows.
 */
static void RenderDirtyDoubleHVScanlinesBitmap(RENDERARGS)
{
    int     nDstTwoRows = nDstWidth << 1;
    BYTE    nGapPen     = MAME32App.m_pDisplay->GetBlackPen();

    for (; nNumLines != 0; nNumLines--)
    {
        ExpandDirtyLine(pSrcBitmap->line[nSrcStartLine],
                        nSrcStartColumn, nSrcStartLine,
                        nSrcStartColumn + nNumColumns,
                        pDst, nGapPen);

        nSrcStartLine++;
        pDst += nDstTwoRows;
    }
    _m_empty();
}

/* support functions */

/*
 * Horizontally double one row of 8-bit pixels.
 *
 * pSrc is read as nSrcHalfWidth groups of 8 bytes. For every group, two
 * 8-byte groups are written to pDst in which each source byte appears
 * twice in a row (byte unpack of the group with itself). The caller is
 * responsible for calling _m_empty() afterwards.
 */
static __inline void DoubleLine(BYTE* pSrc, UINT nSrcHalfWidth, BYTE* pDst)
{
    __m64 *pIn  = (__m64 *) pSrc;
    __m64 *pOut = (__m64 *) pDst;
    UINT   i;

    for (i = 0; i < nSrcHalfWidth; i++)
    {
        pOut[2 * i]     = _m_punpcklbw(pIn[i], pIn[i]);    /* source bytes 0-3 */
        pOut[2 * i + 1] = _m_punpckhbw(pIn[i], pIn[i]);    /* source bytes 4-7 */
    }
}

/*
 * Horizontally double one row of 16-bit pixels.
 *
 * pSrc is read as nSrcWidth groups of four 16-bit pixels (8 bytes). For
 * every group, two 8-byte groups are written to pDst in which each
 * source word appears twice in a row (word unpack of the group with
 * itself). The caller is responsible for calling _m_empty() afterwards.
 */
static __inline void DoubleLine16(WORD* pSrc, UINT nSrcWidth, BYTE* pDst)
{
    __m64 *pIn  = (__m64 *) pSrc;
    __m64 *pOut = (__m64 *) pDst;
    UINT   i;

    for (i = 0; i < nSrcWidth; i++)
    {
        pOut[2 * i]     = _m_punpcklwd(pIn[i], pIn[i]);    /* source pixels 0-1 */
        pOut[2 * i + 1] = _m_punpckhwd(pIn[i], pIn[i]);    /* source pixels 2-3 */
    }
}

/*
 * Horizontally double the dirty pixel pairs of one 8-bit row.
 *
 *   pSrc      start of the source row (indexed with x).
 *   x         first source column to process.
 *   y         source row number, used only for the dirty queries.
 *   nSrcXMax  one past the last source column to process.
 *   pDst      destination for column x; each source pixel pair maps to
 *             one DWORD (4 bytes) of output.
 *
 * When x is a multiple of 32 and IsDirtyDword(x, y) is false, 32 source
 * pixels (16 output DWORDs) are skipped at once. Otherwise pixels are
 * handled two at a time: if IsDirty2(x, y) is set, the pair is written
 * as p1 p1 p2 p2; clean pairs leave the destination untouched.
 *
 * Changes from the previous version:
 *  - the destination is walked with a DWORD pointer instead of using
 *    casts as lvalues (((DWORD*)pDst)++), which is not valid C;
 *  - the pixels are widened to DWORD before shifting, so shifting by 24
 *    no longer overflows a promoted signed int.
 */
static __inline void DoubleDirtyLine(BYTE* pSrc, UINT x, UINT y, UINT nSrcXMax, BYTE* pDst)
{
    DWORD*  pdwDst = (DWORD*)pDst;

    while (x < nSrcXMax)
    {
        if (((x % (32)) == 0) && !IsDirtyDword(x, y))
        {
            x      += 32;
            pdwDst += 16; /* 32 pixels * (1 DWORD / 4 pixels) * 2 */
        }
        else
        {
            if (IsDirty2(x, y))
            {
                DWORD pixel1 = pSrc[x];
                DWORD pixel2 = pSrc[x + 1];

                *pdwDst = (pixel2 << 24) | (pixel2 << 16) | (pixel1 << 8) | pixel1;
            }
            pdwDst++;
            x += 2;
        }
    }
}

/*
 * Horizontally expand the dirty pixel pairs of one 8-bit row, filling
 * the inserted pixels with a background pen.
 *
 *   pSrc      start of the source row (indexed with x).
 *   x         first source column to process.
 *   y         source row number, used only for the dirty queries.
 *   nSrcXMax  one past the last source column to process.
 *   pDst      destination for column x; each source pixel pair maps to
 *             one DWORD (4 bytes) of output.
 *   bg        pen written after every source pixel.
 *
 * When x is a multiple of 32 and IsDirtyDword(x, y) is false, 32 source
 * pixels (16 output DWORDs) are skipped at once. Otherwise pixels are
 * handled two at a time: if IsDirty2(x, y) is set, the pair is written
 * as p1 bg p2 bg; clean pairs leave the destination untouched.
 *
 * Changes from the previous version:
 *  - the destination is walked with a DWORD pointer instead of using
 *    casts as lvalues (((DWORD*)pDst)++), which is not valid C;
 *  - the bytes are widened to DWORD before shifting, so shifting by 24
 *    no longer overflows a promoted signed int.
 */
static __inline void ExpandDirtyLine(BYTE* pSrc, UINT x, UINT y, UINT nSrcXMax, BYTE* pDst, BYTE bg)
{
    DWORD*      pdwDst = (DWORD*)pDst;
    const DWORD dwBg   = bg;
    /* Background pen in output bytes 1 and 3. */
    const DWORD dwGaps = (dwBg << 24) | (dwBg << 8);

    while (x < nSrcXMax)
    {
        if (((x % (32)) == 0) && !IsDirtyDword(x, y))
        {
            x      += 32;
            pdwDst += 16; /* 32 pixels * (1 DWORD / 4 pixels) * 2 */
        }
        else
        {
            if (IsDirty2(x, y))
            {
                DWORD pixel1 = pSrc[x];
                DWORD pixel2 = pSrc[x + 1];

                *pdwDst = dwGaps | (pixel2 << 16) | pixel1;
            }
            pdwDst++;
            x += 2;
        }
    }
}

/*
 * Horizontally expand one row of 8-bit pixels, interleaving it with a
 * background pattern.
 *
 * pSrc is read as nSrcHalfWidth groups of 8 bytes. For every group, two
 * 8-byte groups are written to pDst: source bytes alternate with the
 * corresponding bytes of bg (byte unpack of the source group with bg).
 * The caller is responsible for calling _m_empty() afterwards.
 */
static __inline void ExpandLine(BYTE* pSrc, UINT nSrcHalfWidth, BYTE* pDst, __m64 bg)
{
    __m64 *pIn  = (__m64 *) pSrc;
    __m64 *pOut = (__m64 *) pDst;
    UINT   i;

    for (i = 0; i < nSrcHalfWidth; i++)
    {
        pOut[2 * i]     = _m_punpcklbw(pIn[i], bg);    /* source bytes 0-3 */
        pOut[2 * i + 1] = _m_punpckhbw(pIn[i], bg);    /* source bytes 4-7 */
    }
}

/*
 * Horizontally expand one row of 16-bit pixels, interleaving it with a
 * background pattern.
 *
 * pSrc is read as nSrcWidth groups of four 16-bit pixels (8 bytes). For
 * every group, two 8-byte groups are written to pDst: source words
 * alternate with the corresponding words of bg (word unpack of the
 * source group with bg). The caller is responsible for calling
 * _m_empty() afterwards.
 */
static __inline void ExpandLine16(WORD* pSrc, UINT nSrcWidth, BYTE* pDst, __m64 bg)
{
    __m64 *pIn  = (__m64 *) pSrc;
    __m64 *pOut = (__m64 *) pDst;
    UINT   i;

    for (i = 0; i < nSrcWidth; i++)
    {
        pOut[2 * i]     = _m_punpcklwd(pIn[i], bg);    /* source pixels 0-1 */
        pOut[2 * i + 1] = _m_punpckhwd(pIn[i], bg);    /* source pixels 2-3 */
    }
}

/*
 * Horizontally double one row of 8-bit pixels onto two destination rows.
 *
 * pSrc is read as nSrcHalfWidth groups of 8 bytes. Every doubled 8-byte
 * group is stored twice: at the current output position and nDstWidth
 * __m64 units (nDstWidth * 8 bytes) further on. The caller is
 * responsible for calling _m_empty() afterwards.
 */
static __inline void DoubleLine2(BYTE* pSrc, UINT nSrcHalfWidth, BYTE* pDst, UINT nDstWidth)
{
    __m64 *pIn    = (__m64 *) pSrc;
    __m64 *pUpper = (__m64 *) pDst;
    UINT   i;

    for (i = 0; i < nSrcHalfWidth; i++, pIn++, pUpper += 2)
    {
        /* Second row first, then first row, as a chained store. */
        pUpper[0] = pUpper[nDstWidth]     = _m_punpcklbw(*pIn, *pIn);
        pUpper[1] = pUpper[nDstWidth + 1] = _m_punpckhbw(*pIn, *pIn);
    }
}

/*
 * Horizontally double the dirty 8-pixel groups of one 8-bit row onto two
 * destination rows.
 *
 *   pSrc       start of the source row (indexed with x).
 *   x          first source column to process.
 *   y          source row number, used only for the dirty queries.
 *   nSrcXMax   one past the last source column to process.
 *   pDst       destination for column x.
 *   nDstWidth  distance to the second destination row, in __m64 units.
 *
 * When x is a multiple of 32 and IsDirtyDword(x, y) is false, 32 source
 * pixels (eight output __m64) are skipped at once. Otherwise 8 pixels
 * are handled per step: if IsDirty8(x, y) is set the doubled group is
 * stored to both rows; clean groups leave the destination untouched.
 */
static __inline void DoubleDirty2Lines(BYTE* pSrc, UINT x, UINT y, UINT nSrcXMax,
                                       BYTE* pDst, UINT nDstWidth)
{
    __m64*  pOut = (__m64 *) pDst;

    while (x < nSrcXMax)
    {
        if (((x % (32)) == 0) && !IsDirtyDword(x, y))
        {
            x    += 32;
            pOut += 8; /* 32 pixels * (1 DWORD / 4 pixels) * 2 */
            continue;
        }

        if (IsDirty8(x, y))
        {
            __m64 *pIn = (__m64 *)(pSrc + x);

            pOut[0] = pOut[nDstWidth]     = _m_punpcklbw(*pIn, *pIn);
            pOut[1] = pOut[nDstWidth + 1] = _m_punpckhbw(*pIn, *pIn);
        }

        /* Two output __m64 per 8 source pixels, written or not. */
        pOut += 2;
        x    += 8;
    }
}

#ifdef OLD
/*
 * Older form of the dirty two-row doubler, compiled only when OLD is
 * defined. It always starts at source column 0 and processes nSrcWidth
 * columns of row y.
 *
 * When x is a multiple of 32 and IsDirtyDword(x, y) is false, 32 source
 * pixels (eight output __m64) are skipped at once. Otherwise 8 pixels
 * are handled per step: if IsDirty8(x, y) is set the doubled group is
 * stored at the current output position and nDstWidth __m64 units
 * further on; clean groups leave the destination untouched.
 */
static __inline void DoubleDirtyLine2(BYTE* pSrc, UINT nSrcWidth, BYTE* pDst, 
                                      UINT y, UINT nDstWidth)
{
    __m64 *pOut = (__m64 *) pDst;
    UINT   x;

    for (x = 0; x < nSrcWidth; )
    {
        if (((x % (32)) == 0) && !IsDirtyDword(x, y))
        {
            x    += 32;
            pOut += 8; /* 32 pixels * (1 DWORD / 4 pixels) * 2 */
            continue;
        }

        if (IsDirty8(x, y))
        {
            __m64 *pIn = (__m64 *)(pSrc + x);

            pOut[0] = pOut[nDstWidth]     = _m_punpcklbw(*pIn, *pIn);
            pOut[1] = pOut[nDstWidth + 1] = _m_punpckhbw(*pIn, *pIn);
        }

        /* Two output __m64 per 8 source pixels, written or not. */
        pOut += 2;
        x    += 8;
    }
}
#endif


/* MAME_MMX */
#else
/* stub to allow compilation with no MMX */
/*
 * Stub used when the file is built without MAME_MMX: no MMX renderer is
 * ever available, so NULL is returned for every combination.
 *
 * The parameter list matches the MMX build (enum DirtyMode eDirtyMode);
 * the stub previously still declared the older "BOOL bUseDirty"
 * parameter, so the two build configurations disagreed on the prototype.
 */
RenderMethod SelectRenderMethodMMX(BOOL bDouble, BOOL bHScanLines, BOOL bVScanLines,
                                enum DirtyMode eDirtyMode, BOOL b16bit)
{
    (void)bDouble;
    (void)bHScanLines;
    (void)bVScanLines;
    (void)eDirtyMode;
    (void)b16bit;

    return NULL;
}
#endif

/* do not add code after this point */
