/* $Id: mach64dma.c,v 1.8 1999/12/12 10:44:17 johnc Exp $ */

/*
 * GLX Hardware Device Driver for ATI Rage Pro
 * Copyright (C) 1999 Gareth Hughes
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Based on MGA driver: mgadma.c ???
 *
 *    Gareth Hughes <garethh@bell-labs.com>
 */

#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>

#include "mach64glx.h"
#include "pb.h"

/* public vars */

/* Buffer currently being filled.  mach64DmaResetBuffer() points this at
 * dmaBuffers[ mach64ActiveDmaBuffer ].
 */
mach64Dma_buffer	*dma_buffer;	/* dmaBuffers[ activeDmaBuffer ] */


/* This will be overwritten from the default values when glx.so is
 * loaded on the client.
 */
void mach64ServerDmaFlush( int wait );
/* Flush entry point that mach64DmaFlush() and mach64DmaFinish() call
 * through; it starts out pointing at mach64ServerDmaFlush().
 */
void    (*mach64DoDmaFlush)( int ) = mach64ServerDmaFlush;
/* Index into dmaBuffers[] of the buffer in use.  mach64ServerDmaFlush()
 * toggles it with "^= 1" after every non-empty flush.
 */
mach64UI32	mach64ActiveDmaBuffer = 0;

/* forward declarations for functions used before their definition */
void mach64DmaResetBuffer( void );
static void mach64FlushPseudoDma( void );


/*
 * _SWAP
 * Return `a` with its four low-order bytes in reverse order
 * (byte 0 <-> byte 3, byte 1 <-> byte 2).  The top term is not masked,
 * so the result is a pure byte reversal when unsigned is 32 bits wide.
 */
unsigned _SWAP( unsigned a )
{
    unsigned    fromByte0 = ( a & 0xff ) << 24;
    unsigned    fromByte1 = ( ( a >> 8 ) & 0xff ) << 16;
    unsigned    fromByte2 = ( ( a >> 16 ) & 0xff ) << 8;
    unsigned    fromByte3 = a >> 24;

    return fromByte0 | fromByte1 | fromByte2 | fromByte3;
}


/*
 * delay
 * Deliberately empty.  mach64WaitForDmaCompletion() calls this in a
 * counted loop between polls of the status register.
 */
static void delay( void )
{
    return;
}

/*
 * mach64WaitForDmaCompletion
 * Poll MACH64REG_GUI_STAT until bit 0 of the (SWAPped) value clears or
 * TIMEOUT_USEC microseconds have passed since the first busy poll.
 *
 * Clears mach64glx.dmaActive.  Returns 0 straight away when no dma was
 * marked active or when mach64glx.skipDma is set; otherwise returns the
 * usec() delta between the first and the last busy poll (0 if the very
 * first poll found the engine idle).
 */
#define	TIMEOUT_USEC		1000000

int mach64WaitForDmaCompletion( void )
{
    int         firstBusy = 0;  /* usec() at first busy poll, 0 = not yet set */
    int         now = 0;
    int         elapsed;
    int         spin;

    if ( !mach64glx.dmaActive ) {
        return 0;
    }
    mach64glx.dmaActive = 0;

    if ( mach64glx.skipDma ) {
        return 0;
    }

    for ( ;; ) {
        int status = INREG( MACH64REG_GUI_STAT );

        status = SWAP( status );
        if ( ( status & 0x00000001 ) == 0 ) {
            break;
        }

        now = usec();
        if ( firstBusy == 0 || now < firstBusy ) {
            /* first busy poll, or the usec() counter wrapped: restart */
            firstBusy = now;
        } else if ( now - firstBusy > TIMEOUT_USEC ) {
            mach64Msg( 1, "waitForDmaCompletion timed out\n" );
            break;
        }

        /* burn a few cycles so the register isn't read back-to-back */
        for ( spin = 10000 ; spin > 0 ; spin-- ) {
            delay();
        }
    }

    elapsed = now - firstBusy;

    mach64Msg( 10, "waitForDmaCompletion, usec: %d\n", elapsed );
    if ( MACH64_ISBUSY() ) {
        mach64Msg( 1, "waitForDmaCompletion: still going!\n" );
    }

    return elapsed;
}

/*
 * mach64DmaResetBuffer
 * Point dma_buffer at dmaBuffers[ mach64ActiveDmaBuffer ] and mark it
 * empty by zeroing its descriptor-table and command dword counts.
 * When both mach64DB and mach64Ctx are set, the context is also flagged
 * with MACH64_NEW_CONTEXT.
 */
void mach64DmaResetBuffer( void )
{
    dma_buffer = dmaBuffers[ mach64ActiveDmaBuffer ];
    dma_buffer->tableDwords = 0;
    dma_buffer->bufferDwords = 0;

    /* This is required because each dma buffer is finished by a
     * return to 2d state, as expected by the X server.
     */
    if ( mach64DB && mach64Ctx ) {
        if ( MESA_VERBOSE & VERBOSE_DRIVER ) {
            fprintf( stderr, "needEnter3D and ENTER3D in mach64DmaResetBuffer\n" );
        }
        mach64Ctx->new_state |= MACH64_NEW_CONTEXT;
    }
}

/* Scratch word: memory operand of the xchg in FlushWriteCombining() and
 * destination of the dummy buffer read in mach64FlushRealDma().
 */
int	xchangeDummy;

#ifndef NO_MTRR
/*
 * FlushWriteCombining
 * Called by mach64FlushRealDma() to "make sure any write combining data
 * is flushed" before the descriptor table is built.
 *
 * With USE_X86_ASM defined it issues two 32-bit x86 sequences, each
 * saving and restoring the registers it touches with push/pop:
 *   1. xchg of %eax with xchangeDummy
 *   2. cpuid with %eax = 0
 * Without USE_X86_ASM the body is empty and the call does nothing.
 *
 * NOTE(review): presumably this relies on xchg-with-memory being
 * implicitly locked and on cpuid being a serializing instruction --
 * confirm against the processor manual.
 */
static void FlushWriteCombining( void )
{
#ifdef USE_X86_ASM
    /* swap %eax with xchangeDummy; %eax is preserved around it */
    __asm__ volatile( " push %%eax ; xchg %%eax, %0 ; pop %%eax" : : "m" (xchangeDummy) );
    /* cpuid leaf 0; all four registers it writes are saved and restored */
    __asm__ volatile( " push %%eax ; push %%ebx ; push %%ecx ; push %%edx ; movl $0,%%eax ; cpuid ; pop %%edx ; pop %%ecx ; pop %%ebx ; pop %%eax" : /* no outputs */ :  /* no inputs */ );
#endif
}
#endif

/*
 * mach64FlushRealDma
 * Build the bus-master descriptor table for the commands queued in
 * dma_buffer and start the transfer.  Does nothing when
 * mach64glx.skipDma is set.
 *
 * The buffer is described in DMA_CHUNKSIZE-byte pieces, one four-dword
 * descriptor each (frame buffer offset, system memory address, command,
 * reserved).  Every command word carries 0x40000000; the final
 * descriptor holds the remaining byte count and additionally carries
 * 0x80000000.
 * NOTE(review): the meaning of those two flag bits is not visible in
 * this file -- confirm against the Rage Pro register reference.
 *
 * The function returns as soon as the transfer has been initiated; it
 * does not wait for completion.
 */
#define DMA_CHUNKSIZE	0x1000

/*
 * To start the DMA transfer, we need to initiate a GUI operation.  We can
 * write any value to the register, as it is only used to start the engine.
 */
#define MACH64DMAINITIATE()	OUTREG( MACH64REG_DST_HEIGHT_WIDTH, 0x0000 )

void mach64FlushRealDma( void )
{
    mach64UI32  *table_ptr;
    int         i;
    int         chunks;

    if ( mach64glx.skipDma ) {
        return;
    }

    /* number of descriptors needed, rounding the byte count up */
    chunks = ( dma_buffer->bufferDwords * 4 + DMA_CHUNKSIZE - 1 ) / DMA_CHUNKSIZE;
    mach64Msg( 19, "mach64FlushRealDma() %d dwords, %d entries\n",
               dma_buffer->bufferDwords, chunks );

    /* GTH: Do we really need this here? */
    MACH64_WAITFREE();

#ifndef	NO_MTRR
    /* make sure any write combining data is flushed */
    FlushWriteCombining();
#endif

    /* if we are using a card memory buffer, do a read to
       guarantee the store buffer is flushed */
    xchangeDummy = dma_buffer->virtualBuffer[dma_buffer->bufferDwords];

    table_ptr = dma_buffer->virtualTable;
    dma_buffer->tableDwords = 0;

    /* generate the descriptors for the full 4k chunks */
    for ( i = 0 ; i < chunks-1 ; i++ ) {
        table_ptr[DMA_FRAME_BUF_OFFSET] = MACH64REG_BM_ADDR + 0x7ff800;
        table_ptr[DMA_SYS_MEM_ADDR] =
            dma_buffer->physicalBuffer + i * DMA_CHUNKSIZE;
        table_ptr[DMA_COMMAND] = DMA_CHUNKSIZE | 0x40000000;
        table_ptr[DMA_RESERVED] = 0;

        dma_buffer->tableDwords += 4;
        table_ptr += 4;
    }

    /* generate the final descriptor for any remaining commands;
       i now indexes the last chunk */
    table_ptr[DMA_FRAME_BUF_OFFSET] = MACH64REG_BM_ADDR + 0x7ff800;
    table_ptr[DMA_SYS_MEM_ADDR] =
        dma_buffer->physicalBuffer + i * DMA_CHUNKSIZE;
    table_ptr[DMA_COMMAND] =
        ( (dma_buffer->bufferDwords * 4 - i * DMA_CHUNKSIZE )
          | 0x80000000 | 0x40000000 );
    table_ptr[DMA_RESERVED] = 0;

    dma_buffer->tableDwords += 4;

    mach64Msg( 19, "  table entries: %d buffer cmds: %d\n",
               dma_buffer->tableDwords / 4, dma_buffer->bufferDwords / 2 );

    /* Dump the table.  The entries are integers, so convert them to the
       types the %p and %x conversions actually expect. */
    for ( i = 0 ; i < dma_buffer->tableDwords / 4 ; i++ ) {
        mach64Msg( 19, "    entry: %d addr: %p cmd: 0x%x\n", i,
                   (void *) (unsigned long) dma_buffer->virtualTable[4*i+1],
                   (unsigned int) dma_buffer->virtualTable[4*i+2] );
    }

    /* enable bus mastering */
    OUTREG( MACH64REG_BUS_CNTL,
            ( INREG( MACH64REG_BUS_CNTL ) & BUS_master_MASK )
            | BUS_master_enable );

    /* FIXME: Add 16k circular buffer size (0x0) to register header */
    OUTREG( MACH64REG_BM_GUI_TABLE, dma_buffer->physicalTable | 0x0 );

    OUTREG( MACH64REG_SRC_CNTL, INREG( MACH64REG_SRC_CNTL ) |
            SRC_bm_enable | SRC_bm_gui_sync | SRC_bm_op_sysmem_to_reg );

    /* initiate the transfer */
    MACH64DMAINITIATE();
#if 0
    /* FIXME: GTH - This is an ugly hack */
    MACH64_WAITFREE();

    /* disable bus mastering transfers */
    OUTREG( MACH64REG_BUS_CNTL,
            ( INREG( MACH64REG_BUS_CNTL ) & BUS_master_MASK )
            | BUS_master_disable );

    OUTREG( MACH64REG_SRC_CNTL,
            ( INREG( MACH64REG_SRC_CNTL ) & SRC_bm_MASK )
            | SRC_bm_disable );
#endif
}

/*
 * mach64ServerDmaFlush
 * Send all pending commands off to the hardware.
 * If we are running async, the hardware will be drawing
 * while we return to do other things.
 *
 * wait: non-zero to block until the hardware has finished.
 *
 * An empty buffer is simply reset in place (after waiting, if asked).
 * Otherwise the previous transfer is waited for, the saved 2D register
 * state from mach64glx.registers[] is appended to the buffer, and the
 * buffer is handed to the pseudo-dma path (mach64glx.dmaDriver < 2) or
 * the real dma path.  dmaDriver == 2 always waits for completion;
 * dmaDriver == 3 skips the timing measurement.  Finally the active
 * buffer index is toggled and the new buffer is reset.
 */
void mach64ServerDmaFlush( int wait )
{
    int         start = 0;
    int         end = 0;

    /* if the buffer is empty, just change in place */
    if ( !dma_buffer->bufferDwords ) {
        if ( wait ) {
            mach64WaitForDmaCompletion();
        }
        mach64DmaResetBuffer();
        return;
    }

    mach64glx.c_dmaFlush++;

    /* wait for the last buffer to complete; a zero return means there
       was nothing to wait for */
    if ( !mach64WaitForDmaCompletion() ) {
        mach64glx.hardwareWentIdle = 1;
    }

    /* Add the commands at the end of the buffer to go back to
     * drawing on the front buffer the way the X server expects.
     */
    {
        DMALOCALS;
        int             old;

        /* allow these to go into the overflow safety zone */
        old = dma_buffer->overflowBufferDwords;
        dma_buffer->overflowBufferDwords = dma_buffer->maxBufferDwords;

        DMAGETPTR( 60 );

        DMAOUTREG( MACH64REG_DST_OFF_PITCH, mach64glx.registers[MACH64REG_DST_OFF_PITCH>>2] );
        DMAOUTREG( MACH64REG_SRC_OFF_PITCH, mach64glx.registers[MACH64REG_SRC_OFF_PITCH>>2] );
        DMAOUTREG( MACH64REG_DP_SRC, mach64glx.registers[MACH64REG_DP_SRC>>2] );
        DMAOUTREG( MACH64REG_DP_MIX, mach64glx.registers[MACH64REG_DP_MIX>>2] );
        DMAOUTREG( MACH64REG_DP_FRGD_CLR, mach64glx.registers[MACH64REG_DP_FRGD_CLR>>2] );
        DMAOUTREG( MACH64REG_DP_WRITE_MASK, mach64glx.registers[MACH64REG_DP_WRITE_MASK>>2] );
        DMAOUTREG( MACH64REG_DP_PIX_WIDTH, mach64glx.registers[MACH64REG_DP_PIX_WIDTH>>2] );
        DMAOUTREG( MACH64REG_Z_CNTL, mach64glx.registers[MACH64REG_Z_CNTL>>2] );
        DMAOUTREG( MACH64REG_CLR_CMP_CNTL, mach64glx.registers[MACH64REG_CLR_CMP_CNTL>>2] );
        /* restore from this register's own saved slot, like every other
           register in the list (was restored from the CLR_CMP_CNTL slot) */
        DMAOUTREG( MACH64REG_ALPHA_TST_CNTL, mach64glx.registers[MACH64REG_ALPHA_TST_CNTL>>2] );
        DMAOUTREG( MACH64REG_GUI_TRAJ_CNTL, mach64glx.registers[MACH64REG_GUI_TRAJ_CNTL>>2] );
        DMAOUTREG( MACH64REG_SCALE_3D_CNTL, mach64glx.registers[MACH64REG_SCALE_3D_CNTL>>2] );
        DMAOUTREG( MACH64REG_SETUP_CNTL, mach64glx.registers[MACH64REG_SETUP_CNTL>>2] );
        /* can't use the composite registers, because they are write only and the save was wrong */
        DMAOUTREG( MACH64REG_SC_LEFT, mach64glx.registers[MACH64REG_SC_LEFT>>2] );
        DMAOUTREG( MACH64REG_SC_RIGHT, mach64glx.registers[MACH64REG_SC_RIGHT>>2] );
        DMAOUTREG( MACH64REG_SC_TOP, mach64glx.registers[MACH64REG_SC_TOP>>2] );
        DMAOUTREG( MACH64REG_SC_BOTTOM, mach64glx.registers[MACH64REG_SC_BOTTOM>>2] );
        /* these should terminate the dma, so they should be last */
        DMAOUTREG( MACH64REG_BUS_CNTL, mach64glx.registers[MACH64REG_BUS_CNTL>>2] );
        DMAOUTREG( MACH64REG_SRC_CNTL, mach64glx.registers[MACH64REG_SRC_CNTL>>2] );

        DMAADVANCE();

        dma_buffer->overflowBufferDwords = old;
    }

    /* collect timing information if we are going synchronously;
       otherwise start and end both stay 0 */
    if ( mach64glx.dmaDriver != 3 ) {
        start = usec();
    }

    /* we will have to wait before doing any software rendering */
    mach64glx.dmaActive = 1;

    if ( mach64glx.dmaDriver < 2 ) {
        mach64FlushPseudoDma();
    } else {
        mach64FlushRealDma();
    }

    if ( ( mach64glx.dmaDriver == 2 ) || wait ) {
        /* wait until the dma completes */
        mach64WaitForDmaCompletion();
    }

    if ( mach64glx.dmaDriver != 3 ) {
        end = usec();
    }

    mach64Msg( 9, "flushmode %i, buffer %i: dwords:%i  usec:%i\n",
               mach64glx.dmaDriver,  mach64ActiveDmaBuffer,
               dma_buffer->bufferDwords, end - start );

    /* swap to using the other buffer */
    mach64ActiveDmaBuffer ^= 1;

    mach64DmaResetBuffer();
}

/*
 * mach64DmaFlush
 * Flush the pending commands through the installed flush handler
 * without asking it to wait for the hardware.
 */
void mach64DmaFlush( void )
{
    const int   noWait = 0;

    mach64DoDmaFlush( noWait );
}

/*
 * mach64DmaFinish
 * Flush the pending commands through the installed flush handler and
 * ask it to wait for the hardware.  Counts one draw wait in
 * mach64glx.c_drawWaits each time it actually has to flush.
 */
void mach64DmaFinish( void )
{
    int         nothingPending;

    /* This runs for every software rendered scanline, so bail out
       right away when no dma is active and the buffer is empty. */
    nothingPending = !( mach64glx.dmaActive || dma_buffer->bufferDwords );
    if ( nothingPending ) {
        return;
    }

    /* recorded for the performance block display */
    mach64glx.c_drawWaits++;

    mach64DoDmaFlush( 1 );
}


/*
 * mach64DmaOverflow
 * This is called when DMAGETPTR is at the end of the buffer.
 *
 * newDwords: size of the request that did not fit.
 *
 * Flushes the current buffer, re-validates the context state if there
 * is a current context, bumps mach64glx.c_overflows, and raises a
 * FatalError if the request is larger than the buffer's
 * maxBufferDwords.
 */
void mach64DmaOverflow( int newDwords )
{
    mach64Msg( 9, "mach64DmaOverflow(%i)\n", newDwords );

    /* push out everything queued so far; afterwards dma_buffer is an
       empty buffer again */
    mach64DmaFlush();

    /* An overflow can hit at any point, so the usual state update
     * path can't be relied on: force a full context re-emit now.
     */
    if ( mach64Ctx ) {
        mach64Ctx->new_state |= MACH64_NEW_CONTEXT;
        mach64DDUpdateState( mach64Ctx->gl_ctx );
    }

    mach64glx.c_overflows += 1;

    if ( dma_buffer->maxBufferDwords < newDwords ) {
        FatalError("mach64DmaOverflow > maxBufferDwords");
    }
}


/*
 * mach64FlushPseudoDma
 * Hand feed a dma buffer to the card instead of using DMA
 */
static void mach64FlushPseudoDma( void )
{
    mach64UI32	*src;
    int		i;
    int		count;
    int		reg, data;
    int		startTime, t;

    count = dma_buffer->bufferDwords;

    mach64Msg( 20, "primary pseudoDma: %i dwords\n", count );

    mach64glx.hardwareWentIdle = 1;

    if ( mach64glx.skipDma ) {
    	return;
    }

    /* hand feed each register to the card */
    startTime = usec();

    src = dma_buffer->virtualBuffer;

    for ( i = 0 ; i < count ; i+= 2 )
    {
 	reg = src[i];
 	data = src[i+1];

     	if ( !(i & 15) ) {
	    while ( INREG( MACH64REG_FIFO_STAT ) & 0x0000ffff ) {
		t = usec();
		if ( t - startTime > 1000000 ) {
		    FatalError( "mach64FlushPseudoDma timed out at dword %i of %i", i, count );
		}
	    }
     	}

     	OUTREG( MMSELECT( reg ), SWAP(data) );
    }

}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * End:
 */
