/* $Id: mach64dmainit.c,v 1.48 2000/03/26 06:06:39 gareth Exp $ */

/*
 * GLX Hardware Device Driver for ATI Rage Pro
 * Copyright (C) 1999 Gareth Hughes
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Based on MGA driver: mgadmainit.c by John Carmack
 *
 *    Gareth Hughes <gareth@precisioninsight.com>
 */

/*
 * This file is only entered at startup.  After mach64InitGLX completes,
 * nothing here will be executed again.
 *
 * We need to:
 *
 * make sure we have a chipset we can render with
 *
 * determine the current resolution and color depth that the
 * X server is running in,
 *
 * set up our dma / agp memory
 *
 * determine how we will be communicating with the card: dma or pseudo dma.
 */

#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>


#include "xsmesaP.h"
#include "glx_config.h"
#include "glx_init.h"

#define GC XXGC
#include "gcstruct.h"
#include "pixmapstr.h"
#include "servermd.h" /* PixmapBytePad */
#include "scrnintstr.h"
#include "regionstr.h"
#include "windowstr.h"
#undef GC

#include "glx_symbols.h"

#include "mach64glx.h"
#include "hw_mtrr.h"
#include "hw_agp.h"

#ifdef HAVE_LINUX_NEWAGP
/* AGP kernel module interface */
#include <linux/agpgart.h>
#include <sys/ioctl.h>
#endif

#if defined(USE_X86_ASM)
#include "X86/common_x86asm.h"
#endif

/*

GLX_MACH64_DMA=0	: virtual buffer, pseudo dma
GLX_MACH64_DMA=1	: physical buffer, pseudo dma
GLX_MACH64_DMA=2	: physical buffer, real dma
GLX_MACH64_DMA=3	: physical buffer, real dma, async

*/

/* heap managing the card's video memory; created by mmInit() in mach64InitGLX */
memHeap_t	*cardHeap;

/* system memory block description (not referenced elsewhere in this part of the file) */
hwUI32		sysmemBytes;		/* size of memory block */
hwUI32		sysmemPhysical;		/* 0 if we don't have a physical mapping */
unsigned char	*sysmemVirtual;


#ifdef HAVE_LINUX_NEWAGP
/* This is way too simple for now, we should probably
 * write better support for the new agp module
 * but this is okay for now I suppose;
 */
int		gartfd;		/* descriptor for GLX_AGPDEVICE, opened by AllocateGARTMemory */
void		*gartbuf;	/* mmap of the agp aperture, set by AllocateGARTMemory */

#define GLX_AGPDEVICE		AGP_DEVICE
#define GLX_AGP_INFO_IOCTL	AGPIOC_INFO
#define MAX_AGP_KEYS		128	/* capacity of agp_entries[] */
agp_info	mach64agp;	/* filled in by the AGPIOC_INFO ioctl */
agp_allocate	agp_entries[MAX_AGP_KEYS];	/* one slot per AGPIOC_ALLOCATE request */
#endif


/* private vars */
/* the two command buffers set up by AllocateCommandBuffers */
mach64Dma_buffer	*dmaBuffers[2];

/* declared here, not defined in this part of the file */
void mach64DmaResetBuffer( void );


/*=============================================*/

/*=============================================*/

/*
 * DmaBenchmark
 * Queue `dwords` dwords worth of MACH64_Z_CNTL register writes (rounded
 * down to a multiple of 32 dwords), then time the mach64DmaFinish() call
 * that follows and log the resulting transfer rate.
 *
 * Only the final mach64DmaFinish() is inside the timed region.
 */
static void DmaBenchmark( int dwords ) {
	int	start, end;
	int	mb;
	int	j, k;
	float	fsec;
	DMALOCALS;

	/* start from an idle engine so earlier work isn't measured */
	mach64DmaFinish();

	/* round down to multiple of 32 dwords */
	dwords = dwords & (~31);

	/* each pass reserves 32 dwords and emits 16 register writes */
	for ( j = 0 ; j < dwords / 32 ; j++ ) {
		DMAGETPTR( 32 );

		for ( k = 0 ; k < 16 ; k++ ) {
			DMAOUTREG( MACH64_Z_CNTL, 0 );
		}

		DMAADVANCE();
	}

	start = usec();

	mach64DmaFinish();

	end = usec();

	fsec = ( end - start ) / 1000000.0;

	/* A zero (timer resolution) or negative (timer wrap) interval
	 * cannot produce a meaningful rate; dividing by it and converting
	 * the result to int would be undefined, so report 0 instead. */
	if ( fsec > 0 ) {
		mb = ( (float)dwords * 4 / 0x100000 ) / fsec;
	} else {
		mb = 0;
	}

	hwMsg( 1, "DmaBenchmark 0x%x bytes, %5.3f sec: %i mb/s\n",
	       dwords*4, fsec, mb );
}

/*=============================================*/

/*
 * AllocateScatteredMemory
 * Create a file of the needed size that will be memory mapped
 * into both the client and server address spaces.
 *
 * The server creates the file and extends it to the full size; a
 * direct rendering client opens the file the server created.  Both
 * then mmap it shared and mlock the mapping.  Any failure is fatal.
 *
 * Uses:
 * mach64glx.dmaSize		(in megabytes)
 *
 * Sets:
 * mach64glx.dmaMemory
 * mach64glx.memoryFileName	(server only)
 * mach64glx.dmaMemoryFd
 *
 * NOTE(review): the file name is a fixed path under /tmp and is opened
 * with O_CREAT but without O_EXCL -- worth revisiting if this can run
 * with elevated privileges.
 */
static void AllocateScatteredMemory( void ) {
	size_t	dmaBytes = (size_t)mach64glx.dmaSize * 1024 * 1024;

	if ( __glx_is_server ) {
		const char	*name = "/tmp/glxmemory";
		char		junk = 0;

		hwMsg( 1, "Using memory file: %s\n", name );
		strcpy( mach64glx.memoryFileName, name );

		/* create a file of the needed size */
		mach64glx.dmaMemoryFd = open( mach64glx.memoryFileName,
					      O_RDWR | O_CREAT , 0600 );
		if ( mach64glx.dmaMemoryFd == -1 ) {
			FatalError( "Couldn't open %s", mach64glx.memoryFileName );
		}

		/* Extend the file by writing its last byte.  If this fails
		 * the file is shorter than the mapping made below and
		 * touching the missing part would fault, so check both calls. */
		if ( lseek( mach64glx.dmaMemoryFd, (off_t)( dmaBytes - 1 ),
			    SEEK_SET ) == (off_t)-1 ||
		     write( mach64glx.dmaMemoryFd, &junk, 1 ) != 1 ) {
			FatalError( "Couldn't extend %s to %i megs",
				    mach64glx.memoryFileName, mach64glx.dmaSize );
		}
	} else {
		/* open the file, which was created by the server */
		hwMsg( 1, "Using memory file: %s\n", mach64glx.memoryFileName );
		mach64glx.dmaMemoryFd = open( mach64glx.memoryFileName, O_RDWR, 0600 );
		if ( mach64glx.dmaMemoryFd == -1 ) {
			FatalError( "Couldn't open %s", mach64glx.memoryFileName );
		}
	}

	/* memory map the file, which we will use for our dma memory */
	mach64glx.dmaMemory = mmap( NULL, dmaBytes,
				    PROT_READ | PROT_WRITE,  MAP_SHARED, mach64glx.dmaMemoryFd, 0 );

	if ( mach64glx.dmaMemory == MAP_FAILED ) {
		FatalError( "mmap of glx memory buffer failed" );
	}

	/* lock it down */
	if ( mlock( mach64glx.dmaMemory, dmaBytes ) == -1 ) {
		/* we could fall back to pseudo dma if we wanted to */
		FatalError( "Couldn't mlock scattered memory" );
	}

	/* %p rather than %x: the argument is a pointer */
	hwMsg( 1, "%i megs of scattered memory at virtual %p\n",
	       mach64glx.dmaSize, (void *)mach64glx.dmaMemory );
}

/*
 * LocateScatteredMemory
 * Do evil things to find the physical memory address of a
 * block of virtual memory.
 * This probably won't work properly on alpha.
 * It may not work properly on ppc.
 *
 * Every PAGE_SIZE chunk of mach64glx.dmaMemory is stamped with a marker
 * and its own index, then /dev/mem is scanned page by page for the
 * markers.  A candidate is confirmed by changing the marker through the
 * mapping and re-reading it through /dev/mem.  The physical address of
 * each page is stored in mach64glx.memoryRemapping[], which this
 * function allocates.  Failing to locate every page is fatal.
 */
#define	PAGE_SIZE			4096		// to ragepro, ok to be < system
static void LocateScatteredMemory( void ) {
	int		numPages;
	int		i;
	int		*vm;
	int		magic1, magic2;
	int		remaining;
	int		memFd;
	int		test[2];
	int		page;
	unsigned long	physical;

	/* it doesn't matter if this is smaller than
	 * the actual page size */
	numPages = ( mach64glx.dmaSize * 0x100000 ) / PAGE_SIZE;
	vm = (int *)mach64glx.dmaMemory;

	hwMsg( 1, "Locating %i pages in VM\n", numPages );

	/* allocate a zeroed table to hold the remappings; a zero entry
	 * means "not located yet" */
	mach64glx.memoryRemapping =
		calloc( numPages, sizeof( mach64glx.memoryRemapping[0] ) );
	if ( !mach64glx.memoryRemapping ) {
		FatalError( "Couldn't allocate the page remapping table" );
	}

	/* create two different numbers that will be different each
	 * time the process is run */
	magic1 = usec();
	magic2 = magic1 ^ 0x12345678;

	/* fill our buffer with some markers we can find */
	for ( i = 0 ; i < numPages ; i++ ) {
		vm[i*(PAGE_SIZE>>2)] = magic1;
		vm[i*(PAGE_SIZE>>2)+1] = i;
	}

	/* scan /dev/mem looking for our markers */
	memFd = open( "/dev/mem", O_RDONLY );
	if ( memFd == -1 ) {
		FatalError( "Couldn't open /dev/mem" );
	}
	remaining = numPages;

	for ( i = 1 ; i < 0x100000 ; i++ ) {
		/* computed unsigned: i * PAGE_SIZE overflows a signed int
		 * once the scan reaches 2GB */
		physical = (unsigned long)i * PAGE_SIZE;

		if ( lseek( memFd, (off_t)physical, SEEK_SET ) == (off_t)-1 ) {
			hwMsg( 1, "seek in /dev/mem failed at 0x%lx\n", physical );
			break;
		}
		if ( read( memFd, &test, sizeof( test ) ) != (ssize_t)sizeof( test ) ) {
			hwMsg( 1, "read of /dev/mem failed at 0x%lx\n", physical );
			break;
		}
		if ( test[0] != magic1 ) {
			continue;
		}
		page = test[1];
		if ( page < 0 || page >= numPages ) {
			hwMsg( 1, "found magic1, but page = %i\n", page );
			continue;
		}
		/* now change the value and see if it changes in the mmap
		 * to make extra sure we are talking about the same page */
		vm[page*(PAGE_SIZE>>2)] = magic2;
		if ( lseek( memFd, (off_t)physical, SEEK_SET ) == (off_t)-1 ) {
			hwMsg( 1, "reseek in /dev/mem failed at 0x%lx\n", physical );
			break;
		}
		if ( read( memFd, &test, sizeof( test ) ) != (ssize_t)sizeof( test ) ) {
			hwMsg( 1, "reread of /dev/mem failed at 0x%lx\n", physical );
			break;
		}
		if ( test[0] != magic2 ) {
			hwMsg( 1, "magic2 failed\n" );
			/* False positive.  Put the first marker back so the
			 * real page can still be recognised later in the
			 * scan (unless it has already been located). */
			if ( !mach64glx.memoryRemapping[ page ] ) {
				vm[page*(PAGE_SIZE>>2)] = magic1;
			}
			continue;
		}
		if ( mach64glx.memoryRemapping[ page ] ) {
			FatalError( "Found a page twice" );
		}
		hwMsg( 19, "virtual page 0x%x found at physical page 0x%x\n", page, i );
		mach64glx.memoryRemapping[ page ] = physical;
		remaining--;
		if ( !remaining ) {
			break;
		}
	}
	close( memFd );

	if ( remaining ) {
		FatalError( "Didn't find %i pages", remaining );
	}
}


/*
 * AllocateCommandBuffers
 */
#define	OVERFLOW_DWORDS	96

static void AllocateCommandBuffers( void ) {
	int		commandBytes;
	int		descriptorBytes;

	/* decide how big the descriptor table and each command buffer will be */
	descriptorBytes = 16384;
	commandBytes = ( ( mach64glx.dmaSize * 0x100000 ) - descriptorBytes ) / 2;

	/* put the descriptor table first */
	mach64glx.descriptorPhysical = mach64glx.memoryRemapping[0];
	mach64glx.descriptorMemory = mach64glx.dmaMemory;
	mach64glx.maxDescriptors = descriptorBytes / 16;
	hwMsg( 1, "descriptorMemory: 0x%08x\n", mach64glx.descriptorMemory );

	/* always leave enough room for register restores after overflow checks */

	/* setup the two buffers that will be ping-ponged */
	dmaBuffers[0] = malloc( sizeof(mach64Dma_buffer) );
	memset( dmaBuffers[0], '\0', sizeof(mach64Dma_buffer) );

	dmaBuffers[0]->virtualBuffer = (hwUI32 *)mach64glx.dmaMemory + descriptorBytes / 4;
	dmaBuffers[0]->maxBufferDwords = commandBytes / 4;
	dmaBuffers[0]->overflowBufferDwords = dmaBuffers[0]->maxBufferDwords - OVERFLOW_DWORDS;

	dmaBuffers[1] = malloc( sizeof(mach64Dma_buffer) );
	memset( dmaBuffers[1], '\0', sizeof(mach64Dma_buffer) );

	dmaBuffers[1]->virtualBuffer = (hwUI32 *)mach64glx.dmaMemory + commandBytes / 4 + descriptorBytes / 4;
	dmaBuffers[1]->maxBufferDwords = commandBytes / 4;
	dmaBuffers[1]->overflowBufferDwords = dmaBuffers[1]->maxBufferDwords - OVERFLOW_DWORDS;

	hwMsg( 1, "dmaBuffers[]->maxBufferDwords = %i\n", dmaBuffers[0]->maxBufferDwords );
	hwMsg( 1, "dmaBuffers[0]->Buffer virt: 0x%08x\n", dmaBuffers[0]->virtualBuffer );
	hwMsg( 1, "dmaBuffers[1]->Buffer virt: 0x%08x\n", dmaBuffers[1]->virtualBuffer );

	mach64DmaResetBuffer();

	/* make sure the memory is read/write */
	mach64glx.dma_buffer->virtualBuffer[0] = 12;
	if ( mach64glx.dma_buffer->virtualBuffer[0] != 12 ) {
		FatalError( "Dma buffer isn't read/write!" );
	}
}



/*=============================================*/

#ifdef HAVE_LINUX_NEWAGP

/* number of pages requested per agp_entries[] slot */
#define GART_ENTRY_PAGES	1024

/*
 * FreeGARTEntries
 * Hand the first `count` slots of agp_entries[] back to the agp module.
 */
static void FreeGARTEntries( int count )
{
	int	m;

	for ( m = 0 ; m < count ; m++ ) {
		int	key = agp_entries[m].key;

		ioctl( gartfd, AGPIOC_DEALLOCATE, key );
	}
}

/*
 * AllocateGARTMemory
 *
 * FIXME: No longer needed, but will keep here for a while anyway.
 *
 * Open the agp device, acquire it, and map the aperture into gartbuf.
 * A direct rendering client stops there.  The server additionally
 * programs the agp mode, then allocates `size` bytes (rounded up to
 * 4096 byte pages) in chunks of GART_ENTRY_PAGES pages, binding the
 * chunks back to back from page 0 of the aperture, and finally
 * releases the agp module.
 *
 * Returns 0 on success and -1 on failure.  On failure everything that
 * was allocated here is given back and the device is unmapped and
 * closed.  A failing release is logged but still returns 0.
 */
static int AllocateGARTMemory( size_t size )
{
	int		pages = (size + 4095) / 4096;
	int		fullEntries = pages / GART_ENTRY_PAGES;
	int		tailPages = pages % GART_ENTRY_PAGES;
	int		entries = fullEntries + ( tailPages ? 1 : 0 );
	int		k;
	int		mode_mask;
	size_t		mapBytes;
	agp_setup	mode_setup;

	gartfd = open( GLX_AGPDEVICE, O_RDWR );
	if ( gartfd == -1 ) {
		hwMsg( 1, "unable to open " GLX_AGPDEVICE ": %s\n", strerror( errno ) );
		return -1;
	}

	if ( ioctl( gartfd, AGPIOC_ACQUIRE ) != 0 ) {
		hwMsg( 1, "error acquiring agp module: %s\n", strerror( errno ) );
		close( gartfd );
		return -1;
	}

	if ( ioctl( gartfd, AGPIOC_INFO, &mach64agp ) != 0 ) {
		hwMsg( 1, "error doing AGP info ioctl: %s\n", strerror( errno ) );
		hwMsg( 1, "first attempt\n" );
		close( gartfd );
		return -1;
	}

	/* remembered separately: mach64agp is refreshed again below */
	mapBytes = mach64agp.aper_size * 0x100000;
	gartbuf = mmap( NULL, mapBytes, PROT_READ | PROT_WRITE, MAP_SHARED, gartfd, 0 );
	if ( gartbuf == MAP_FAILED ) {
		hwMsg( 1, "mmap() on " GLX_AGPDEVICE " failed: %s\n", strerror( errno ) );
		close( gartfd );
		return -1;
	}

	/* only the server sets the mode and allocates memory */
	if ( !__glx_is_server ) {
		return 0;
	}

	/* This should be table driven for what agp mode registers we know to work.
	 * Currently it just sets whatever is there which is not right.
	 */
	if ( !( mode_mask = glx_getint_secure( "mach64_gart_mode_mask" ) ) ) {
		hwMsg( 1, "no mach64_gart_mode_mask defined: using mode 1\n" );
		mode_mask = 1;
	}

	mode_setup.agp_mode = (mach64agp.agp_mode & ~7) | (mach64agp.agp_mode & mode_mask);

	if( ioctl( gartfd, AGPIOC_SETUP, &mode_setup ) != 0 ) {
		hwMsg(1, "Error initializing AGP point to point connection\n");
		goto fail;
	}

	/* Call information function a second time for the agp mode */
	if ( ioctl( gartfd, AGPIOC_INFO, &mach64agp ) != 0 ) {
		hwMsg( 1, "error doing AGP info ioctl: %s\n", strerror( errno ) );
		hwMsg( 1, "second attempt\n" );
		goto fail;
	}

	/* never index past the end of agp_entries[] */
	if ( entries > MAX_AGP_KEYS ) {
		hwMsg( 1, "AGPGART: %i pages need more than %i allocations\n",
		       pages, MAX_AGP_KEYS );
		goto fail;
	}

	memset( agp_entries, 0, sizeof(agp_allocate) * MAX_AGP_KEYS );

	/* full chunks first, then (if any) one shorter tail chunk */
	for ( k = 0 ; k < entries ; k++ ) {
		agp_allocate	*entry = agp_entries + k;
		agp_bind	bind;

		entry->pg_count = ( k < fullEntries ) ? GART_ENTRY_PAGES : tailPages;
		entry->type = 0;

		if ( ioctl( gartfd, AGPIOC_ALLOCATE, entry ) ) {
			/* free previous pages */
			FreeGARTEntries( k );
			hwMsg( 1, "AGPGART: allocation of %i pages failed\n", pages );
			goto fail;
		}

		/* chunk k starts right after the k full chunks before it,
		 * so the tail chunk lands at fullEntries * GART_ENTRY_PAGES
		 * and leaves no hole in the aperture */
		bind.key = entry->key;
		bind.pg_start = k * GART_ENTRY_PAGES;

		if ( ioctl( gartfd, AGPIOC_BIND, &bind ) ) {
			/* free previous pages and the chunk just allocated */
			FreeGARTEntries( k + 1 );
			hwMsg( 1, "AGPGART: bind of %i pages failed\n", pages );
			goto fail;
		}
	}

	if ( ioctl( gartfd, AGPIOC_RELEASE ) != 0 ) {
		hwMsg( 1, "error releasing agp module: %s\n", strerror( errno ) );
		hwMsg( 1, "direct rendering will not work.\n" );
		return 0;
	}

	return 0;

 fail:
	/* undo the mapping and give the device back */
	munmap( gartbuf, mapBytes );
	gartbuf = NULL;
	close( gartfd );
	gartfd = -1;
	return -1;
}

#endif /* HAVE_LINUX_NEWAGP */

/*
 * InitTextureMemory
 * Decide where textures live.
 *
 * When built with HAVE_LINUX_NEWAGP and mach64glx.dmaDriver >= 3, try
 * to initialize agp memory (mode mask 2 for dmaDriver 4, otherwise 1)
 * and, on the server, program the card's agp aperture registers.
 *
 * If agp is not requested, not available, or reports an aperture size
 * with no known register encoding, dmaDriver is clamped to at most 2
 * and the server clears the agp aperture registers.
 *
 * Returns 1 when agp texture memory is in use, 0 when falling back to
 * local card memory.
 */
static int InitTextureMemory( void )
{
#ifdef HAVE_LINUX_NEWAGP
	if ( mach64glx.dmaDriver >= 3 ) {
		agp_info	info;
		int		size;
		int		mode_mask;

		mode_mask = ( mach64glx.dmaDriver == 4 ) ? 2 : 1;

		/* try agp memory - limit aperture to 16 megs */
		if ( hwInitAGPMem( mode_mask, 16 ) == 0 ) {

			/* get information about the agp aperture */
			hwGetAGPInfo( &info );

			/* register encoding of the aperture size (in megs) */
			switch ( info.aper_size ) {
			case 4:
				size = 0x3f; break;
			case 8:
				size = 0x3e; break;
			case 16:
				size = 0x3c; break;
			case 32:
				size = 0x38; break;
			case 64:
				size = 0x30; break;
			case 128:
				size = 0x20; break;
			case 256:
				size = 0x00; break;
			default:
				/* no encoding known: never program the
				 * register from an uninitialized value */
				size = -1; break;
			}

			if ( size >= 0 ) {
				if ( __glx_is_server ) {
					/* set up the agp aperture */
					OUTREG( MACH64_AGP_BASE, info.aper_base );
					OUTREG( MACH64_AGP_CNTL,
						( size << 0 )		/* aperture size */
						| ( 0 << 8 )		/* num idle clks */
						| ( 1 << 16 )		/* 1 = high priority read */
						| ( 1 << 17 ) );	/* 1 = TRDY mode */
				}

				/* integer arguments, so integer conversions */
				hwMsg( 1, "AGP aperture: 0x%lx size: 0x%lx code: 0x%x\n",
				       (unsigned long)info.aper_base,
				       (unsigned long)info.aper_size * 0x100000,
				       (unsigned)size );
				return 1;
			}

			/* NOTE(review): agp memory set up by hwInitAGPMem()
			 * is left as it is here -- confirm whether it
			 * needs to be torn down. */
			hwMsg( 1, "unsupported AGP aperture size %lu megs, using card heap\n",
			       (unsigned long)info.aper_size );
		} else {
			/* fall back to local card memory for textures */
			hwMsg( 1, "hwInitAGPMem() failed, using card heap\n" );
		}
	}
#endif  /* HAVE_LINUX_NEWAGP */

	/* ensure we use local card memory for textures */
	if ( mach64glx.dmaDriver > 2 ) {
		mach64glx.dmaDriver = 2;
	}

	if ( __glx_is_server ) {
		/* clean up the agp aperture */
		OUTREG( MACH64_AGP_BASE, 0 );
		OUTREG( MACH64_AGP_CNTL, 0 );
	}

	return 0;
}


/*=============================================*/

/*
 * delay
 * Does nothing; called repeatedly from the polling loop below to
 * space out register reads.
 */
static void delay( void )
{
}

/*
 * WaitForDmaProbeCompletion
 * Poll MACH64_GUI_STAT until bit 0 clears or roughly TIMEOUT_USEC
 * microseconds have gone by since the first poll that saw it set.
 *
 * Returns 1 if bit 0 was clear on the last read, 0 if the wait timed
 * out with it still set.
 */
#define	TIMEOUT_USEC		1000000

static int WaitForDmaProbeCompletion( void )
{
	int	firstBusy = 0;	/* 0 means the timer has not been started */
	int	now = 0;
	int	status;
	int	spin;

	for ( ;; ) {
		status = SWAP( INREG( MACH64_GUI_STAT ) );
		if ( ( status & 0x00000001 ) == 0 ) {
			break;
		}

		now = usec();
		if ( firstBusy == 0 || now < firstBusy ) {
			/* first busy poll, or the clock wrapped: (re)start timing */
			firstBusy = now;
		} else if ( now - firstBusy > TIMEOUT_USEC ) {
			hwMsg( 10, "dma probe timed out...\n" );
			break;
		}

		/* spin in place a bit so we aren't hammering the register */
		for ( spin = 0 ; spin < 10000 ; spin++ ) {
			delay();
		}
	}

	/* the last value read decides whether the probe failed */
	return ( status & 0x00000001 ) ? 0 : 1;
}


/*
 * ProbeDmaSystem
 * Verify that real dma will actually work.  If it fails, fall back to
 * pseudo dma and dump info about current state.
 *
 * Server only; returns immediately in a direct rendering client.
 * Two transfers are attempted: a short one whose effect on
 * MACH64_PAT_REG0 is read back, then a longer one that only has to
 * complete.  On failure the engine state is dumped, the engine is
 * reset, mach64glx.dmaDriver is set to 0 and the dma buffer is reset.
 */
static void ProbeDmaSystem( void )
{
	int	i, j;
	int	readback;
	DMALOCALS;

	/* only probe the system on the server side */
	if ( !__glx_is_server ) {
		return;
	}

	hwMsg( 1, "probing dma system...\n" );

	/* clear the register so we can tell if dma actually functioned */
	/* I would like to use the scratch registers, but they don't go */
	/* through the fifo, so they can't be set by dma.  The pattern */
	/* register should be safe to use... */

	OUTREG( MACH64_PAT_REG0, 0x11111111 );
	if ( INREG( MACH64_PAT_REG0 ) != 0x11111111 ) {
		/* logged only; the probe carries on regardless */
		hwError( "ProbeDmaSystem: scratch reg test failed\n" );
	}

	/* buffer 4k (a single descriptor entry) worth of commands */
	for ( i = 0 ; i < 0x1e ; i++ ) {
		DMAGETPTR( 32 );

		for ( j = 0 ; j < 16 ; j++ ) {
			DMAOUTREG( MACH64_PAT_REG0, 0x22222222 );
		}

		DMAADVANCE();
	}

	/* start the transfer */
	mach64DmaFlush();

	/* make sure the transfer completed */
	if ( !WaitForDmaProbeCompletion() ) {
		hwError( "*** dma error: 4k dma transfer failed, falling back to pseudo dma ***\n" );

		goto failed;
	}
	hwMsg( 1, "first probe completed\n" );

	/* make sure the register actually got written to */
	readback = INREG( MACH64_PAT_REG0 );
	if ( readback != 0x22222222 ) {
		hwMsg( 1, "first probe did NOT set register ( 0x%x != 0x22222222 )\n", readback );
		goto failed;
	}
	hwMsg( 1, "first probe correctly set register\n" );

	/* buffer 128k (multiple descriptors) worth of commands */
	for ( i = 0 ; i < 0x3fe ; i++ ) {
		DMAGETPTR( 32 );

		for ( j = 0 ; j < 16 ; j++ ) {
			DMAOUTREG( MACH64_PAT_REG0, 0 );
		}

		DMAADVANCE();
	}

	/* start the transfer */
	mach64DmaFlush();

	/* make sure the transfer completed */
	if ( !WaitForDmaProbeCompletion() ) {
		hwError( "*** dma error: 128k dma transfer failed, falling back to pseudo dma ***\n" );
		goto failed;
	}
	hwMsg( 10, "second probe successful\n" );

	hwMsg( 1, "dma system seems to be working...\n" );
	return;

 failed:
	/* see if we can learn anything from the failure state */
	mach64DumpEngineState();

	/* fall back to pseudo dma */
	mach64EngineReset();
	mach64glx.dmaDriver = 0;

	mach64DmaResetBuffer();
}


/*=============================================*/


/*
 * mach64DmaInit
 * Bring up the dma system: pick the dma mode, allocate and locate the
 * dma memory, set up texture memory and the command buffers, and (on
 * the server) reset the engine, probe that dma works and run a few
 * benchmarks.
 */
static void mach64DmaInit(void)
{
	const char	*dmaDesc;
	const char	*textureDesc;
	const char	*agpDesc;
	int		pass;

	/* Server init - queries environment variables.  The client
	 * gets these values from the server and initializes them in
	 * mach64direct.c
	 */
	if ( __glx_is_server ) {
		/* mode 2 is used when mach64_dma is not configured */
		if ( glx_getvar_secure( "mach64_dma" ) ) {
			mach64glx.dmaDriver = glx_getint_secure( "mach64_dma" );
		} else {
			mach64glx.dmaDriver = 2;
		}
	}

	/* size of dma buffers is fixed due to single descriptor table page */
	mach64glx.dmaSize = 2;

	hwMsg( 1, "mach64DmaInit: attempting to use mach64_dma = %i\n", mach64glx.dmaDriver );
	hwMsg( 1, "mach64DmaInit: allocating fixed %i megs for dma\n", mach64glx.dmaSize );

	/* allocate a block of scattered system memory */
	AllocateScatteredMemory();
	LocateScatteredMemory();

	/* initialize the agp aperture for texture storage */
	InitTextureMemory();

	/* setup the two command buffers in the appropriate memory space */
	AllocateCommandBuffers();

	if ( __glx_is_server ) {
		/* clear any engine/fifo errors */
		mach64EngineReset();

		/* verify that real dma actually works, otherwise fall back to pseudo dma */
		if ( mach64glx.dmaDriver >= 1 ) {
			ProbeDmaSystem();
		}
	}

	/* describe the mode we ended up with (the probe may have changed it) */
	if ( mach64glx.dmaDriver == 0 ) {
		dmaDesc = "pseudo DMA";
	} else if ( mach64glx.dmaDriver == 1 ) {
		dmaDesc = "sync DMA";
	} else {
		dmaDesc = "async DMA";
	}

	if ( mach64glx.dmaDriver < 3 ) {
		textureDesc = ", local textures";
	} else {
		textureDesc = ", AGP textures";
	}

	if ( mach64glx.dmaDriver == 4 ) {
		agpDesc = ", AGP 2X";
	} else if ( mach64glx.dmaDriver == 3 ) {
		agpDesc = ", AGP 1X";
	} else {
		agpDesc = "";
	}

	hwMsg( 1, "mach64DmaInit: mach64_dma = %i: %s%s%s\n",
	       mach64glx.dmaDriver, dmaDesc, textureDesc, agpDesc );

	/* the benchmarks below only run on the server */
	if ( !__glx_is_server ) {
		return;
	}

	/* benchmark the writing speed to the command buffer, three passes */
	hwMsg( 1, "dma buffer write speed:\n" );
	for ( pass = 0 ; pass < 3 ; pass++ ) {
		MemoryBenchmark( dmaBuffers[0]->virtualBuffer, dmaBuffers[0]->overflowBufferDwords );
	}

	/* benchmark the read speed of the card's dma, three passes */
	if ( mach64glx.dmaDriver >= 1 ) {
		hwMsg( 1, "dma buffer transfer speed:\n" );
		for ( pass = 0 ; pass < 3 ; pass++ ) {
			DmaBenchmark( dmaBuffers[0]->overflowBufferDwords - 32 );
		}
	}

	/* benchmark the writing speed to the texture memory, three passes */
	if ( mach64glx.dmaDriver >= 3 ) {
		agp_mem_t	mem;

		/* NOTE(review): the same 1024 * 1024 is passed as the
		 * allocation size and as the benchmark length -- confirm
		 * both calls use the same unit. */
		if ( hwAllocAGPMem( &mem, 1024 * 1024 ) > 0 ) {

			hwMsg( 1, "agp texture write speed:\n" );
			for ( pass = 0 ; pass < 3 ; pass++ ) {
				MemoryBenchmark( mem.buffer, 1024 * 1024 );
			}

			hwFreeAGPMem( &mem );
		}
	}
}

/*
 * TestRegisters
 *
 * Write two test patterns to MACH64_SCRATCH_REG0 and read them back,
 * set BUS_EXT_REG_EN in MACH64_BUS_CNTL to enable the second register
 * bank, copy the current register values into mach64glx.registers[],
 * and log the chip id.
 *
 * Returns GL_FALSE if either scratch register readback fails,
 * GL_TRUE otherwise.
 */
static GLboolean TestRegisters( void )
{
	int		saved;
	int		offset, value;

	saved = INREG( MACH64_SCRATCH_REG0 );

	/* first pattern */
	OUTREG( MACH64_SCRATCH_REG0, 0x55555555 );
	if ( INREG( MACH64_SCRATCH_REG0 ) != 0x55555555 ) {
		hwError( "Mach64 probe failed on read 1 of SCRATCH_REG0 %x\n", MACH64_SCRATCH_REG0 );
		return GL_FALSE;
	}
	hwMsg( 1, "SCRATCH_REG0 read 1 successful\n" );

	/* second pattern, the bitwise inverse of the first */
	OUTREG( MACH64_SCRATCH_REG0, 0xaaaaaaaa );
	if ( INREG( MACH64_SCRATCH_REG0 ) != 0xaaaaaaaa ) {
		hwError( "Mach64 probe failed on read 2 of SCRATCH_REG0 %x\n", MACH64_SCRATCH_REG0 );
		return GL_FALSE;
	}
	hwMsg( 1, "SCRATCH_REG0 read 2 successful\n" );

	/* put back whatever was in the scratch register */
	OUTREG( MACH64_SCRATCH_REG0, saved );

	/* enable the second bank of registers, where the 3D lives */
	MACH64_WAITFREE();
	value = SWAP( INREG( MACH64_BUS_CNTL ) );
	hwMsg( 1, "BUS_CNTL = 0x%x\n", value );

	value |= BUS_EXT_REG_EN;

	OUTREG( MACH64_BUS_CNTL, SWAP( value ) );

	/* save off the current set register values, which we */
	/* will use to restore at the end of each dma buffer */
#if 0
	hwMsg( 1, "Configuration registers:\n" );
	for ( offset = 0 ; offset < 256 ; offset+=4 ) {
		value = pcibusRead( mach64PciTag, offset );
		hwMsg(1, "0x%2x : 0x%8x\n", offset, value );
	}
#endif
	/* one saved dword per 4-byte register offset below 0x7ff */
	for ( offset = 0x0 ; offset < 0x7ff ; offset += 4 ) {
		value = SWAP( INREG( offset ) );
		mach64glx.registers[offset>>2] = value;
	}

	/* report the chip id and the fields of its top byte */
	MACH64_WAITFREE();
	value = SWAP( INREG( MACH64_CONFIG_CHIP_ID ) );
	hwMsg( 1, "CONFIG_CHIP_ID = 0x%08x\n", value );
	hwMsg( 1, "  ASIC = 0x%x\n", (value & 0x3f000000) >> 24 );
	hwMsg( 1, " major = 0x%03x foundry = 0x%03x minor = 0x%02x\n",
	       (value & 0x07000000) >> 24, (value & 0x38000000) >> 27, (value & 0xc0000000) >> 30 );

	return GL_TRUE;
}


/*
 * mach64InitLogging
 *
 * Open the log file named by the "hw_logfile" setting and set the log
 * level from "hw_loglevel" (DBG_LEVEL_BASE when unset).
 *
 * The server logs to the configured file.  A direct rendering client
 * logs to "<file>_direct" so it doesn't stomp on the server's log.
 * When no file is configured, or the client's name would not fit the
 * local buffer, only the logging prefix is set.
 */
static void mach64InitLogging( void )
{
	char	*logName;

	/* open the logfile and set loglevel */
	logName = glx_getvar_secure( "hw_logfile" );
	if ( __glx_is_server || !logName ) {
		/* server: use the name as given (possibly NULL);
		   client without a name: just set logging prefix */
		hwOpenLog( logName, "[mach64] " );
	} else {
		/* direct rendering clients use a different file
		   so they don't stomp on the server's log */
		char	newName[1024];
		int	len;

		/* bounded: logName comes from the configuration and may
		   be arbitrarily long */
		len = snprintf( newName, sizeof( newName ), "%s_direct", logName );
		if ( len < 0 || (size_t)len >= sizeof( newName ) ) {
			/* a truncated name could lose the suffix and end
			   up on the server's file; don't open a file */
			hwOpenLog( NULL, "[mach64] " );
		} else {
			hwOpenLog( newName, "[mach64] " );
		}
	}

	if ( glx_getvar( "hw_loglevel" ) ) {
		hwSetLogLevel( glx_getint( "hw_loglevel" ) );
	} else {
		hwSetLogLevel( DBG_LEVEL_BASE );
	}
}


/*
 *
 * GetXServerInfo
 * this will be different on the fbdev server and the mach64 server
 *
 * Copy the framebuffer and register base addresses and the screen
 * geometry exported by the running X server into mach64glx, then log
 * them.  Always returns 1.
 */
static int GetXServerInfo( void ) {

	if ( glx_server == XF_FBDEV ) {
		mach64glx.linearPhysical = 0;		/* use /dev/fb */
		mach64glx.linearBase = (char *)GLXSYM(fbdevVirtBase);
		mach64glx.MMIOBase = (char *)GLXSYM(fbdevRegBase);
		mach64glx.depth = GLXSYM(fbdevInfoRec).depth;
		mach64glx.virtualX = GLXSYM(fbdevInfoRec).virtualX;
		mach64glx.virtualY = GLXSYM(fbdevInfoRec).virtualY;
		mach64glx.displayWidth = GLXSYM(fbdevInfoRec).displayWidth;
		mach64glx.videoRam = GLXSYM(fbdevInfoRec).videoRam;
		mach64glx.bytesPerPixel = ( GLXSYM(fbdevInfoRec).bitsPerPixel + 7 ) / 8;
	} else {
		mach64glx.linearPhysical = GLXSYM(mach64ApertureAddr);
		mach64glx.linearBase = (char *)GLXSYM(mach64VideoMem);
		mach64glx.MMIOBase = (char *)GLXSYM(mach64MemRegMap);
		mach64glx.depth = GLXSYM(mach64InfoRec).depth;
		mach64glx.virtualX = GLXSYM(mach64InfoRec).virtualX;
		mach64glx.virtualY = GLXSYM(mach64InfoRec).virtualY;
		mach64glx.displayWidth = GLXSYM(mach64InfoRec).displayWidth;
		mach64glx.videoRam = GLXSYM(mach64InfoRec).videoRam;
		mach64glx.bytesPerPixel = ( GLXSYM(mach64InfoRec).bitsPerPixel + 7 ) / 8;
	}


	/* under some conditions the fbdev server reports
	   displayWidth == -1 for some reason */
	if ( mach64glx.displayWidth < mach64glx.virtualX ) {
		mach64glx.displayWidth = mach64glx.virtualX;
	}

	hwMsg( 1, "width: %d\n", mach64glx.virtualX );
	hwMsg( 1, "height: %d\n", mach64glx.virtualY );
	hwMsg( 1, "pitch: %d\n", mach64glx.displayWidth );
	hwMsg( 1, "depth: %d\n", mach64glx.depth );
	hwMsg( 1, "bytesPerPixel: %d\n", mach64glx.bytesPerPixel );
	hwMsg( 1, "videoRam: %dk\n", mach64glx.videoRam );
	/* the two bases are pointers, so they are printed with %p */
	hwMsg( 1, "memBase: %p\n", (void *)mach64glx.linearBase );
	hwMsg( 1, "ioBase: %p\n", (void *)mach64glx.MMIOBase );
	hwMsg( 1, "memPhysical: 0x%08x\n", mach64glx.linearPhysical );

	return 1;
}


/*
 * CheckXSettings
 * Make sure the X server is in an OK bit depth and the font and pixmap
 * caches are disabled (to give us offscreen memory).
 *
 * A depth of 24 is rewritten to 32 in mach64glx before checking.
 * Returns GL_TRUE when the settings are usable, GL_FALSE otherwise.
 */
static GLboolean CheckXSettings()
{
	if ( mach64glx.depth == 24 ) {
		mach64glx.depth = 32;	/* FIXME */
	}

	switch ( mach64glx.depth ) {
	case 15:
	case 16:
	case 32:
		break;
	default:
		hwError( "Unsupported depth: %d, only 15, 16, and 32 bpp are supported right now\n",
			 mach64glx.depth );
		return GL_FALSE;
	}

	/* the fbdev server doesn't have font acceleration, so there is */
	/* nothing more to check there */
	if ( glx_server == XF_FBDEV ) {
		return GL_TRUE;
	}

	/* make sure the user has turned off the pixmap and font cache; */
	/* if they don't, then rendering will write all over their fonts */
	if ( !( OFLG_ISSET( OPTION_NO_FONT_CACHE, &GLXSYM(mach64InfoRec).options ) &&
		OFLG_ISSET( OPTION_NO_PIXMAP_CACHE, &GLXSYM(mach64InfoRec).options ) ) ) {
		hwError( "Font and pixmap caches must be disabled to use the GLX module.\n" );
		hwError( "Make sure you have the following in your XF86config file:\n" );
		hwError( "Section \"Device\"\n" );
		hwError( "	Option	\"no_font_cache\"\n" );
		hwError( "	Option	\"no_pixmap_cache\"\n" );
		return GL_FALSE;
	}

	return GL_TRUE;
}


/*
 * mach64DrawTest
 *
 * Figure out how to draw the things we need.
 *
 * Fills the start of the framebuffer with a cpu-written pattern, then
 * queues a solid fill, a block copy and a smooth shaded triangle
 * through the dma buffer, followed by ten batches of small fills to
 * exercise buffer switching.  Sleeps three seconds at the end so the
 * result can be looked at.
 */
static void mach64DrawTest( void ) {
	DMALOCALS;
	int 	i;
	int	j;
	int	screenFormat;
	int	fillDwords;

	/* pick the pixel format code first so nothing is drawn, and no */
	/* uninitialized value reaches the card, for an unexpected depth */
	switch( mach64glx.depth ) {
	case 15: screenFormat = 3; break;
	case 16: screenFormat = 4; break;
	case 32: screenFormat = 6; break;
	default:
		hwError( "mach64DrawTest: unsupported depth %d, skipping draw test\n",
			 mach64glx.depth );
		return;
	}

	/* draw something with the cpu to show we found the framebuffer properly */
	/* and to provide a background for the primitive tests */
	/* videoRam is in kilobytes; stay inside the card's memory and */
	/* below its last kilobyte, which mach64InitGLX reserves for the */
	/* second bank of memory mapped registers */
	fillDwords = 1022*1024;
	if ( ( mach64glx.videoRam - 1 ) * 256 < fillDwords ) {
		fillDwords = ( mach64glx.videoRam - 1 ) * 256;
	}
	for ( i = 0 ; i < fillDwords ; i++ ) {
		((int *)mach64glx.linearBase)[i] = i;
	}

	DMAGETPTR( 100 );

	DMAOUTREG( MACH64_DP_PIX_WIDTH,
		   ( screenFormat << 0 )
		   | ( screenFormat << 4 )
		   | ( screenFormat << 8 )
		   | ( screenFormat << 16 )
		   | ( screenFormat << 28 )
		   );

	/* simple fill */
	DMAOUTREG( MACH64_DP_FRGD_CLR, 0x12345678 );		/* random color */
	DMAOUTREG( MACH64_DP_WRITE_MASK, 0xffffffff );		/* write to all */
	DMAOUTREG( MACH64_DP_MIX, BKGD_MIX_D | FRGD_MIX_S );	/* bkground leave alone */
	DMAOUTREG( MACH64_DP_SRC, BKGD_SRC_FRGD_CLR | FRGD_SRC_FRGD_CLR | MONO_SRC_ONE );

	DMAOUTREG( MACH64_CLR_CMP_CNTL, 0 );			/* disable color compare */
	DMAOUTREG( MACH64_GUI_TRAJ_CNTL, DST_X_LEFT_TO_RIGHT | DST_Y_TOP_TO_BOTTOM );
	DMAOUTREG( MACH64_DST_X_Y, (100<<16)|120 );		/* start at (120, 100) */
	DMAOUTREG( MACH64_DST_WIDTH_HEIGHT, (100<<16)|120 );	/* 120 wide, 100 high */

	/* block copy */
	DMAOUTREG( MACH64_DP_WRITE_MASK, 0xffffffff );		/* write to all */
	DMAOUTREG( MACH64_DP_MIX, BKGD_MIX_D | FRGD_MIX_S );	/* bkground leave alone */
	DMAOUTREG( MACH64_DP_SRC, BKGD_SRC_BKGD_CLR | FRGD_SRC_BLIT | MONO_SRC_ONE );

	DMAOUTREG( MACH64_CLR_CMP_CNTL, 0 );			/* disable color compare */

	DMAOUTREG( MACH64_GUI_TRAJ_CNTL, DST_X_LEFT_TO_RIGHT | DST_Y_TOP_TO_BOTTOM );

	DMAOUTREG( MACH64_SRC_OFF_PITCH, ( (mach64glx.displayWidth/8) << 22 ) | 0 );
	DMAOUTREG( MACH64_SRC_WIDTH1, 120 );			/* hangs without this... */
	DMAOUTREG( MACH64_SRC_Y_X, (0<<16)|0 );

	DMAOUTREG( MACH64_DST_X_Y, (100<<16)|300 );		/* start at (300, 100) */
	DMAOUTREG( MACH64_DST_WIDTH_HEIGHT, (100<<16)|120 );	/* 120 wide, 100 high */

	/* triangle rendering */
	DMAOUTREG( MACH64_SCALE_3D_CNTL,
		   (3<<6) | (1<<16) );	/* enable setup for smooth, all source blend */
	DMAOUTREG( MACH64_SETUP_CNTL, 0 );
	DMAOUTREG( MACH64_Z_CNTL, 0 );
	DMAOUTREG( MACH64_ALPHA_TST_CNTL, 0 );

	DMAOUTREG( MACH64_DP_SRC, BKGD_SRC_3D | FRGD_SRC_3D | MONO_SRC_ONE );

	DMAOUTREG( MACH64_VERTEX_1_ARGB, 0xff0000ff );
	DMAOUTREG( MACH64_VERTEX_1_X_Y, (550<<18)|(100<<2) );
	DMAOUTREG( MACH64_VERTEX_2_ARGB, 0xff00ff00 );
	DMAOUTREG( MACH64_VERTEX_2_X_Y, (500<<18)|(220<<2) );
	DMAOUTREG( MACH64_VERTEX_3_ARGB, 0xffff0000 );
	DMAOUTREG( MACH64_VERTEX_3_X_Y, (600<<18)|(200<<2) );

	/* the setup engine only works if area has correct sign, */
	/* which isn't clear in the docs! */
	DMAOUTFLOAT( MACH64_ONE_OVER_AREA_UC, -1.0/100.0 );

	DMAOUTREG( MACH64_DP_SRC, BKGD_SRC_BKGD_CLR | FRGD_SRC_BLIT | MONO_SRC_ONE );	/* back to blit */

	DMAADVANCE();
	mach64DmaFinish();

	/* draw a bunch of stuff to test buffer switching */
	for ( j = 0 ; j < 100 ; j+= 10 ) {
		/* 100 fills of 8 register writes each; elsewhere in this */
		/* file 32 dwords are reserved per 16 writes, so reserve */
		/* two dwords per write here as well */
		DMAGETPTR( 1600 );
		for ( i = 0 ; i < 1000 ; i+= 10 ) {
			DMAOUTREG( MACH64_DP_FRGD_CLR, 0x12345678 );		/* random color */
			DMAOUTREG( MACH64_DP_WRITE_MASK, 0xffffffff );		/* write to all */
			DMAOUTREG( MACH64_DP_MIX, BKGD_MIX_D | FRGD_MIX_S );	/* bkground leave alone */
			DMAOUTREG( MACH64_DP_SRC, BKGD_SRC_FRGD_CLR | FRGD_SRC_FRGD_CLR | MONO_SRC_ONE );
			DMAOUTREG( MACH64_CLR_CMP_CNTL, 0 );			/* disable color compare */
			DMAOUTREG( MACH64_GUI_TRAJ_CNTL, DST_X_LEFT_TO_RIGHT | DST_Y_TOP_TO_BOTTOM );
			DMAOUTREG( MACH64_DST_X_Y, (j<<16)|i );			/* start at (i, j) */
			DMAOUTREG( MACH64_DST_WIDTH_HEIGHT, (8<<16)|8 );	/* 8 wide, 8 high */
		}
		DMAADVANCE();
		mach64DmaFinish();
	}

	/* let it sit there a bit so we can see it */
	sleep( 3 );
}


/*
 * mach64InitGLX
 * This is the initial entry point for the mach64 hardware driver,
 * called at X server module load time, or libGL direct rendering
 * init time.
 */
GLboolean mach64InitGLX( void )
{
	mach64InitLogging();

	/* if we are a direct rendering client, we don't need to do this */
	if ( __glx_is_server ) {
		/* find out where the hardware is from X */
		if ( !GetXServerInfo() ) {
			hwMsg( 1, "GetXServerInfo failed!\n" );
			return GL_FALSE;
		}

    		/* check to make sure that we aren't running in 8 bit mode */
    		/* or with font acceleration */
    		if ( !CheckXSettings() ) {
			return GL_FALSE;
    		}

  		/* do the initial register setup and checking */
 		if ( !TestRegisters() ) {
			return GL_FALSE;
		}
	}

	/* start up our card memory manager */
	cardHeap = mmInit( 0, mach64glx.videoRam * 1024 );
	if ( !cardHeap ) {
		hwMsg( 1, "cardHeap creation failed, exiting!\n" );
		return GL_FALSE;	/* really shouldn't happen */
	}

	/* reserve memory for the front buffer */
	mmReserveMem( cardHeap, 0, mach64glx.displayWidth * mach64glx.virtualY * mach64glx.bytesPerPixel );

	/* reserve memory for the second bank of memory mapped registers */
	mmReserveMem( cardHeap, (mach64glx.videoRam-1) * 1024, 1024 );

	/* the remaining memory is available for back buffers, depth
	   buffers, and textures */
	mmDumpMemInfo( cardHeap );

	/* init the dma system */
	mach64DmaInit();

	/* hook some things from the software GLX interface */
	GLXProcs.CreateContext = mach64GLXCreateContext;
	GLXProcs.DestroyContext = mach64GLXDestroyContext;
	GLXProcs.SwapBuffers = mach64GLXSwapBuffers;
	GLXProcs.CreateImage = mach64GLXCreateImage;
	GLXProcs.DestroyImage = mach64GLXDestroyImage;
	GLXProcs.CreateDepthBuffer = mach64GLXCreateDepthBuffer;
	GLXProcs.MakeCurrent = mach64GLXMakeCurrent;
	GLXProcs.BindBuffer = mach64GLXBindBuffer;
	GLXProcs.VendorPrivate = mach64GLXVendorPrivate;
	GLXProcs.AllowDirect = mach64GLXAllowDirect;
	// should we hook CreateWindowBuffer?

	/* these vars can be changed between invocations of direct clients */
	if ( glx_getint( "mach64_nullprims" ) ) {
		hwMsg( 1, "enabling mach64_nullprims\n" );
		mach64glx.nullprims = 1;
	}
	if ( glx_getint( "mach64_skipdma" ) ) {
		hwMsg( 1, "enabling mach64_skipdma\n" );
		mach64glx.skipDma = 1;
	}
	if ( glx_getint( "hw_boxes" ) ) {
		hwMsg( 1, "enabling hw_boxes\n" );
		mach64glx.boxes = 1;
	}
	if ( glx_getint( "mach64_nofallback" ) ) {
		hwMsg( 1, "enabling mach64_nofallback\n" );
		mach64glx.noFallback = 1;
	}

	if ( glx_getint( "mach64_finish" ) ) {
		hwMsg( 1, "enabling mach64_finish\n" );
		mach64glx.enforceFinish = 1;
	}

    	/* see if we can draw our basic primitives */
    	if ( __glx_is_server && glx_getint( "mach64_drawtest" ) ) {
		hwMsg( 1, "enabling mach64_drawtest\n" );
		mach64DrawTest();
	}

	hwError( "mach64InitGLX completed\n" );
	return GL_TRUE;
}


/*
 * Local Variables:
 * mode: c
 * tab-width: 8
 * c-basic-offset: 8
 * End:
 */
