/* $Id: mach64tri.c,v 1.33 2000/03/27 07:27:03 gareth Exp $ */

/*
 * GLX Hardware Device Driver for ATI Rage Pro
 * Copyright (C) 1999 John Carmack, Gareth Hughes
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 *    John Carmack <johnc@idsoftware.com>
 *    Gareth Hughes <gareth@precisioninsight.com>
 */

#include "mach64glx.h"

#include "glx_symbols.h"

/*

The rage pro has a dma register transfer option that allows you to transfer
blocks of sequential registers without having to give a separate register
address for each data dword.

We use that here to transfer all seven dwords that make up a (non-multitexture)
vertex with eight dwords of dma.

*/

/*
 * Send one triangle to the setup engine through the DMA buffer.
 *
 * ctx        - GL context; ctx->VB supplies window coordinates, colors and
 *              texture coordinates for the vertices
 * e0, e1, e2 - vertex buffer indices of the three corners
 * pv         - index of the vertex whose color is stamped on all three
 *              corners when DD_FLATSHADE is set
 *
 * The triangle is dropped without emitting anything when it is back face
 * culled (sign of the area against ctx->backface_sign) or when its area,
 * computed from the snapped hardware coordinates, is exactly zero.
 *
 * Each vertex is written as a block of `dwords` dwords (8, or 12 when
 * multitexturing) and the triangle is kicked off by writing one-over-area.
 */
void mach64Triangle( GLcontext *ctx, GLuint e0, GLuint e1, GLuint e2, GLuint pv )
{
	const struct vertex_buffer *VB;
	float		offset;
	float		ooa;
	int		i, j;
	GLubyte 	*col;
	GLfloat 	(*tc0)[4];
	GLfloat 	(*tc1)[4];
	GLfloat 	*win;
	int		x, y;
	int		dwords;
	int		xx[3], yy[3];			/* 2 fractional bits for hardware */
	GLubyte		(*colorArray)[4];		/* different for two sided lighting */
	GLubyte		(*specularArray)[4];		/* different for two sided lighting */
	GLuint		*vertices;
	int		numvertices;
	GLboolean	replace[3];
	DMALOCALS;

	mach64glx.c_triangles++;
	VB = ctx->VB;

	/* calculate one-over-area, culling the triangle if needed
	 *
	 * it is necessary to use the exact integralized X/Y values that
	 * will be put into the setup registers for the area calculation,
	 * rather than the floating point values, otherwise there will be
	 * a slight error, which can result in creeping interpolators
	 * on thin triangles and tiny cracks when nearly-edge on triangles
	 * are improperly back face culled.
	 */

	/* important note: it is possible to have window coordinates that extend
	 * outside the window bounds, even after clipping.  This happens when an
	 * app (like Blender) sets viewports larger than the window.  We properly
	 * set scissor to the window size even when it is disabled, but we need
	 * to mask off Y values when they are packed into a register to prevent
	 * negative number bit pollution.
	 */
	win = VB->Win.data[e0];
	xx[0] = (int)(win[0]*4);
	yy[0] = (int)(win[1]*4);

	win = VB->Win.data[e1];
	xx[1] = (int)(win[0]*4);
	yy[1] = (int)(win[1]*4);

	win = VB->Win.data[e2];
	xx[2] = (int)(win[0]*4);
	yy[2] = (int)(win[1]*4);

	/* at this point ooa still holds the signed area, not its reciprocal */
	ooa = 0.25 * 0.25 * ( ( xx[1] - xx[0] ) * ( yy[0] - yy[2] ) +
			      ( yy[1] - yy[0] ) * ( xx[2] - xx[0] ) );
	if ( ooa == 0.0 ) {
		/* degenerate after snapping: covers no pixels, and taking the
		 * reciprocal below would divide by zero
		 */
		return;
	}
	if ( ooa * ctx->backface_sign < 0 ) {
		return;		/* culled */
	}
	ooa = 1.0 / ooa;

	/* polygon offset will result in an adjustment to the z value
	 * before it is programmed to the hardware.  For maximum
	 * accuracy, we add the offset as a float before integralizing
	 * for the hardware.
	 */
	if ( ctx->TriangleCaps & DD_TRI_OFFSET ) {
		offset = ctx->PolygonZoffset;
	} else {
		offset = 0;
	}

	/* two sided lighting requires using different colors
	 * depending on which face is showing.
	 */
	if ( ctx->TriangleCaps & DD_TRI_LIGHT_TWOSIDE ) {
		int		facing;

		facing = ( ooa > 0.0 ) ^ ctx->Polygon.FrontBit;
		colorArray = VB->Color[facing]->data;
		specularArray = VB->Spec[facing];
	} else {
		colorArray = VB->Color[0]->data;
		specularArray = VB->Spec[0];
	}

	/* reuse vertices that have already been sent to the hardware. We
	 * only save vertices that would otherwise be written to the same
	 * place in the setup engine, else we have to do lots of fancy
	 * stuff and recalculate ooa which will generally make the
	 * optimization slower than just reprogramming all three vertices.
	 */
	vertices = mach64glx.setupVertices;
	numvertices = 0;

	replace[0] = replace[1] = replace[2] = GL_TRUE;

	/* ensure all three vertices are not clipped before reusing them */
	/* we also can't reuse them in flat shade without checking a bunch more stuff */
	/* nor with two sided lighting: the color arrays selected above depend on
	 * this triangle's facing, so a vertex programmed for the previous
	 * triangle may carry the color of the other face
	 */
	if ( !(ctx->TriangleCaps & ( DD_FLATSHADE | DD_TRI_LIGHT_TWOSIDE )) &&
	     ( e0 < mach64glx.setupMax ) && ( e1 < mach64glx.setupMax ) && ( e2 < mach64glx.setupMax ) ) {

		if ( e0 == vertices[0] ) {
			replace[0] = GL_FALSE;
		} else {
			vertices[0] = e0;
		}

		if ( e1 == vertices[1] ) {
			replace[1] = GL_FALSE;
		} else {
			vertices[1] = e1;
		}

		if ( e2 == vertices[2] ) {
			replace[2] = GL_FALSE;
		} else {
			vertices[2] = e2;
		}

	} else {
		vertices[0] = e0;
		vertices[1] = e1;
		vertices[2] = e2;
	}

	/* calculate the number of dwords per block we'll be sending */
	dwords = ( mach64Ctx->multitex ) ? 12 : 8;

	DMAGETPTR( 3*dwords+2 );	/* this will check for overflow and set dma_ptr */

	for ( j = 0 ; j < 3 ; j++ ) {

		/* don't replace vertices already in the setup engine */
		if ( !replace[j] ) {
			continue;
		}

		/* setup for programming seven sequential registers */
		switch ( j ) {
		case 0:
			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_1_S ) | (6<<16) );
			break;
		case 1:
			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_2_S ) | (6<<16) );
			break;
		case 2:
			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_3_S ) | (6<<16) );
			break;
		}
		i = vertices[j];

#ifdef __PPC__
// byte swapped
// we can do the colors without requiring any additional swapping by
// just changing the order we load them in, and we can swap the floats
// by loading them initially as integers
		/* window / depth / homogeneous coordinate */
		win = VB->Win.data[i];

		/* texcoord - FIXME: premultiply by W */
		tc0 = VB->TexCoordPtr[mach64Ctx->tmu_source[0]]->data;
		dma_ptr[1] = SWAP(*(int *)&tc0[i][0]);					/* VERTEX_1_S */
		dma_ptr[2] = SWAP(*(int *)&tc0[i][1]);					/* VERTEX_1_T */

		dma_ptr[3] = SWAP(*(int *)&win[3]);					/* VERTEX_1_W */

		/* specular / fog factor */
		col = &(specularArray[i][0]);
		dma_ptr[4] = (col[2]<<24) | (col[1]<<16) | (col[0]<<8) | col[3];	/* VERTEX_1_SPEC_ARGB */

		dma_ptr[5] = SWAP(((int)(win[2] + offset)) << 16);			/* VERTEX_1_Z */

		/* shaded color */
		col = &(colorArray[i][0]);
		dma_ptr[6] = (col[2]<<24) | (col[1]<<16) | (col[0]<<8) | col[3];	/* VERTEX_1_ARGB */

		/* x / y have two bits of integer precision */
		x = xx[j];
		y = ( mach64DB->height << 2 ) - yy[j];

		dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );				/* VERTEX_1_X_Y */
#else
// native format
		/* window / depth / homogeneous coordinate */
		win = VB->Win.data[i];

		/* texcoord */
		tc0 = VB->TexCoordPtr[mach64Ctx->tmu_source[0]]->data;
		*(float *)(&dma_ptr[1]) = tc0[i][0] * win[3];				/* VERTEX_1_S */
		*(float *)(&dma_ptr[2]) = tc0[i][1] * win[3];				/* VERTEX_1_T */

		/* fold the texture q coordinate into W only when the app supplied one */
		if ( VB->TexCoordPtr[mach64Ctx->tmu_source[0]]->size < 4 ) {
			*(float *)(&dma_ptr[3]) = win[3];				/* VERTEX_1_W */
		} else {
			*(float *)(&dma_ptr[3]) = tc0[i][3] * win[3];			/* VERTEX_1_W */
		}

		/* specular / fog factor */
		col = &(specularArray[i][0]);
		dma_ptr[4] = (col[3]<<24) | (col[0]<<16) | (col[1]<<8) | col[2];	/* VERTEX_1_SPEC_ARGB */

		dma_ptr[5] = ((int)(win[2] + offset)) << 16;				/* VERTEX_1_Z */

		/* shaded color */
		col = &(colorArray[i][0]);
		dma_ptr[6] = (col[3]<<24) | (col[0]<<16) | (col[1]<<8) | col[2];	/* VERTEX_1_ARGB */

		/* x / y have two bits of integer precision */
		x = xx[j];
		y = ( mach64DB->height << 2 ) - yy[j];
		dma_ptr[7] = (x<<16) | (y&0xffff);					/* VERTEX_1_X_Y */
#endif

		/* multitexturing */
		if ( mach64Ctx->multitex ) {
			/* setup for programming three sequential registers */
			switch ( j ) {
			case 0:
				dma_ptr[8] = SWAP( ADRINDEX( MACH64_VERTEX_1_SECONDARY_S ) | (2<<16) );
				break;
			case 1:
				dma_ptr[8] = SWAP( ADRINDEX( MACH64_VERTEX_2_SECONDARY_S ) | (2<<16) );
				break;
			case 2:
				dma_ptr[8] = SWAP( ADRINDEX( MACH64_VERTEX_3_SECONDARY_S ) | (2<<16) );
				break;
			}

#ifdef __PPC__
			/* texcoord - FIXME: premultiply by W */
			tc1 = VB->TexCoordPtr[mach64Ctx->tmu_source[1]]->data;
			dma_ptr[9]  = SWAP(*(int *)&tc1[i][0]);				/* VERTEX_1_SECONDARY_S */
			dma_ptr[10] = SWAP(*(int *)&tc1[i][1]);				/* VERTEX_1_SECONDARY_T */

			dma_ptr[11] = SWAP(*(int *)&win[3]);				/* VERTEX_1_SECONDARY_W */
#else
			/* texcoord */
			tc1 = VB->TexCoordPtr[mach64Ctx->tmu_source[1]]->data;
			*(float *)(&dma_ptr[9])  = tc1[i][0] * win[3];			/* VERTEX_1_SECONDARY_S */
			*(float *)(&dma_ptr[10]) = tc1[i][1] * win[3];			/* VERTEX_1_SECONDARY_T */

			if ( VB->TexCoordPtr[mach64Ctx->tmu_source[1]]->size < 4 ) {
				*(float *)(&dma_ptr[11]) = win[3];			/* VERTEX_1_SECONDARY_W */
			} else {
				*(float *)(&dma_ptr[11]) = tc1[i][3] * win[3];		/* VERTEX_1_SECONDARY_W */
			}
#endif
		}

		dma_ptr += dwords;
		numvertices++;
	}

	/* if we are flat shading, go back and stamp a single color on all vertexes */
	if ( ctx->TriangleCaps & DD_FLATSHADE ) {
		/* index of the ARGB dword (slot 6 of a block) relative to the
		 * advanced dma_ptr; all three blocks were written because
		 * reuse is disabled when flat shading
		 */
		int		argb;

		col = &(colorArray[pv][0]);
		argb = ( mach64Ctx->multitex ) ? -6 : -2;

		for ( i = 0 ; i < 3 ; i++, argb -= dwords ) {
			dma_ptr[argb] =
#ifdef __PPC__
				(col[2]<<24) | (col[1]<<16) | (col[0]<<8) | col[3];	/* VERTEX_1_ARGB */
#else
				(col[3]<<24) | (col[0]<<16) | (col[1]<<8) | col[2];	/* VERTEX_1_ARGB */
#endif
		}
	}

	/* write one-over-area to kick off the triangle */
	DMAOUTFLOAT( MACH64_ONE_OVER_AREA_UC, ooa );

	// advance the dwords directly instead of using DMAADVANCE, because
	// we have been directly moving dma_ptr
	mach64glx.dma_buffer->bufferDwords += numvertices * dwords + 2;
}


/*
 * Draw a quad by handing its two halves to mach64Triangle.  The quad is
 * split along the v1-v3 diagonal and both halves receive the same pv.
 */
static void mach64Quad( GLcontext *ctx, GLuint v0, GLuint v1, GLuint v2, GLuint v3, GLuint pv )
{
	/* first half: v0, v1, v3 */
	mach64Triangle( ctx, v0, v1, v3, pv );

	/* second half: v1, v2, v3 */
	mach64Triangle( ctx, v1, v2, v3, pv );
}


static void mach64Line( GLcontext *ctx, GLuint v0, GLuint v1, GLuint pv )
{
	const struct vertex_buffer *VB;
	float		ooa, offset, width;
	float		dx, dy, ix, iy;
	int		j;
	GLubyte 	*col, *spec;
	GLfloat 	(*tc)[4];
	GLfloat 	*win;
	int		x, y;
	int		xx[3], yy[3];			/* 2 fractional bits for hardware */
	GLubyte		(*colorArray)[4];		/* different for two sided lighting */
	GLubyte		(*specularArray)[4];		/* different for two sided lighting */
	hwUI32		vertices[2][8];
	DMALOCALS;

	mach64glx.c_lines++;
	VB = ctx->VB;
	width = ctx->Line.Width;

	dx = VB->Win.data[v0][0] - VB->Win.data[v1][0];
	dy = VB->Win.data[v0][1] - VB->Win.data[v1][1];

	ix = width * 0.5; iy = 0;

	if ( ( ix < 0.5 ) && ( ix > 0.1 ) ) {
		ix = 0.5;
	}
	if ( dx * dx > dy * dy ) {
		iy = ix; ix = 0;
	}

	if ( ctx->TriangleCaps & DD_TRI_OFFSET ) {
		offset = ctx->PolygonZoffset;
	} else {
		offset = 0;
	}

	if ( ctx->TriangleCaps & DD_TRI_LIGHT_TWOSIDE ) {
		int		facing;

		facing = ( ooa > 0.0 ) ^ ctx->Polygon.FrontBit;
		colorArray = VB->Color[facing]->data;
		specularArray = VB->Spec[facing];
	} else {
		colorArray = VB->Color[0]->data;
		specularArray = VB->Spec[0];
	}

	tc = VB->TexCoordPtr[0]->data;

	/* precalculate the setup engine blocks */
#ifdef __PPC___
	win = VB->Win.data[v0];
	col = ( ctx->TriangleCaps & DD_FLATSHADE ) ? &(colorArray[pv][0]) :  &(colorArray[v0][0]);
	spec = &(specularArray[v0][0]);

	vertices[0][0] = 0;
	vertices[0][1] = SWAP(*(int *)&tc[v0][0]);
	vertices[0][2] = SWAP(*(int *)&tc[v0][1]);
	vertices[0][3] = SWAP(*(int *)&win[3]);
	vertices[0][4] = (spec[2]<<24) | (spec[1]<<16) | (spec[0]<<8) | spec[3];
	vertices[0][5] = SWAP(((int)(win[2] + offset)) << 16);
	vertices[0][6] = (col[2]<<24) | (col[1]<<16) | (col[0]<<8) | col[3];
	vertices[0][7] = 0;

	win = VB->Win.data[v1];
	col = ( ctx->TriangleCaps & DD_FLATSHADE ) ? &(colorArray[pv][0]) :  &(colorArray[v1][0]);
	spec = &(specularArray[v1][0]);

	vertices[1][0] = 0;
	vertices[1][1] = SWAP(*(int *)&tc[v1][0]);
	vertices[1][2] = SWAP(*(int *)&tc[v1][1]);
	vertices[1][3] = SWAP(*(int *)&win[3]);
	vertices[1][4] = (spec[2]<<24) | (spec[1]<<16) | (spec[0]<<8) | spec[3];
	vertices[1][5] = SWAP(((int)(win[2] + offset)) << 16);
	vertices[1][6] = (col[2]<<24) | (col[1]<<16) | (col[0]<<8) | col[3];
	vertices[1][7] = 0;
#else
	win = VB->Win.data[v0];
	col = ( ctx->TriangleCaps & DD_FLATSHADE ) ? &(colorArray[pv][0]) :  &(colorArray[v0][0]);
	spec = &(specularArray[v0][0]);

	vertices[0][0] = 0;
	vertices[0][1] = tc[v0][0] * win[3];
	vertices[0][2] = tc[v0][1] * win[3];
	vertices[0][3] = win[3];
	vertices[0][4] = (spec[3]<<24) | (spec[0]<<16) | (spec[1]<<8) | spec[2];
	vertices[0][5] = ((int)(win[2] + offset)) << 16;
	vertices[0][6] = (col[3]<<24) | (col[0]<<16) | (col[1]<<8) | col[2];
	vertices[0][7] = 0;

	win = VB->Win.data[v1];
	col = ( ctx->TriangleCaps & DD_FLATSHADE ) ? &(colorArray[pv][0]) :  &(colorArray[v1][0]);
	spec = &(specularArray[v1][0]);

	vertices[1][0] = 0;
	vertices[1][1] = tc[v1][0] * win[3];
	vertices[1][2] = tc[v1][1] * win[3];
	vertices[1][3] = win[3];
	vertices[1][4] = (spec[3]<<24) | (spec[0]<<16) | (spec[1]<<8) | spec[2];
	vertices[1][5] = ((int)(win[2] + offset)) << 16;
	vertices[1][6] = (col[3]<<24) | (col[0]<<16) | (col[1]<<8) | col[2];
	vertices[1][7] = 0;
#endif

	/* ================================ */

	DMAGETPTR( 3*8+2 );

	/* calculate coords, ooa */
	win = VB->Win.data[v0];
	xx[0] = (int)((win[0]-ix)*4);
	yy[0] = (int)((win[1]-iy)*4);

	win = VB->Win.data[v1];
	xx[1] = (int)((win[0]+ix)*4);
	yy[1] = (int)((win[1]+iy)*4);

	win = VB->Win.data[v0];
	xx[2] = (int)((win[0]+ix)*4);
	yy[2] = (int)((win[1]+iy)*4);

	ooa = 0.25 * 0.25 * ( ( xx[1] - xx[0] ) * ( yy[0] - yy[2] ) +
			      ( yy[1] - yy[0] ) * ( xx[2] - xx[0] ) );
	ooa = 1.0 / ooa;

	/* program the three vertices using precalculated values */
	dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_1_S ) | (6<<16) );
	for ( j = 1 ; j < 7 ; j++ ) {
		dma_ptr[j] = vertices[0][j];
	}
	x = xx[0];
	y = ( mach64DB->height << 2 ) - yy[0];
	dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
	dma_ptr += 8;

	dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_2_S ) | (6<<16) );
	for ( j = 1 ; j < 7 ; j++ ) {
		dma_ptr[j] = vertices[1][j];
	}
	x = xx[1];
	y = ( mach64DB->height << 2 ) - yy[1];
	dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
	dma_ptr += 8;

	dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_3_S ) | (6<<16) );
	for ( j = 1 ; j < 7 ; j++ ) {
		dma_ptr[j] = vertices[0][j];
	}
	x = xx[2];
	y = ( mach64DB->height << 2 ) - yy[2];
	dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
	dma_ptr += 8;

	/* kick off triangle, update dma buffer directly */
	DMAOUTFLOAT( MACH64_ONE_OVER_AREA_UC, ooa );

	mach64glx.dma_buffer->bufferDwords += 3*8+2;

	/* ================================ */

	DMAGETPTR( 1*8+2 );

	/* calculate coords, ooa */
	win = VB->Win.data[v1];
	xx[2] = (int)((win[0]-ix)*4);
	yy[2] = (int)((win[1]-iy)*4);

	/* reprogram the third vertex using precalculated values */
	dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_3_S ) | (6<<16) );
	for ( j = 1 ; j < 7 ; j++ ) {
		dma_ptr[j] = vertices[1][j];
	}
	x = xx[2];
	y = ( mach64DB->height << 2 ) - yy[2];
	dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
	dma_ptr += 8;

	/* kick off triangle, update dma buffer directly */
	DMAOUTFLOAT( MACH64_ONE_OVER_AREA_UC, -ooa );

	mach64glx.dma_buffer->bufferDwords += 1*8+2;
}


static void mach64Points( GLcontext *ctx, GLuint first, GLuint last )
{
	const struct vertex_buffer *VB;
	float		ooa, offset, sz;
	int		i, j;
	GLubyte 	*col, *spec;
	GLfloat 	(*tc)[4];
	GLfloat 	*win;
	int		x, y;
	int		xx[3], yy[3];			/* 2 fractional bits for hardware */
	GLubyte		(*colorArray)[4];		/* different for two sided lighting */
	GLubyte		(*specularArray)[4];		/* different for two sided lighting */
	hwUI32		vertex[8];
	DMALOCALS;

	mach64glx.c_points++;
	VB = ctx->VB;
	sz = ctx->Point.Size * 0.5;

	if ( ctx->TriangleCaps & DD_TRI_OFFSET ) {
		offset = ctx->PolygonZoffset;
	} else {
		offset = 0;
	}

	if ( ctx->TriangleCaps & DD_TRI_LIGHT_TWOSIDE ) {
		int		facing;

		facing = ( ooa > 0.0 ) ^ ctx->Polygon.FrontBit;
		colorArray = VB->Color[facing]->data;
		specularArray = VB->Spec[facing];
	} else {
		colorArray = VB->Color[0]->data;
		specularArray = VB->Spec[0];
	}

	tc = VB->TexCoordPtr[0]->data;

	for ( i = first ; i <= last ; i++ ) {

		if ( VB->ClipMask[i] == 0 ) {

			/* precalculate the setup engine block */
			win = VB->Win.data[i];
			col = &(colorArray[i][0]);
			spec = &(specularArray[i][0]);

#ifdef __PPC___
			vertex[0] = 0;
			vertex[1] = SWAP(*(int *)&tc[i][0]);
			vertex[2] = SWAP(*(int *)&tc[i][1]);
			vertex[3] = SWAP(*(int *)&win[3]);
			vertex[4] = (spec[2]<<24) | (spec[1]<<16) | (spec[0]<<8) | spec[3];
			vertex[5] = SWAP(((int)(win[2] + offset)) << 16);
			vertex[6] = (col[2]<<24) | (col[1]<<16) | (col[0]<<8) | col[3];
			vertex[7] = 0;
#else
			vertex[0] = 0;
			vertex[1] = tc[i][0] * win[3];
			vertex[2] = tc[i][1] * win[3];
			vertex[3] = win[3];
			vertex[4] = (spec[3]<<24) | (spec[0]<<16) | (spec[1]<<8) | spec[2];
			vertex[5] = ((int)(win[2] + offset)) << 16;
			vertex[6] = (col[3]<<24) | (col[0]<<16) | (col[1]<<8) | col[2];
			vertex[7] = 0;
#endif

			/* ================================ */

			DMAGETPTR( 3*8+2 );

			/* calculate coords, ooa */
			xx[0] = (int)((win[0]-sz)*4);
			yy[0] = (int)((win[1]-sz)*4);

			xx[1] = (int)((win[0]+sz)*4);
			yy[1] = (int)((win[1]+sz)*4);

			xx[2] = (int)((win[0]+sz)*4);
			yy[2] = (int)((win[1]-sz)*4);

			ooa = 0.25 * 0.25 * ( ( xx[1] - xx[0] ) * ( yy[0] - yy[2] ) +
					      ( yy[1] - yy[0] ) * ( xx[2] - xx[0] ) );
			ooa = 1.0 / ooa;

			/* program the three vertices using precalculated values */
			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_1_S ) | (6<<16) );
			for ( j = 1 ; j < 7 ; j++ ) {
				dma_ptr[j] = vertex[j];
			}
			x = xx[0];
			y = ( mach64DB->height << 2 ) - yy[0];
			dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
			dma_ptr += 8;

			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_2_S ) | (6<<16) );
			for ( j = 1 ; j < 7 ; j++ ) {
				dma_ptr[j] = vertex[j];
			}
			x = xx[1];
			y = ( mach64DB->height << 2 ) - yy[1];
			dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
			dma_ptr += 8;

			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_3_S ) | (6<<16) );
			for ( j = 1 ; j < 7 ; j++ ) {
				dma_ptr[j] = vertex[j];
			}
			x = xx[2];
			y = ( mach64DB->height << 2 ) - yy[2];
			dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
			dma_ptr += 8;

			/* kick off triangle, update dma buffer directly */
			DMAOUTFLOAT( MACH64_ONE_OVER_AREA_UC, ooa );

			mach64glx.dma_buffer->bufferDwords += 3*8+2;

			/* ================================ */

			DMAGETPTR( 1*8+2 );

			/* calculate coords, ooa */
			xx[2] = (int)((win[0]-sz)*4);
			yy[2] = (int)((win[1]+sz)*4);

			/* reprogram the third vertex using precalculated values */
			dma_ptr[0] = SWAP( ADRINDEX( MACH64_VERTEX_3_S ) | (6<<16) );
			for ( j = 1 ; j < 7 ; j++ ) {
				dma_ptr[j] = vertex[j];
			}
			x = xx[2];
			y = ( mach64DB->height << 2 ) - yy[2];
			dma_ptr[7] = SWAP( (x<<16) | (y&0xffff) );
			dma_ptr += 8;

			/* kick off triangle, update dma buffer directly */
			DMAOUTFLOAT( MACH64_ONE_OVER_AREA_UC, -ooa );

			mach64glx.dma_buffer->bufferDwords += 1*8+2;
		}
	}
}


/*
 * Null primitive functions for performance testing:
 */
static void mach64_null_quad( GLcontext *ctx, GLuint v0, GLuint v1, GLuint v2, GLuint v3, GLuint pv )
{
	/* intentionally empty: the quad is discarded */
	(void) ctx;
	(void) v0;
	(void) v1;
	(void) v2;
	(void) v3;
	(void) pv;
}
static void mach64_null_triangle( GLcontext *ctx, GLuint v0, GLuint v1, GLuint v2, GLuint pv )
{
	/* intentionally empty: the triangle is discarded */
	(void) ctx;
	(void) v0;
	(void) v1;
	(void) v2;
	(void) pv;
}
static void mach64_null_line( GLcontext *ctx, GLuint v0, GLuint v1, GLuint pv )
{
	/* intentionally empty: the line is discarded */
	(void) ctx;
	(void) v0;
	(void) v1;
	(void) pv;
}
static void mach64_null_points( GLcontext *ctx, GLuint first, GLuint last )
{
	/* intentionally empty: the points are discarded */
	(void) ctx;
	(void) first;
	(void) last;
}

/*
 * At the moment, we only have monolithic rendering functions.  These will
 * be changed to specialized functions for the current rendering state.
 */
/*
 * Install the quad, triangle, line and points hooks in ctx->Driver.
 * When mach64glx.nullprims is set the empty stand-ins are installed
 * (performance testing); otherwise the accelerated functions are used.
 */
void mach64DDChooseRenderState( GLcontext *ctx )
{
	if ( !mach64glx.nullprims ) {
		/* normal operation: hardware accelerated primitives */
		ctx->Driver.PointsFunc = mach64Points;
		ctx->Driver.LineFunc = mach64Line;
		ctx->Driver.TriangleFunc = mach64Triangle;
		ctx->Driver.QuadFunc = mach64Quad;
	} else {
		/* performance testing: primitives that do nothing */
		ctx->Driver.PointsFunc = mach64_null_points;
		ctx->Driver.LineFunc = mach64_null_line;
		ctx->Driver.TriangleFunc = mach64_null_triangle;
		ctx->Driver.QuadFunc = mach64_null_quad;
	}
}


/*
 * Local Variables:
 * mode: c
 * tab-width: 8
 * c-basic-offset: 8
 * End:
 */
