/*---------------------------------------------------------------------
 *        [ Copyright (c) 1999 Alpha Processor Inc.] - Unpublished Work
 *          All rights reserved
 * 
 *    This file contains source code written by Alpha Processor, Inc.
 *    It may not be used without express written permission. The
 *    expression of the information contained herein is protected under
 *    federal copyright laws as an unpublished work and all copying
 *    without permission is prohibited and may be subject to criminal
 *    and civil penalties. Alpha Processor, Inc.  assumes no
 *    responsibility for errors, omissions, or damages caused by the use
 *    of these programs or from use of the information contained herein.
 *  
 *-------------------------------------------------------------------*/
/* memory test core routines, coded in assembler for maximum control and
 * bandwidth.  These routines couple with driver C routines in mem_tests.c */


#include <alpha/regdef.h>

/* bodge for wh64, which isn't covered by this version of gas.
 * Each macro emits the instruction word by hand with .long:
 *   bits <31:26> = 0x18, bits <25:21> = 0x1F, bits <20:16> = number of
 *   the register holding the address, low 16 bits = 0xF800.
 * NOTE(review): the register numbers 0x01 and 0x04 presume that
 * <alpha/regdef.h> maps t0 to $1 and t3 to $4 -- confirm against the
 * header in use. */
#define WH64_t0	.long ( (0x18 << 26) | (0x1F << 21) | (0x01 << 16) | 0xF800 )
#define WH64_t3	.long ( (0x18 << 26) | (0x1F << 21) | (0x04 << 16) | 0xF800 )

	.text
	.arch	ev6
	.set noat

/*----------------------------------------------------------------------*/
/* uniq_asm */

/* Fill with address = data.  Currently, the reading back is done using 
 * the C routine, just to double-check that all is good. */

/* void uniq_wr_asm( ulong *start, ulong bytes )
 *
 * Fill a region with "address = data": every quadword in the region is
 * written with its own address.
 *
 * In:   a0 = start address
 *       a1 = length in bytes
 * Out:  v0 = a1 (each address is accessed once)
 * Uses: t0 = write pointer (also the value stored)
 *       t3 = quadwords still to write
 *       t4 = passes still to make through the 8x unrolled loop
 *
 * Critical assumption: 0mod8 bytes will be written here.  Only whole
 * quadwords (bytes >> 3) are stored; a zero length stores nothing.
 */
	.align	4
	.globl	uniq_wr_asm
	.ent	uniq_wr_asm
uniq_wr_asm:

	bis	a0, zero, t0		// t0 = write pointer and data
	srl	a1, 3, t3		// Number of quads to write
	srl	a1, 6, t4		// Times to traverse 8x unrolled loop
	beq	t4, $UniqTail		// fewer than 8 quads: skip unrolled part

	.align	4
$UniqUnrolled:
	stq	t0, 0(t0)		// 1
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 2
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 3
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 4
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 5
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 6
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 7
	addq	t0, 8, t0
	stq	t0, 0(t0)		// 8
	addq	t0, 8, t0

	subq	t3, 8, t3		// eight fewer quads outstanding
	subq	t4, 1, t4
	bgt	t4, $UniqUnrolled

$UniqTail:
	beq	t3, $UniqDone		// no leftover quads after unrolled part
$UniqTailLoop:
	stq	t0, 0(t0)		// leftover quads, one at a time
	addq	t0, 8, t0
	subq	t3, 1, t3
	bgt	t3, $UniqTailLoop

$UniqDone:
	bis	a1, zero, v0		/* each address accessed once */
	ret	(ra)
	.end	uniq_wr_asm

#if 0
/* this is DU assembler for a copy loop that goes very well... */
L$31:
      ldl     r31, 256(r2)
      ldt     f1, (r2)
      lda     r6, 159(r3)
      lda     r8, -64(sp)
      ldt     f10, 8(r2)
      ldt     f11, 16(r2)
      cmplt   r6, r10, r7
      addl    j, 4, j                                 # r1, 4, r1  
      ldt     f12, 24(r2)                                   
      cmoveq  r7, r8, r6
      cmplt   j, r11, r16                             # r1, r11, r16
      lda     r2, 32(r2)
      lda     r3, 32(r3)
      wh64    (r6)                                   
      stt     f1, -32(r3)
      unop
      stt     f10, -24(r3)
      stt     f11, -16(r3)
      unop
      unop
      stt     f12, -8(r3)
      bne     r16, L$31                        
#endif

/*----------------------------------------------------------------------*/

/* Alternate between 0x0 and 0xfffff...f to create the greatest possible
 * oscillation in bus data lines.
 *
 * In:   a0 = start address
 *       a1 = length in bytes
 * Out:  v0 = a1 (each address is accessed once)
 * Uses: t0 = write pointer          t1 = 0xFFFF FFFF FFFF FFFF
 *       t2 = end of current chunk   t3 = write-hint pointer
 *       t4 = end of region          t5 = scratch for comparisons
 *       t6 = 0x5555 5555 5555 5555  t7 = 0xAAAA AAAA AAAA AAAA
 *
 * The region is processed in chunks of at most 32K.  For each chunk a
 * WH64 hint is issued for every 64-byte block, then every block is
 * filled with the eight-quadword pattern 0, 0, F.., F.., 5.., A.., A.., 5..
 *
 * A chunk never extends beyond the end of the region, and a zero length
 * touches nothing.  Work is still done in whole 64-byte blocks, so the
 * start should be 64-byte aligned and the length a multiple of 64;
 * otherwise the block straddling either end of the region is hinted and
 * overwritten in full.
 */

	.text
	.align	4
	.globl	energy_wr_asm
	.ent	energy_wr_asm
energy_wr_asm:

	bis	a0, zero, t0		/* t0 = current write pointer */
	addq	a0, a1, t4		/* calculate end address */
	beq	a1, en_done		/* empty region: hint and write nothing */

	ornot	zero, zero, t1		/* all F's for between quadwords */

	/* produce A's and 5's for transitions within the quadword */

	lda	t7, 0x5555(zero)
	sll	t7, 16, t6		/* t6 = 0x5555 0000 */
	bis	t7, t6, t6		/* t6 = 0x5555 5555 */
	sll	t6, 32, t7		/* t7 = 0x5555 5555 0000 0000 */
	bis	t7, t6, t6		/* t6 = 0x5555 5555 5555 5555 */
	ornot	zero, t6, t7		/* t7 = 0xAAAA AAAA AAAA AAAA */


	/* Write hint loop, do this every 32K (size of dcache set) */
en_wh:
	/* construct chunk end address from current pointer */
	lda	t2, 4096(zero)		/* scaled add means 32K */
	s8addq	t2, t0, t2		/* t2 = t0 + 32K */
	subq	t2, t4, t5		/* > 0 if the chunk overshoots the region */
	cmovgt	t5, t4, t2		/* ...in which case stop at the region end */
	bis	t0, zero, t3		/* hint pointer starts at the write pointer */

en_whloop:				/* one hint per 64-byte block of the chunk */
	WH64_t3
	addq	t3, 64, t3		/* one cacheline = 64 bytes */
	subq	t2, t3, t5
	bgt	t5, en_whloop		/* go round again if not at chunk end */

en_wrloop:				/* Store a cache line */
	stq	zero, 0(t0)		/* bcache is 128 bits */
	stq	zero, 8(t0)		/* so transitions on that size */

	stq	t1, 16(t0)
	stq	t1, 24(t0)

	stq	t6, 32(t0)		/* system bus is 64 bits */
	stq	t7, 40(t0)		/* so make AAA...555 transitions */

	stq	t7, 48(t0)		/* and complement of above for bcache */
	stq	t6, 56(t0)

	addq	t0, 64, t0		/* next cache block */
	subq	t2, t0, t5		/* are we done with this chunk? */

	bgt	t5, en_wrloop		/* go round again if not at chunk end */

	/* if we're here, we either need to WH64 another chunk, or return */
	subq	t4, t0, t5
	bgt	t5, en_wh

en_done:
	bis	a1, zero, v0		/* each address accessed once */
	ret	(ra)
	.end	energy_wr_asm



/*----------------------------------------------------------------------*/
/* dpattern_asm - replicate a pre-generated 64-byte fill pattern across
 * a region of memory.
 *
 * In:   a0 = start address of the region
 *       a1 = length of the region in bytes
 *       a2 = address of the eight-quadword pattern
 * Out:  v0 = a1 (each address is accessed once)
 * Uses: t0 = write pointer, t1-t8 = pattern quadwords,
 *       t9 = bytes left to the end address, t10 = end address
 *
 * The end test sits at the bottom of the loop, so at least one 64-byte
 * block is always written and the region is covered in whole blocks.
 */

	.text
	.align 	4
	.globl	dpattern_asm
	.ent	dpattern_asm

dpattern_asm:

	/* region bounds */
	addq	a0, a1, t10		/* t10 = first address past the region */
	bis	zero, a0, t0		/* t0 = write pointer */

	/* fetch the whole pattern once; it stays in registers thereafter */
	ldq	t1, 0(a2)
	ldq	t2, 8(a2)
	ldq	t3, 16(a2)
	ldq	t4, 24(a2)
	ldq	t5, 32(a2)
	ldq	t6, 40(a2)
	ldq	t7, 48(a2)
	ldq	t8, 56(a2)

dpattern_block:

	/* first half of the block */
	stq	t1, 0(t0)
	stq	t2, 8(t0)
	stq	t3, 16(t0)
	stq	t4, 24(t0)

	/* second half of the block */
	stq	t5, 32(t0)
	stq	t6, 40(t0)
	stq	t7, 48(t0)
	stq	t8, 56(t0)

	lda	t0, 64(t0)		/* step to the next 64-byte block */
	subq	t10, t0, t9		/* t9 = bytes still to fill */

	bgt	t9, dpattern_block	/* more to do while short of the end */

	bis	a1, zero, v0		/* each addr accessed once */
	ret	(ra)
	.end	dpattern_asm



/*----------------------------------------------------------------------*/
/* icache test implementation */

/* USE_FP_REGS selects the instruction template and whether icache_rd_asm
 * preserves $f0-$f7 around it.  It is tested by VALUE (#if) so that 0
 * really turns the floating-point variant off.  The value is 1 because
 * the previous definedness test always built the floating-point variant;
 * this keeps the generated code the same as before. */
#define USE_FP_REGS 1

/* noparray - 64 bytes (16 instructions) of template code.  It has no
 * return of its own; it is followed in this file by the single return
 * instruction at retinsn. */
	.text
	.align 4
	.globl noparray
	.ent noparray
noparray:
#if USE_FP_REGS				/* USE_FP_REGS */
	/* integer ops leave $0-$7 unchanged; FP ops overwrite $f0-$f7 */
	addq	$0, 0, $0		/* 16 bytes */
	addf	$f31, $f31, $f0
	mulq	$1, 1, $1
	mulf	$f31, $f31, $f1

	addq	$2, 0, $2		/* 32 bytes */
	addf	$f31, $f31, $f2
	mulq	$3, 1, $3
	mulf	$f31, $f31, $f3

	addq	$4, 0, $4		/* 48 bytes */
	addf	$f31, $f31, $f4
	mulq	$5, 1, $5
	mulf	$f31, $f31, $f5

	addq	$6, 0, $6		/* 64 bytes */
	addf	$f31, $f31, $f6
	mulq	$7, 1, $7
	mulf	$f31, $f31, $f7
#else					/* USE_FP_REGS */
	/* every instruction ORs a register with itself: no state changes */
	bis	$0, $0, $0		/* 16 bytes */
	bis	$1, $1, $1
	bis	$2, $2, $2
	bis	$3, $3, $3

	bis	$4, $4, $4		/* 32 bytes */
	bis	$5, $5, $5
	bis	$6, $6, $6
	bis	$7, $7, $7

	bis	$0, $0, $0		/* 48 bytes */
	bis	$1, $1, $1
	bis	$2, $2, $2
	bis	$3, $3, $3

	bis	$4, $4, $4		/* 64 bytes */
	bis	$5, $5, $5
	bis	$6, $6, $6
	bis	$7, $7, $7
#endif					/* USE_FP_REGS */
	.end	noparray

	/* retinsn - a lone return instruction, exported by name */
	.globl	retinsn
retinsn:
        ret     (ra)

/* icache_rd_asm - execute a previously prepared instruction stream.
 *
 * In:   a0 = address of the first 32K block of instructions
 *       a1 = length of the instruction stream in bytes
 * Out:  v0 = 2 * a1 (every block is passed over twice)
 * Uses: t7 = saved return address, t8 = 32K, t9 = bytes left to run;
 *       a0 is advanced as the blocks are executed.
 *
 * The end test sits at the bottom of the loop, so the first block is
 * always executed.
 */
	.text
	.align 4
        .globl  icache_rd_asm
        .ent    icache_rd_asm
icache_rd_asm:

	/* the setup for this routine has already written a stream of 
	 * identity instructions to memory.  This code executes it.  Every
	 * 32K, there is a return instruction, which brings us back here.
	 * I run every 32K block twice, so that first time it is fetched 
	 * from memory and second time it is answered from the Icache */

#if USE_FP_REGS				/* USE_FP_REGS */
	/* save some stuff on the stack - GCC likes to use FP registers */
	subq	sp, 64, sp
	stt	$f0, 0(sp)
	stt	$f1, 8(sp)
	stt	$f2, 16(sp)
	stt	$f3, 24(sp)
	stt	$f4, 32(sp)
	stt	$f5, 40(sp)
	stt	$f6, 48(sp)
	stt	$f7, 56(sp)
#endif					/* USE_FP_REGS */

	/* first, save away our return address from here: the calls below
	 * overwrite ra */
	bis	ra, zero, t7		/* t7 (r8) is unused */
	lda	t8, 32(zero)
	sll	t8, 10, t8		/* construct 32K in t8 */
	bis	a1, zero, t9		/* t9 = bytes still to execute */

icache_rdloop:

	jsr	ra, (a0)		/* first call from mem */
	jsr	ra, (a0)		/* second call from icache */

        addq    a0, t8, a0              /* next 32K block */
	subq	t9, t8, t9		/* 32K less to go */

        bgt     t9, icache_rdloop	/* go round again if not at end addr */

#if USE_FP_REGS				/* USE_FP_REGS */
	/* restore FP regs */
	ldt	$f0, 0(sp)
	ldt	$f1, 8(sp)
	ldt	$f2, 16(sp)
	ldt	$f3, 24(sp)
	ldt	$f4, 32(sp)
	ldt	$f5, 40(sp)
	ldt	$f6, 48(sp)
	ldt	$f7, 56(sp)
	addq	sp, 64, sp
#endif					/* USE_FP_REGS */

	bis	t7, zero, ra		/* restore return address */
	addq	a1, a1, v0		/* passed over twice */
        ret     (ra)

        .end    icache_rd_asm


/*===========================================================================*/
/*= graycode_memtest - do gray code memtest                                 =*/
/*===========================================================================*/
/*                                                                           */
/* OVERVIEW:                                                                 */
/*                                                                           */
/*      This is the time critical section of Graycode memtest.  It takes     */
/*      as input the starting address and length in bytes of the section     */
/*      of memory to test.  The test is divided into three loops.  The       */
/*      first loop writes alternating graycodes/inverse graycodes            */
/*      throughout the memory under test.  The second loop reads each        */
/*      location and writes back the inverse.  Both of those loops are in    */
/*      gc_wr_asm below.  The third loop, which reads all memory, is the     */
/*      separate routine gc_rd_asm.  Note that no explicit data checking     */
/*      is performed.  The EDC/ECC logic will detect any bit errors which    */
/*      occur.  The fault model is not to detect address shorts but to       */
/*      stress the memory path with high throughput and state transitions.   */
/*                                                                           */
/* inputs:                                                                   */
/*              a0 - start address                                           */
/*              a1 - length, number of bytes to test                         */
/*              ra - return address                                          */
/*                                                                           */
/* returns:                                                                  */
/*              v0 - 3 * a1 (one load and two stores per byte tested)        */
/*                                                                           */
/* register usage (nothing is saved or restored):                            */
/*              v0  - inner loop counter (4 down to 0), then return value    */
/*              t0  - graycode index j in loop 1; data in loop 2             */
/*              t1  - data scratch in loop 2                                 */
/*              t2  - data scratch in loop 2                                 */
/*              t3  - graycode calculation in loop 1; data in loop 2         */
/*              t9  - end address of the test array                          */
/*              t10 - test address                                           */
/*              t11 - result of the end-of-memory comparison                 */
/*              AT  - constant -1                                            */
/*                                                                           */
/* Both loops advance 32 bytes per pass and test for the end at the         */
/* bottom, so at least 32 bytes are always touched and the length is         */
/* effectively rounded up to a multiple of 32.                               */
/*                                                                           */

	/* size_t gc_wr_asm( void *start_addr, size_t bytes ) */

	.globl	gc_wr_asm
	.ent	gc_wr_asm

gc_wr_asm:			

	lda     AT, -1(zero)		/* load AT = -1 for inverse graycode */
	addq	a0, a1, t9		/* t9 marks end of test array */


	/*------------------------------------------------------------*/
	/* First loop - fill with graycode and inverse graycode */

	bis     zero, a0, t10		/* set the start address*/
        bis     zero, zero, t0  	/* clear t0=j      */

        /* This routine fills memory, writing 4x32-bit graycodes
         * then 4x32-bit inverse graycodes */

	/* This test works by writing 128bits (=width of bus to bcache)
	 * and then the inverse 128 bits, which is why the loop is structured
	 * in two parts */

gc_loop1:
        bis     zero, 4, v0     /* four graycode longwords follow */
L11:    srl     t0, 1, t3       /* calculate the graycode - (j>>1)*/
        xor     t3, t0, t3      /* graycode - (j>>1)^j */
        stl     t3, 0(t10)      /* store the graycode*/
        addq    t0, 1, t0       /* j++*/
        addq    t10, 4, t10     /* advance test address one longword */
        subq    v0, 1, v0
        bne     v0, L11

        bis     zero, 4, v0     /* four inverse graycode longwords follow */
L12:    srl     t0, 1, t3       /* calculate the graycode - (j>>1)*/
        xor     t3, t0, t3      /* graycode - (j>>1)^j */
        xor     t3, AT, t3      /* inverse graycode - (j>>1)^j^-1*/
        stl     t3, 0(t10)      /* store the inverse graycode*/
        addq    t0, 1, t0       /* j++*/
        addq    t10, 4, t10     /* advance test address one longword */
        subq    v0, 1, v0
        bne     v0, L12

        cmpult  t10, t9, t11	/* end of test memory?*/
        bne     t11, gc_loop1	/* keep testing if not at end of memory*/


	/*------------------------------------------------------------*/
	/* Second loop - load, invert, write back */

	/* ornot computes first-operand OR (NOT second-operand), so the
	 * value to be inverted must be the SECOND operand and zero the
	 * first.  With the operands the other way round the result is
	 * all ones whatever was loaded. */

	bis	zero, a0, t10		/* start address */

gc_loop2:
	ldq	t0, 0(t10)
	ornot	zero, t0, t0		/* t0 = ~t0 */
	stq	t0, 0(t10)

	ldq	t1, 0x8(t10)
	ornot	zero, t1, t1		/* t1 = ~t1 */
	stq	t1, 0x8(t10)

	ldq	t2, 0x10(t10)
	ornot	zero, t2, t2		/* t2 = ~t2 */
	stq	t2, 0x10(t10)

	ldq	t3, 0x18(t10)
	ornot	zero, t3, t3		/* t3 = ~t3 */
	stq	t3, 0x18(t10)

	addq	t10, 0x20, t10		/* another 32 bytes done */
	cmpult	t10, t9, t11
	bne	t11, gc_loop2


	/*------------------------------------------------------------*/
	/* Clean up and return */

	mulq	a1, 3, v0		/* byte count (1xload, 2xstore) */
	ret	zero, (ra)		/* back we go */

	.end	gc_wr_asm



	/* size_t gc_rd_asm( void *start_addr, size_t bytes )
	 *
	 * Read pass of the graycode test.
	 *   a0 = start address, a1 = length in bytes
	 *   v0 = a1 on return (one load per byte tested)
	 * The enabled variant advances 64 bytes per pass and tests for the
	 * end at the bottom of the loop, so at least 64 bytes are read and
	 * the length is effectively rounded up to a multiple of 64. */

	.globl	gc_rd_asm
	.ent	gc_rd_asm

gc_rd_asm: 				/* Read back only; the enabled variant does no compare */

#if 1

	/* just read the stuff back, rely on ECC logic to trap errors */

	bis	a0, zero, t10		/* base address */
	addq	a0, a1, t9		/* end address */

gc_rdloop:

	/* the loaded values are never examined; only the reads matter */
	ldq	t0, 0(t10)		/* first 32 bytes */
	ldq	t1, 0x8(t10)
	ldq	t2, 0x10(t10)
	ldq	t3, 0x18(t10)

	ldq	t4, 0x20(t10)		/* second 32 bytes */
	ldq	t5, 0x28(t10)
	ldq	t6, 0x30(t10)
	ldq	t7, 0x38(t10)

	addq	t10, 0x40, t10		/* another 64 bytes done */
	cmpult	t10, t9, t11		/* t11 = 1 while short of the end address */
	bne	t11, gc_rdloop


	/* clean up and return */

	mulq	a1, 1, v0		/* 1x load op per test element */
	ret	zero, (ra)

#else

	/* Disabled alternative, kept for reference.  It scales a1 by 8,
	 * inverts the quadwords of the region starting at a0 and of the
	 * region that follows it, then compares the two regions quadword
	 * by quadword.  It returns 0 if they match and -1 otherwise. */

	sll     a1, 3, a1
	bis     zero, a0, t10
	bis     zero, zero, v0
	addq    a0, a1, t11

L_COMP1:
	ldq     t1, 0(t10)
	ornot   zero, t1, t1
	stq     t1, 0(t10)
	addq    t10, 8, t10
	addq    v0, 8, v0

	cmpult  v0, a1, t3     /* end of test memory? */
	bne     t3, L_COMP1     /* keep testing if not at end of memory */
	bis     zero, zero, v0
L_COMP2:
	ldq     t2, 0(t11)
	ornot   zero, t2, t2
	stq     t2, 0(t11)
	addq    t11, 8, t11
	addq    v0, 8, v0

	cmpult   v0, a1, t3     /* end of test memory? */
	bne     t3, L_COMP2     /* keep testing if not at end of memory */

	bis     zero, a0, t10
	bis     zero, zero, v0
	addq    a0, a1, t11
L_COMP3:
	ldq     t1, 0(t10)
	ldq     t2, 0(t11)
	xor     t1, t2, t1      
	bne     t1, L_COMPERR
	addq    t10, 8, t10
	addq    t11, 8, t11
	addq    v0, 8, v0

	cmpult   v0, a1, t3     /* end of test memory? */
	bne     t3, L_COMP3     /* keep testing if not at end of memory */

	/* If we are here, we have finished an iteration.  Return */

	bis	zero, 0, v0	/* return value indicating success */
	ret	zero, (ra)

L_COMPERR:
	ornot	zero, zero, v0	/* generates not 0 = -1 as return code */
	ret	zero, (ra)

#endif

	.end	gc_rd_asm
