#include <stdio.h>
#include <stdlib.h>

#include "mac_video.h"

#include <Retrace.h>

void mac_video_get_framebuffer(struct framebuffer *fb) {
    GDHandle gdevice;
    PixMapHandle framebuffer_pixmap;
    gdevice = GetGDevice();
    HLock((Handle)gdevice);
    framebuffer_pixmap = (*gdevice)->gdPMap;
    HLock((Handle)framebuffer_pixmap);
    fb->rowbytes = ((*framebuffer_pixmap)->rowBytes) & 0x3FFF;
    fb->b = (*framebuffer_pixmap)->baseAddr;
    fb->bpp = (*framebuffer_pixmap)->pixelSize;
	fb->w = (*framebuffer_pixmap)->bounds.right - (*framebuffer_pixmap)->bounds.left;
	fb->h = (*framebuffer_pixmap)->bounds.bottom - (*framebuffer_pixmap)->bounds.top;
    HUnlock((Handle)framebuffer_pixmap);
    HUnlock((Handle)gdevice);
}

/* Description of the screen; filled in by mac_video_init() via
   mac_video_get_framebuffer(). */
struct framebuffer mac_video_screen;

/* Task record handed to SlotVInstall()/SlotVRemove(). */
VBLTask vbl_task;
/* Copy of mac_video_init()'s use_vbl argument; gates waiting and removal. */
int mac_video_vbl_enabled = 0;
/* Incremented by vbl_func() each time the VBL task runs. */
volatile uint32_t vbl_tick_count = 0;
/* Tick value that mac_video_wait_vbl() waits for vbl_tick_count to reach. */
uint32_t vbl_next_tick_count = 1;
/* Slot number used for both SlotVInstall() and SlotVRemove(). */
short vbl_slot_number;
/* Forward declarations of the VBL helpers defined below. */
short GetSlotNumber(GDHandle devHandle);
void vbl_func(VBLTask *recPtr);
void vbl_install(short slotNumber);

/*
 * Return the dCtlSlot field of the device control entry that belongs to
 * the driver reference number (gdRefNum) of the given graphics device.
 *
 * The handle returned by GetDCtlEntry() is dereferenced without a NULL
 * check.
 */
short GetSlotNumber(GDHandle devHandle) {
    AuxDCEHandle dce = (AuxDCEHandle)GetDCtlEntry((*devHandle)->gdRefNum);

    return (*dce)->dCtlSlot;
}
/*
 * VBL task routine (installed by vbl_install()).
 *
 * Bumps the global tick counter and sets the task's vblCount back to 1,
 * the same value vbl_install() starts it with.
 */
void vbl_func(VBLTask *recPtr) {
    vbl_tick_count = vbl_tick_count + 1;
    recPtr->vblCount = 1;
}

/* Graphics device that was current when mac_video_init() was entered;
   mac_video_cleanup() makes it current again. */
GDHandle gdevice_initial;
/* Main device; only assigned when mac_video_init() is called with a
   non-zero use_main_gdevice. */
GDHandle gdevice_main;
/* Port that was current when mac_video_init() was entered;
   mac_video_cleanup() makes it current again. */
GrafPtr port_initial;
/* Window created by create_fullscreen_window(). */
WindowPtr window;

void vbl_install(short slotNumber) {
    vbl_task.qType = vType;
    vbl_task.vblAddr = NewVBLProc(vbl_func);
    vbl_task.vblCount = 1;
    /* from Inside Macintosh Processes 1994.pdf
    	SlotVInstall: for external monitors (and built-in)
    	VInstall: for built-in, if does not have built-in, it is a fake VBL at 60.15 hz
     */
    SlotVInstall((QElemPtr)&vbl_task, slotNumber);
}

/*
 * Create a visible plainDBox color window covering qd.screenBits.bounds,
 * store it in the global `window`, make it the current port, and call
 * ShieldCursor() over its whole port rectangle (mac_video_cleanup() later
 * calls ShowCursor()).
 *
 * The result of NewCWindow() is used without a NULL check.
 * Multi-monitor setups have not been tested by the original author.
 */
void create_fullscreen_window(void) {
    Rect bounds;
    Point origin;

    bounds = qd.screenBits.bounds;
    origin.v = 0;
    origin.h = 0;

    window = NewCWindow(NULL, &bounds, "\pHello, Macintosh!", true, plainDBox, (WindowPtr)-1L, false, 0);
    SetPort(window);
    ShieldCursor(&window->portRect, origin);
}

/*
 * Set up full-screen video output.
 *
 * Saves the current graphics device and port (restored later by
 * mac_video_cleanup()), optionally switches to the main device, creates
 * the full-screen window, records the screen layout in mac_video_screen
 * and, when requested, installs the VBL task on the slot of the current
 * graphics device.
 *
 * w, h, bpp:        currently unused by this function.
 * use_vbl:          non-zero to install the VBL task; also stored in
 *                   mac_video_vbl_enabled.
 * use_main_gdevice: non-zero to make GetMainDevice() the current device.
 */
void mac_video_init(int w, int h, int bpp, int use_vbl, int use_main_gdevice) {
    gdevice_initial = GetGDevice();
    if (use_main_gdevice != 0) {
        gdevice_main = GetMainDevice();
        SetGDevice(gdevice_main);
    }
    GetPort(&port_initial);

    create_fullscreen_window();
    BeginUpdate(window); /* Disable screen updates */

    mac_video_vbl_enabled = use_vbl;
    mac_video_get_framebuffer(&mac_video_screen);

    if (!use_vbl) {
        return;
    }
    vbl_slot_number = GetSlotNumber(GetGDevice());
    vbl_install(vbl_slot_number);
}

/*
 * Busy-wait until the VBL tick counter reaches vbl_next_tick_count, then
 * set the next target to one past the counter's current value.
 *
 * Returns immediately when VBL support was not enabled in
 * mac_video_init().
 */
void mac_video_wait_vbl(void) {
    if (mac_video_vbl_enabled) {
        /* vbl_tick_count is volatile and is advanced by vbl_func(). */
        while (vbl_next_tick_count > vbl_tick_count) {
            ;
        }
        vbl_next_tick_count = vbl_tick_count + 1;
    }
}

/*
 * Undo mac_video_init(): remove the VBL task (if it was enabled), close
 * the update started in mac_video_init(), dispose of the full-screen
 * window, show the cursor again, and restore the graphics device and port
 * that were current when mac_video_init() was entered.
 *
 * NOTE(review): the routine descriptor created with NewVBLProc() in
 * vbl_install() is not disposed of here - confirm whether that is intended.
 */
void mac_video_cleanup(void) {
	if (mac_video_vbl_enabled) {
	    /* Must use the same slot number the task was installed with. */
	    SlotVRemove((QElemPtr)&vbl_task, vbl_slot_number);	
	}
    /* Matches the BeginUpdate(window) call in mac_video_init(). */
    EndUpdate(window);
    DisposeWindow(window);
    /* Counterpart of the ShieldCursor() call in create_fullscreen_window(). */
    ShowCursor();
    SetGDevice(gdevice_initial);
    SetPort(port_initial);
}

/*
 * Allocate an off-screen buffer of w x h pixels at bpp bits per pixel and
 * describe it in *fb.
 *
 * fb:  receives the buffer pointer, row stride in bytes, width, height
 *      and depth. The memory comes from malloc(); nothing in the visible
 *      part of this file frees it, so the caller owns it.
 * w:   width in pixels, must be > 0.
 * h:   height in pixels, must be > 0.
 * bpp: bits per pixel, must be > 0.
 *
 * The row stride is w*bpp bits rounded up to whole bytes, so depths below
 * 8 bpp no longer collapse to a zero-byte row. For depths that are a
 * multiple of 8 the stride is unchanged from the previous w*(bpp/8).
 *
 * On invalid geometry, size overflow, or allocation failure a message is
 * written to stderr and the process exits with status 1.
 */
void mac_video_init_backbuffer(struct framebuffer *fb, int w, int h, int bpp) {
    const size_t size_max = (size_t)-1;
    size_t row_bytes;

    if (w <= 0 || h <= 0 || bpp <= 0) {
        fprintf(stderr, "invalid backbuffer geometry\n");
        exit(1);
    }
    /* Reject sizes whose byte count cannot be represented in size_t. */
    if ((size_t)w > (size_max - 7) / (size_t)bpp) {
        fprintf(stderr, "backbuffer too large\n");
        exit(1);
    }
    row_bytes = ((size_t)w * (size_t)bpp + 7) >> 3;
    if ((size_t)h > size_max / row_bytes) {
        fprintf(stderr, "backbuffer too large\n");
        exit(1);
    }

    fb->b = malloc((size_t)h * row_bytes);
    if (!fb->b) {
        fprintf(stderr, "failed malloc\n");
        exit(1);
    }
    fb->rowbytes = (int)row_bytes;
    fb->w = w;
    fb->h = h;
    fb->bpp = bpp;
}

/* Forward declaration; the definition follows mac_video_blit() below. */
void blitter2x_slow(void *framebuffer, int framebuffer_rowbytes, void *buffer, int buffer_rowbytes, int height) ;

/*
 * Present a back buffer on the screen at 2x scale using blitter2x_slow().
 *
 * fb: source buffer. Each source row becomes two screen rows and each
 *     source pixel two screen pixels.
 *
 * blitter2x_slow() writes two screen rows for every row it is asked to
 * process, so the row count passed to it is the number of SOURCE rows. It
 * is limited to what the source holds and to half the screen height, so
 * the blit stays inside both buffers. If a doubled source row would not
 * fit into one screen row, nothing is drawn.
 */
void mac_video_blit(struct framebuffer *fb) {
    int src_rows = mac_video_screen.h / 2;

    if (fb->h < src_rows) {
        src_rows = fb->h;
    }
    if (src_rows <= 0) {
        return;
    }
    /* A doubled source row must fit within one screen row. */
    if ((fb->rowbytes << 1) > mac_video_screen.rowbytes) {
        return;
    }
    blitter2x_slow(mac_video_screen.b, mac_video_screen.rowbytes, fb->b, fb->rowbytes, src_rows);
}

/*
 * Portable 2x scaler for 16-bit pixels.
 *
 * framebuffer:          destination; receives 2*height rows.
 * framebuffer_rowbytes: destination stride in bytes. It may be larger than
 *                       2*buffer_rowbytes (padded rows); it should be a
 *                       multiple of 8 so the 64-bit stores stay aligned.
 * buffer:               source pixels, rows 4-byte aligned.
 * buffer_rowbytes:      source stride in bytes; whole groups of 8 bytes
 *                       (4 pixels) are converted per row.
 * height:               number of SOURCE rows to convert; values <= 0
 *                       convert nothing.
 *
 * Every 32-bit source word holds two pixels. Its upper half is replicated
 * into one output word and its lower half into the next, and the resulting
 * pair of words is written with a single 64-bit store to two consecutive
 * destination rows.
 *
 * Source and destination row addresses are computed from the row index and
 * the respective stride, so padded destination rows and source strides
 * that are not a multiple of 8 no longer skew the image.
 */
void blitter2x_slow(void *framebuffer, int framebuffer_rowbytes, void *buffer, int buffer_rowbytes, int height) {
    /* Two 32-bit words viewed as one double so each store is 64 bits wide. */
    union {
        double d;
        uint32_t w[2];
    } pair;
    const unsigned char *src_base = (const unsigned char *)buffer;
    unsigned char *dst_base = (unsigned char *)framebuffer;
    int groups_per_row = buffer_rowbytes >> 3; /* 8 source bytes per group */
    int row;

    for (row = 0; row < height; row++) {
        const uint32_t *src = (const uint32_t *)(src_base + (long)row * buffer_rowbytes);
        double *dst_row0 = (double *)(dst_base + (long)row * 2 * framebuffer_rowbytes);
        double *dst_row1 = (double *)((unsigned char *)dst_row0 + framebuffer_rowbytes);
        int words = groups_per_row << 1;

        while (words-- > 0) {
            uint32_t word = *src++;
            uint32_t hi = (word >> 16) & 0xffff;
            uint32_t lo = word & 0xffff;

            pair.w[0] = hi | (hi << 16);
            pair.w[1] = lo | (lo << 16);
            *dst_row0++ = pair.d;
            *dst_row1++ = pair.d;
        }
    }
}

/* TODO: work in progress. What remains is rearranging the loaded pixels
		so that a,b,c,d is written out as a,a,b,b,c,c,d,d.
*/
#if 1
/*
 * blitter2x: work-in-progress PowerPC assembly version of the 2x blitter.
 * No call to this routine is compiled in elsewhere in this file.
 *
 * In its current state every 32-bit source word is stored twice in a row
 * (pixels a,b come out as a,b,a,b), and every output row is written to two
 * consecutive destination scanlines. Doubling individual pixels is still
 * to be done.
 *
 * NOTE(review): r16-r31 are overwritten by lmw/mr and no save/restore of
 * them is visible in this body. These registers are callee-saved under the
 * usual PowerPC calling conventions - confirm how the compiler treats asm
 * functions.
 * NOTE(review): no blr is visible at the end of the body, and r3 is never
 * written although the function is declared to return unsigned int -
 * confirm.
 * NOTE(review): the y loop ends only when r6 reaches exactly 0 after
 * subtracting 2 each pass, so dst_height presumably has to be positive and
 * even - confirm.
 */
unsigned int asm blitter2x(
	register void *r3 /* dst */,
	register unsigned int r4 /* dst_rowbytes */, 
	register void *r5 /* src */,
	register int r6 /* dst_height */ )
{
    /* assume: dst_rowbytes == src_rowbytes*2
     * assume: dst_height == src_height*2
     * for each src scanline the enabled variant below repeats
     * (dst_rowbytes / 64) times:
     * - read the next 32 src bytes
     * - write 64 bytes to the current dst scanline (scale by 2 in x)
     * - write the same 64 bytes one scanline further down (scale by 2 in y)
     */
    /*  r3 dst
        r4 dst_rowbytes
        r5 src
        r6 dst_height
        r7 current dst scanline
        r8 next dst scanline
        r9 x loop count, moved into CTR */
    mr r7, r3
    add r8, r3, r4
y_loop:

#if 1
    /* write 64 bytes at a time */
    /* NOTE(review): for dst_rowbytes < 64 the count is 0; bdnz decrements
       CTR before testing it, so the loop would presumably not stop after
       one pass - confirm. */
    srwi r9, r4, 6
    mtctr r9
    x_loop:
        /* read 8 words (32 bytes) into r24..r31 */
        lmw r24, 0(r5)
        addi r5, r5, 32
        /* write 16 words (64 bytes) from r16..r31 */

		/* intended result:
		   r24: a,b
		   r16: a,a
		   r18: b,b
		   NOTE(review): r16, r17 and r18 computed here are overwritten by
		   the mr sequence right below, so these five instructions have no
		   effect on the output yet. The masks split at bits 16/17, whereas
		   blitter2x_interlaced uses 0..15 and 16..31 for a 16/16 split -
		   confirm the intended masks.
		 */
		rlwinm r16, r24,  0, 17, 31
		rlwimi r16, r24, 16,  0, 16
		mr r17, r16	
		rlwinm r18, r24, 16,  0, 16
		rlwimi r18, r24,  0, 17, 31

        /* Duplicate each loaded word into two adjacent registers. The
           copies go from low to high register numbers, so each source
           register is read before it is overwritten. r31 is left as
           loaded, so r30 and r31 both hold the last word. */
        mr r16, r24
        mr r17, r24
        mr r18, r25
        mr r19, r25
        mr r20, r26
        mr r21, r26
        mr r22, r27
        mr r23, r27     
        mr r24, r28
        mr r25, r28
        mr r26, r29
        mr r27, r29
        mr r28, r30
        mr r29, r30
        mr r30, r31
        /* store r16..r31 to the current scanline and to the one below */
        stmw r16, 0(r7)
        addi r7, r7, 64
        stmw r16, 0(r8)
        addi r8, r8, 64
#endif

#if 0
    /* disabled alternative: write 32 bytes at a time using FP registers */
    srwi r9, r4, 5
    mtctr r9
    x_loop:
		
	#if 1
        /* read 4 pixels = 8 bytes */
        lfd f0, 0(r5)
        /* read 4 pixels = 8 bytes */
        lfd f1, 8(r5)
        addi r5, r5, 16
        /* NOTE this is not correct but just testing */
	#else
        /* read 4 words (16 bytes): 31,30,29,28 */
        lmw r28, 0(r5)
		/*	31	30 	29 	28
			ab	cd	ef	gh
			
			27	26	25	24	23	22
			aa	bb	cc	dd	ee	ff
		*/
		/* TODO repack that into reg 22-27 like above, store to tmp space double then write double */
	#endif
	
        /* r7: upper scanline, write 16 pixels = 32 bytes */
        stfd f0, 0(r7)
        stfd f0, 8(r7)
        stfd f1, 16(r7)
        stfd f1, 24(r7)
        addi r7, r7, 32

        /* r8: next scanline, write 16 pixels = 32 bytes */
        stfd f0, 0(r8)      
        stfd f0, 8(r8)     
        stfd f1, 16(r8)    
        stfd f1, 24(r8)
        addi r8, r8, 32
#endif

        bdnz x_loop

    /* we wrote 2 scanlines; r7 and r8 each advanced by one row in the x
       loop, so adding one more row moves both to the next pair */
    add r7, r7, r4
    add r8, r8, r4

    /* two dst scanlines done */
    subi r6, r6, 2;
    cmpwi r6, 0
    bne y_loop
}

/*
 * blitter2x_interlaced: PowerPC assembly 2x blitter that doubles pixels
 * horizontally and writes only every other destination scanline.
 *
 * Each pass of the x loop reads 8 source bytes (four 16-bit pixels, per
 * the comments below), expands them to 16 bytes through the scratch memory
 * at r7, and stores those 16 bytes to the current destination scanline.
 * After a full row one destination scanline is skipped.
 *
 * NOTE(review): r13, r14 and r15 are overwritten and no save/restore is
 * visible; they are callee-saved (r13 may be reserved) under the usual
 * PowerPC calling conventions - confirm how the compiler treats asm
 * functions.
 * NOTE(review): no blr is visible at the end of the body, and r3 is never
 * written although the function is declared to return unsigned int -
 * confirm.
 * NOTE(review): the y loop ends only when r6 reaches exactly 0 after
 * subtracting 2 each pass, so dst_height presumably has to be positive and
 * even - confirm.
 */
unsigned int asm blitter2x_interlaced(
	register void *r3, /* dst */
	register uint32_t r4, /* dst_rowbytes */
	register void *r5, /* src */
	register int r6, /* dst_height */
	register uint32_t *r7 /* tmp scratch memory; offsets 0..15 are used */
 ) {
    /* assume: dst_rowbytes == src_rowbytes*2
     * assume: dst_height == src_height*2
     * we go through each src scanline and:
     * - read next 4 src pixels 
     * - write next 8 dst pixels (scale by 2 in x)
     * repeat for (src_rowbytes / (4 pixels * 2 bytes per pixel)) times,
     * then skip one dst scanline (the scanline below is not written)
     */
    /*  r3 dst
        r4 dst_rowbytes
        r5 src
        r6 dst_height
        r7 ptr to tmp mem
        r9 tmp register (x loop count, moved into CTR)
        r10 current dst scanline
        r11 next dst scanline (computed once, not used afterwards)
        r12..r15 pixel words being expanded */
    mr r10, r3
    add r11, r3, r4
y_loop:

    /* write 16 bytes at a time */
    srwi r9, r4, 4
    mtctr r9

    x_loop:
    
        /* read 4 pixels = 8 bytes */
        /* pixels: aa bb cc dd */
        /* bounce through tmp memory to move the 8 bytes from f0 into the
           integer registers r12 and r14 */
        lfd f0, 0(r5)
        addi r5, r5, 8
        stfd f0, 0(r7)
        lwz r12, 0(r7)
        lwz r14, 4(r7)

        /* r12: aabb => r12 = aaaa r13 = bbbb */
        mr r13, r12
        /* rotate by 16 and insert into the low half: both halves = aa */
        rlwimi r12, r12, 16, 16, 31
        stw r12, 0(r7)
        /* rotate by 16 and insert into the high half: both halves = bb */
        rlwimi r13, r13, 16, 0, 15
        stw r13, 4(r7)

        /* r14: ccdd => r14 = cccc r15 = dddd */
        mr r15, r14
        rlwimi r14, r14, 16, 16, 31
        stw r14, 8(r7)
        rlwimi r15, r15, 16, 0, 15

        stw r15, 12(r7)
    
        /* reload the 16 expanded bytes as two doubles for 64-bit stores */
        lfd f0, 0(r7)
        lfd f1, 8(r7)

        /* write 8 pixels = 16 bytes */
        stfd f0, 0(r10)
        stfd f1, 8(r10)
        
        addi r10, r10, 16


        bdnz x_loop

    /* skip a scanline */
    add r10, r10, r4

    /* one written plus one skipped dst scanline */
    subi r6, r6, 2;
    cmpwi r6, 0
    bne y_loop
}

#endif
