;"true" by Kuemmel for Outline 2018
;true colour version at 640x480 using also SSE instructions 
;runs at about 45 FPS on an Athlon XP1800 with WinXP, but strangely slower on the latest Intel CPUs with FreeDOS...
;main algorithm based on a shader by kusma https://www.shadertoy.com/view/4d33RM
;and the discussion on pouet here => http://www.pouet.net/topic.php?which=10564

;screen resolution in pixels; also used as the start values of the x_loop / y_loop counters
res_x  EQU 640
res_y  EQU 480
;DOS .COM image: code is assembled for offset 100h
org  100h

;si = start of the data area behind the program image. It is not written again in
;this file and later serves as palette base and as scratch pointer.
mov si,data_section
mov di,si			;di = write pointer for palette generation

;---full creamy palette look up table generation - 4 bytes per entry
;cx is not initialised here: the loop count and the shade (cl) both come from the
;value cx holds at program start.
;NOTE(review): presumably relies on cx = 00FFh at .COM entry (=> 255 entries) - confirm
;Each pass stores, in memory order: (cl>>1)|4Fh, cl>>1, 255, cl>>1
palette:			
	mov ah,cl
	shr ah,1		;ah = cl/2
	mov al,ah
	or al,01001111b	;al = ah with bits 0-3 and 6 forced on
	stosw			;bytes 0,1 of the entry (es is ds)
	mov al,255
	stosw			;bytes 2,3 of the entry: 255, ah
loop palette		;leaves cx = 0, which the texture loop below relies on
salc				;al = 0 because CF is clear (last flag writer was the 'or'); clears al for texture generation

;segment setup: es -> graphics window written by the pixel loop, fs -> texture buffer
push 0a000h			;screen
pop  es 			
push 08000h			;texture 256*256 bytes
pop  fs				;NOTE(review): segment 8000h is used without being allocated - assumed to be free memory

;---seamless texture generation "borrowed" and slightly modified from 'lattice' by baze/3SC (2001)
;Fills the 64 KByte at fs:0. cx is 0 on entry (left by the palette loop), so 'loop'
;wraps around and runs 65536 times; bx takes every 16 bit offset once.
;al is carried over from one texel to the next (it starts at 0 from the salc above).
mov dl,3			;initial seed 
texture:			
	mov	bx,cx			;bx = texel offset for this pass
	rcl	dl,cl			;stir the seed: rotate through carry by the counter
	mov	ah,dl
	sar	ah,4			;signed noise term taken from the high nibble of the seed
	adc	al,ah			;previous texel + noise + carry
	adc	al,byte[fs:bx+128]		;blend with the byte 128 ahead (may not have been written by this loop yet); +127 looks also good
	shr	al,1			;halve => stored texel values are 0..127
	mov	byte[fs:bx],al
	not	bh				;same low offset byte, high offset byte complemented
	mov	byte[fs:bx],al	;second copy at the mirrored offset (bx is reloaded from cx next pass)
loop texture

;"---set screenmode by TomCatAbaddon
mov bx,121h		;first try mode 121h: set screen 640*480*32 true colour
video:
mov ax,4f02h		;VBE function 02h: set the video mode given in bx
int 10h
mov bl,12h		;bx = 0112h, the mode number used if a retry is needed
cmp ah,bh		;bh is 01h; the VBE call returns ah = 01h when it failed
je video		;failed => retry with 112h. safety, skipping this works for XP but not for DOSBox
				;NOTE(review): if mode 112h also returns ah = 01h this loop never terminates
								
fld1			;1.0 is pushed once and stays at the bottom of the FPU stack; the pixel loop reads it as st4
xorps xmm2,xmm2 ;clear xmm2 for colouring later: it is the zero source of the punpcklbw byte widening

;---render one frame. For every pixel, with x,y taken relative to the screen centre:
;   t     = abs(1 - (x*x + y*y)*mu)          mu = dword[si-4]
;   e     = sqrt(t)
;   texel = byte at fs:[hi,lo], hi = low byte of x*(2-e) + cl, lo = low byte of y*(2-e) - cl
;   pixel = palette[texel] * sqrt(t) per channel (via rsqrtps estimate), saturated to 0..255
;register roles: ax = loop counter (kept on the stack while ax is reused), cx = frame counter,
;dx = next bank number, di = offset inside the 64 KByte window at es, si = palette base,
;[bp+si] = scratch for integer<->FPU transfers, fs = texture, xmm2 = 0, FPU stack bottom = 1.0
main_loop:
xor dx,dx		;dx is the next screen bank number => 640*480*4/65536 = 18.75 banks needed
xor di,di		;init first pixel for each screen; di = 0 also triggers the first bank switch
mov ax,res_y
y_loop:
	push ax						;save the row counter, ax is reused below
	sub ax,240					;st0			|st1	|st2	|st3	|st4	|st5	|st6	|st7	
	mov word[bp+si],ax			;1	
	fild word[bp+si]			;y				|1
	fld  st0					;y				|y		|1
	fmul st0,st0				;y*y			|y		|1
	mov ax,res_x
	x_loop:
		push ax					;save the column counter (ax is overwritten by the bank switch)
		sub ax,320
		test di,di				;di = 0: start of a new 64 KByte window
		mov word[bp+si],ax		;reordered (mov leaves the flags of 'test' intact)
		jnz skip_bank_switch
			xor bx,bx			;needs to be clear !
			mov ax,4F05h		;VBE function 05h: window control, position taken from dx
			int 10h 			;next 64 KByte bank, bx needs to be zero !
								;NOTE(review): dx counts in window granularity units; 64 KByte granularity is assumed - confirm
			inc dx	
		skip_bank_switch:
		fild word[bp+si]		;x				|y*y	|y		|1
		fld st0 				;x				|x		|y*y	|y	|1	
		fmul st0,st0			;x*x			|x		|y*y	|y	|1	
		fadd st0,st2			;x*x+y*y		|x		|y*y	|y	|1	
		fmul dword[si-4]		;(x*x+y*y)*mu	|x		|y*y	|y	|1	
		fsubr st0,st4			;1-(x*x+y*y)*mu |x		|y*y	|y	|1	
		fabs					;t=abs(1-...)	|x		|y*y	|y	|1	
		fst dword[si]			;t				|x		|y*y	|y	|1	
								;NOTE(review): [si] is also the first palette entry, so a texel value of 0
								;fetches the bit pattern of t as its colour - confirm whether intended
		fsqrt					;e=sqrt(t)		|x		|y*y	|y	|1	
		fsubr st0,st4			;1-e			|x		|y*y	|y	|1	
		fadd st0,st4			;2-e			|x		|y*y	|y	|1
		fmul st1,st0			;2-e			|x*(2-e)|y*y	|y	|1	
		fmul st0,st3			;y*(2-e)		|x*(2-e)|y*y	|y	|1	
		fistp word[bp+si]		;x*(2-e)		|y*y	|y		|1		(stored: y*(2-e) as integer)
		fistp word[bp+si+2]		;y*y			|y		|1				(stored: x*(2-e) as integer)
		movd xmm0,dword[si]		;t										|0,0,0,0	;reordered. movups also works, is one byte shorter,
																					;but needs to load a dqword. movss is also okay
		mov ebx,dword[bp+si]	;bx = y term, upper word = x term. may be more bytes but faster memory address...notsure...
		mov bh,bl				;duplicate the low byte of the y term
		shr ebx,8				;now bl = low byte of the y term, bh = low byte of the x term
		sub bl,cl				;scroll the low texture coordinate by the frame counter
		add bh,cl				;scroll the high texture coordinate the opposite way
		movzx bx,byte[fs:bx]	;fetch the texel
		shl bx,2				;*4 = byte offset of its palette entry
		movd xmm1,dword[si+bx]	;t				|32B RGBx,...			|0,0,0,0	
		shufps xmm0,xmm0,0		;t,t,t,t		|32B RGBx,...			|0,0,0,0	
		punpcklbw xmm1,xmm2		;t,t,t,t		|16B R,G,B,...			|0,0,0,0	;could be replaced with pshufb when you have SSSE3	
		punpcklbw xmm1,xmm2		;t,t,t,t		|8B  R,G,B,...			|0,0,0,0	;but then needs another variable for byte distribution	
								;after both interleaves with zero each channel byte sits alone in its own dword
		cvtdq2ps xmm1,xmm1		;t,t,t,t		|R,G,B,...				|0,0,0,0	
		mulps xmm1,xmm1 		;t,t,t,t		|R*R,G*G,B*B,...		|0,0,0,0	
		mulps xmm1,xmm0 		;t,t,t,t		|R*R*t,G*G*t,B*B*t,...	|0,0,0,0	
		rsqrtps xmm3,xmm1		;estimated sqrt is much faster, just costs another multiply 
		mulps xmm1,xmm3 		;as sqrt(x) = x * 1/sqrt(x)
		cvtps2dq xmm1,xmm1		;t,t,t,t		|sqrt(R*R*t),sqrt(B*B*t)|0,0,0,0	
		packuswb xmm1,xmm1		;t,t,t,t		|16B RGBx				|0,0,0,0	
		packuswb xmm1,xmm1		;t,t,t,t		|8B RGBx				|0,0,0,0	
		movd dword[es:di],xmm1	;store on screen
		pop ax					;restore the column counter
		add di,4				;next pixel; wraps to 0 after 16384 pixels => bank switch on the next pass
		dec ax
	jnz x_loop
	fcompp						;|1		used only to pop y*y and y
	pop ax						;restore the row counter
	dec ax
jnz y_loop

inc cx
inc cx		;update global movement counter: +2 per frame, cl offsets the texture lookup

;---check keyboard 
check_keyboard:
in al,0x60				;read the current scancode from the keyboard data port; check for ESC
dec al					;zero only when the scancode was 1 (ESC pressed)
jnz main_loop			;anything else: render the next frame
;---leave: switch back to text mode and return
exit:
mov al,3				;got the bytes left...ah is zero (ax was counted down to 0 by y_loop, only al changed since)
int 10h					;ah = 0, al = 3: set video mode 3
ret						;NOTE(review): presumably returns through the initial stack word of a .COM program - confirm
;mu, the radial scale factor read by 'fmul dword[si-4]'. Only the upper 16 bits are stored
;here; the lower 16 bits of that dword are the two code bytes in front of it (the 10h of
;'int 10h' and the opcode of 'ret'), so nothing may be inserted between them and this word.
dw 0x37fb				;is approx. 0.00003
;start of the run-time data area (nothing is assembled behind this label in the code shown):
;si points here, the palette is generated from this address on
data_section: 
