;'plattenbau 3000 sse' at 1024x768x8Bit by Kuemmel
;high speed version 2
;16 pixel loop + early exit
;inner loop with only add, ends up longer
;requires SSE-Level SSE4.1
org 100h				;DOS .COM image: code is assembled for load offset 100h
use16					;16-bit code
width=1024				;screen width in pixels; the x loop starts at -width+x_offset
height=768				;screen height in pixels; the y loop starts at -height+y_offset

;---parameters
effect_speed_shift=10	;default: 10; shift count of 'shld ax,bp,..' that picks the timer bits forming the hit mask
effect_01=100000000b	;default: 100000000b should kind of fit to effect_speed_shift; while (timer AND effect_01)<>0 the hit mask is added to y ('platte')
effect_02=111111111b	;default: 111111111b should kind of fit to effect_speed_shift; geometry opcodes are toggled whenever (timer AND effect_02)=0
depth_initial=8 		;default: 8; start depth is the all-ones word shifted left by this count
depth_steps=255 	    ;raycasting steps 0...255 (could be raised, costs +1 Byte)
bg_colour=54			;choose background colour from standard VGA palette; its RGB is copied into palette entry 0
x_offset=512-130	    ;default 320 (center x: 512); x runs from -width+x_offset up to x_offset
y_offset=384+50 	    ;default 384 (center y: 384); y runs from -height+y_offset up to y_offset
xy_speed=9				;movement speed adjust; the x/y timer words advance by 1 shl xy_speed per frame

;---lane offset table: consecutive words 0,1,2,... that the x loop adds to the
;   broadcast x value (read back as [si+0xe0] and [si+0xf0] with si=0x300)
;   ax (first value) and cx (word count) are used as they are at program entry
;   NOTE(review): presumably ax=0 and cx=0x00ff as on a usual DOS .COM start - confirm
	mov	di,0x3e0		;table start, aligned to 16 Byte
fill_lane_offsets:
	stosw				;append the current index to the table
	inc	ax			;next index
	loop	fill_lane_offsets	;repeat cx times

;---enter VESA mode 0x105 (1024x768x8Bit); has to happen before the palette is changed
	mov	bx,0x105		;requested mode number
	mov	ax,0x4f02		;VBE call: set video mode
	int	10h
;---from here on es addresses the video window segment
	push	0xa000
	pop	es

;---copy the RGB value of palette entry bg_colour into palette entry 0
;   (looks expensive, but is shorter than doing it in SSE; the whole section can be
;   dropped if a black background is okay)
	mov	ax,0x1015		;video BIOS: read one palette (DAC) entry
	mov	bl,bg_colour		;bh is still set from the mode switch, does not seem to matter
	int	10h			;RGB comes back in dh,ch,cl
	xor	bx,bx			;target entry 0; bx=0 is also what the later bank switch needs
	mov	al,10h			;ah is expected to still be 10h => write entry bx from dh,ch,cl
	int	10h

;---constants/variable address
xor bp,bp	 ;init global frame timer to 0...can be omitted if you need two bytes...
mov si,0x300	;scratch base, aligned to 16 Byte: [si]..[si+79] hold the SSE temporaries, [si+0xe0] the lane offset table...with org 100h this is only safe while the code ends below 0x300 (<=512 Bytes) !!!
xorps xmm6,xmm6     ;init sse x/y movement timer = 0 ...seems zero at FreeDOS fresh start on all tested systems

;---main intro frame loop
;bp = frame timer, xmm6 = x/y movement timer, si = scratch base (0x300)
;builds the per-frame hit mask and, every time (bp AND effect_02)=0, patches three opcodes
main_loop:
shld ax,bp,effect_speed_shift	;low bits of ax = top bits of bp (default shift 10: al bits 1..0 = bp bits 7..6)
and al,00000011b		;keep only the two timer bits
or  al,00010000b		;mask calculation/variation => 000100??b
mov ah,al				;same mask in both bytes of the word
mov word[si],ax
mov word[si+2],ax		;[si] now holds a dword of four mask bytes for the broadcast below
pshufd xmm7,[si],0		;needed on all 16 bytes
movaps [si+64],xmm7		;backup...could be done without SSE but needs a loop...

;change geometry for cube/platte/depth
;self-modifying code: every effect_loc label sits directly behind the instruction to patch,
;so label-2 is the opcode byte in front of its ModRM byte; the address is expressed relative
;to si (=0x300)
test bp,effect_02
jne skip_effect_02		;patch only when (bp AND effect_02)=0
   xor byte[si-(0x300-(effect_loc0a-2))],1	 ;toggles between 54 and 55 => andps/andnps
   xor byte[si-(0x300-(effect_loc0b-2))],1	 ;toggles between 54 and 55 => andps/andnps
   xor byte[si-(0x300-(effect_loc2-2))],4    ;toggles between 0xfd and 0xf9 => paddw/psubw
skip_effect_02:

cwd						;init bank dx = 0, ax is always positive here (ah = 000100??b from the mask)
xor di,di				;init offset inside the video window at es:0
mov cx,-height+y_offset	;cx = y, runs from -height+y_offset up to (not including) y_offset
loopy:
  ;switch screenbank if needed, needs bx=0, is zero here always
  ;due to width=1024 => 65536/1024=64 it's safe to do this outside x loop
  ;(di wraps to 0 every 64 lines, always at the start of a line)
  ;NOTE(review): assumes a 64 KiB window granularity so that dx+1 is the next 64 KiB - confirm on target hardware
  test di,di
  jnz skip_bank_switch
	mov ax,0x4f05		;VBE call: window control, dx = bank to select
	int 10h
	inc dx				;bank number for the next switch
  skip_bank_switch:

  mov ax,-width+x_offset	;ax = x of the first pixel of the group, advanced by 16 per pass
  loopx:
	;---per group setup: depth, y and the 16 x values, then their start products
	pcmpeqw xmm4,xmm4		;all words = -1
	movaps [si+48],xmm4		;backup...could be done without SSE but needs a loop... (-1 words = depth step)
	psllw  xmm4,depth_initial	;start depth = -1 shl depth_initial in every word

	mov word[si],cx
	mov word[si+2],cx
	pshufd xmm3,[si],0		;y broadcast to all 8 words
	test bp,effect_01		;change geometry for 'platte'
	jz skip_effect_01
		paddw xmm3,[si+64]	;add the hit mask bytes to y
	skip_effect_01:
	movaps [si+32],xmm3		;backup...could be done without SSE but needs a loop...

	mov word[si],ax
	mov word[si+2],ax
	pshufd xmm2,[si],0		;x broadcast to all 8 words
	movaps xmm7,xmm2
	paddw  xmm2,[si+0xe0]	;x_a = x+ 0|x+ 1|x+ 2|x+ 3|x+ 4|x+ 5|x+ 6|x+ 7
	paddw  xmm7,[si+0xf0]	;x_b = x+ 8|x+ 9|x+10|x+11|x+12|x+13|x+14|x+15
	movaps [si],xmm2		;store aligned x_a
	movaps [si+16],xmm7		;store aligned x_b

	pmullw xmm2,xmm4		;(x_a-center)*depth
	pmullw xmm7,xmm4		;(x_b-center)*depth
	pmullw xmm3,xmm4		;(y  -center)*depth
	paddw  xmm2,xmm6		;x_a movement
	paddw  xmm7,xmm6		;x_b movement
	paddw  xmm3,xmm6		;y movement (opcode is patched between paddw and psubw via effect_loc2)
	effect_loc2:
	xorps  xmm1,xmm1		;hit_colours = 0 (no pixel of this group has hit anything yet)

	;---ray march for 16 pixels at once, at most depth_steps iterations
	;xmm2/xmm7 = x_a/x_b * depth, xmm3 = y * depth, xmm4 = depth, xmm1 = hit_colours
	;[si]/[si+16]/[si+32] = x_a/x_b/y, [si+48] = -1 words, [si+64] = hit mask bytes
	;the counter is loaded as an 8 bit immediate, bh is expected to be 0 here:
	;depth_steps has to be 1...255 (0 would let 'dec bx' wrap around and run 65536 steps,
	;more than 255 needs 'mov bx,depth_steps', +1 Byte)
	mov bl,depth_steps
	depth_loop:
		movaps	 xmm5,xmm2
		movaps	 xmm0,xmm7
		andps	 xmm5,xmm3			;x_a AND y
		andps	 xmm0,xmm3			;x_b AND y
		psraw	 xmm5,10			;shift x_a
		psraw	 xmm0,10			;shift x_b
		andps	 xmm5,xmm4			;cube geometry (opcode byte is patched: andps/andnps)
		effect_loc0a:
		andps	 xmm0,xmm4			;cube geometry (opcode byte is patched: andps/andnps)
		effect_loc0b:
		psubw	 xmm2,[si]			;x_a*depth = x_a*depth - x
		packsswb xmm5,xmm0			;current color from words to bytes
		psubw	 xmm7,[si+16]		;x_b*depth = x_b*depth - x
		movaps	 xmm0,xmm1			;hit_colours
		psubw	 xmm3,[si+32]		;y  *depth = y	*depth - y
		andnps	 xmm0,[si+64]		;mask only if hit_colour NOT set already
		ptest	 xmm0,xmm0			;all zero => every one of the 16 pixels has its colour
		jz exit_depth_loop			;early exit
		paddw	 xmm4,[si+48]		;depth = depth - 1
		andps	 xmm0,xmm5			;check if hit occurred => if current color contains the mask
		pcmpeqb  xmm0,[si+64]		;if hit occurred set byte to 11111111
		pblendvb xmm1,xmm5,xmm0 	;SSE4.1 update only the 11111111 byte's of hit_colours
		dec bx
	jnz depth_loop			;using LOOP is at least 10% slower !!!
	exit_depth_loop:
	xor bx,bx				;bx is not 0 after an early exit, the bank switch needs bx=0
	movaps [es:di],xmm1		;plot all 16 pixel bytes
    add ax,16				;16 pixels per x loop
	add di,16				;next 16 bytes in the video window (wraps to 0 after 64 lines)
  cmp ax,x_offset	;x runs from -width+x_offset up to (not including) x_offset => width/16 groups per line
  jl loopx			;far jmps could be optimized by a call subroutine, put huge penalty on speed...
inc cx				;next line
cmp cx,y_offset		;y runs from -height+y_offset up to (not including) y_offset
jne loopy		;far jmps could be optimized by a call subroutine, put huge penalty on speed...

;---ah has to be 0 for the exit path below: dx holds the bank count of the frame, so dh=0
	xchg	ax,dx

;---wait until the vertical retrace bit is set (frame timing & flicker reduce)
	mov	dx,03dah		;VGA input status port
wait_retrace:
	in	al,dx
	test	al,8			;bit 3 = vertical retrace
	jz	wait_retrace

;---advance the timers
	inc	bp			;inc depth timer
	pcmpeqw	xmm5,xmm5		;all words = -1
	psllw	xmm5,xy_speed		;adjust speed => -(1 shl xy_speed)
	psubw	xmm6,xmm5		;inc x/y timer

check_keyboard:
	in	al,0x60			;current keyboard scan code
	dec	ax			;ax=0 only for scan code 1 (Esc), ah is 0 here
	jnz	main_loop		;far jmps could be optimized by a call subroutine, put huge penalty on speed...
;---leave: text mode 3 and back to DOS
	mov	al,3			;ah=0 => set video mode; can be omitted if needed, costs 4 bytes...
	int	10h
	ret