;*DDK*************************************************************************/
;
; COPYRIGHT (C) Microsoft Corporation, 1989
; COPYRIGHT    Copyright (C) 1995 IBM Corporation
;
;    The following IBM OS/2 WARP source code is provided to you solely for
;    the purpose of assisting you in your development of OS/2 WARP device
;    drivers. You may use this code in accordance with the IBM License
;    Agreement provided in the IBM Device Driver Source Kit for OS/2. This
;    Copyright statement may not be removed.;
;*****************************************************************************/
        page    ,132
;/*****************************************************************************
;*
;* SOURCE FILE NAME = CBLT.ASM
;*
;* DESCRIPTIVE NAME = Compile a BLT subroutine onto the stack
;*
;*
;* VERSION      V2.0
;*
;* DATE         
;*
;* DESCRIPTION  This file contains two subroutines which build a small program on 
;*              the stack to accomplish the requested BLT.
;*                                                                                     
;*              This file is part of a set that makes up the BitBLT function   
;*              at driver-level.                                                       
;*
;* FUNCTIONS    CBLT 
;*              phase_align_generate
;*              y_update 
;*
;* NOTES        NONE
;*
;* STRUCTURES   NONE
;*
;* EXTERNAL REFERENCES
;*
;*              NONE
;*
;* EXTERNAL FUNCTIONS
;*
;*              NONE
;*
;* CHANGE ACTIVITY =
;*   DATE      FLAG        APAR   CHANGE DESCRIPTION
;*   --------  ----------  -----  --------------------------------------
;*   mm/dd/yy  @Vr.mpppxx  xxxxx  xxxxxxx
;*   02/22/8?                     Walt Moore [waltm] Wrote it for         in
;*                                distant past.
;*   07/12/86                     Wes Rupel [wesleyr] Made it a subroutine 
;*                                (extracted from an enormous bitblt.asm)
;*   07/20/87                     Wes Rupel [wesleyr] Added 4-plane support.
;*   08/16/87                     Wes Rupel [wesleyr] Bitmap Color Conversion 
;*                                uses image color
;*   03/05/88                     Wes Rupel [wesleyr] Added Gray usMix support.
;*   03/05/88                     Wes Rupel [wesleyr] Subroutinized it so that
;*                                mono_to_color conversion can optionally be done
;*                                after phase align (needed for transparency).
;*   03/07/88                     Wes Rupel [wesleyr] Added Transparency for 
;*                                ImageData
;*   03/30/88                     Wes Rupel [wesleyr] Made BackColor/ForeColor 
;*                                now 0/1 rather than 1/0 (in mono bitmaps)
;*   10/26/89                     Viroon Touranachun [viroont] Modified the part 
;*                                that generates the "REP" inner loop for 
;*                                "Source Copy" to generate 32-bit data transfer 
;*                                routine using iAPX386's MOVSD and STOSD 
;*                                instructions.
;*
;*****************************************************************************/

        .xlist
        include cmacros.inc
        include pmgre.inc
DINCL_BB_ROPS   equ     1
        include driver.inc
        include display.inc
        include 8514.inc
        include 8514mem.inc
        include assert.mac
        include njmp.mac
        include oemblt.inc
        .list


;/*
;** External data referenced from the Data segment.
;*/
sBegin  Data
        externW ipc_index_mask          ; mask to just bpp (word, defined in another module)
sEnd    Data


;/*
;** From here on everything is assembled into the Bitmap code segment.
;** The templates that follow are copied out of this segment as data,
;** so their exact byte encodings matter.
;*/
sBegin  Bitmap
        assumes cs,Bitmap

        include constant.blt

        ; Opcode constants local to this file.  Word-sized values are
        ; written so that storing them as a word (low byte first) yields
        ; the instruction bytes in execution order.
        I_USE_386   equ     066h        ; x86 operand-size override prefix
        I_ROR_AX_N  equ     0C8C1h      ; bytes C1h,C8h = "ror ax,imm8"; the
                                        ;   count byte must follow separately

        include devdata.blt
DEFINE_ROPCODE equ  1                   ; defined before roptable.blt is pulled
                                        ;   in -- presumably selects what that
                                        ;   include emits (TODO confirm)
        include roptable.blt            ;Equs only, no table

;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**       jmp_cx_nz   - Code template for near jump if CX-1 <> 0
;**
;**       jmp_cx_nz will skip the following near jump if CX-1 is zero.
;**       CX will be left updated by this code.
;**
;**       jmp_cx_nz is used by both the inner loop code and the outer
;**       loop code if a loop instruction cannot be used.
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/

jmp_cx_nz:
        dec     cx                      ;Decrement counter
        jz      $+5                     ;Count reached zero: hop over the near
                                        ;  JMP (2-byte JZ + 1-byte opcode +
                                        ;  2-byte displacement = 5)
        db      I_JMP_NEAR              ;JMP opcode only -- the 16-bit
                                        ;  displacement is not part of the
                                        ;  template and has to be stored right
                                        ;  after it by whoever copies it

JMP_CX_NZ_LEN   =       $-jmp_cx_nz     ;Length of procedure (excludes the
                                        ;  JMP displacement)


;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**       phase_align - Template for phase alignment code
;**
;**       The following code is the template that performs the phase
;**       alignment masking.  The source has already been aligned to
;**       the destination.
;**
;**       A copy of the aligned source is made.  The phase mask is then
;**       applied to the source and the copy.  The previously unused
;**       bits are ORed into the used bits of the current source, and
;**       the unused bits of the current source then become the unused
;**       bits for the next source.
;**
;**
;**       It assumes:
;**
;**               BP  =  phase alignment mask
;**               AL  =  current byte to mask
;**               BH  =  old unused bits
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/

phase_align:
        mov     ah,al                   ;Make a copy of aligned source
        and     ax,bp                   ;AL &= low byte of BP  (bits used now)
                                        ;AH &= high byte of BP (bits carried
                                        ;  over to the next byte)
        or      al,bh                   ;Mask in old unused bits
        mov     bh,ah                   ;Save new unused bits for the next pass

PHASE_ALIGN_LEN equ     $-phase_align   ;Length of procedure (bytes to copy)


;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**       masked_store - Template for storing first and last bytes of BLT
;**
;**       The following code is a template for storing the first and last
;**       bytes of a BLT.  The unaltered bits are saved and the altered
;**       bits set in the byte, then the byte is stored.
;**
;**
;**       It assumes:
;**
;**               AL  =  The byte to be BLTed to the destination bitmap.
;**                      All necessary logic operations have been performed
;**                      on this byte.
;**
;**               AH  =  The destination byte.
;**
;**       The AND immediate will be fixed up.
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/

masked_store_mono:
        ; 0FFFFh is only a placeholder; the real mask word is patched in
        ; after the template is copied.  Its low byte is ANDed with AL (the
        ; new bits to write) and its high byte with AH (the destination
        ; bits to keep).
        and     ax,0FFFFh               ;Mask altered/unaltered bits
        or      al,ah                   ;Combine the bits
        stosb                           ;And store the result

MASKED_STORE_LEN_MONO   equ     $-masked_store_mono;Length of the template
        ; NOTE(review): -5 lands on the AND's immediate word only if it is
        ; counted back from the end of the copied template and the AND is
        ; assembled as opcode + imm16 (3 bytes, 6 bytes total) -- confirm
        ; against the code that applies this offset.
MASKED_STORE_MASK_MONO  equ     -5              ;Offset to where mask goes



;/*
;** Gray Rop logical action template.  This is used instead of one of the
;** ropcode rops when the GRAY_ROP MIX is requested.
;*/

; Computes AL = dest ^ ((dest ^ DL) & DH): every bit where DH is 1 is taken
; from DL, every bit where DH is 0 keeps the destination value.  AH is left
; holding the destination byte.
gray_rop_template_mono:
        mov     ah,es:[di]              ; AH = destination byte
        mov     al,ah                   ; AL = destination byte
        xor     al,dl                   ; AL = bits where dest differs from DL
        and     al,dh                   ; keep those differences only where DH=1
        xor     al,ah                   ; flip dest at those bits => DL there
LENGTH_GRAY_ROP_TEMPLATE_MONO   =       $ - gray_rop_template_mono

; One pixel per pass: the top bit of DH selects between the destination
; byte (bit = 0) and DL (bit = 1).  The ROL leaves DH rotated by one so the
; following mask bit is on top for the next pass.
gray_rop_template_8bpp:
        mov     al,es:[di]              ; fetch destination byte
        rol     dh,1                    ; get mask pixel to carry
        jnc     @F                      ; was mask bg pixel, propagate dest
        mov     al,dl                   ; else substitute bg color
@@:                                     ; jump target is the first byte AFTER
                                        ; the template; the short jump is
                                        ; relative, so it survives the copy
LENGTH_GRAY_ROP_TEMPLATE_8BPP   =       $ - gray_rop_template_8bpp

;/*
;** transparency_template is a piece of code that will be appended to the
;** logical action template from the ropcode to achieve transparency.
;** The transparency mask is assumed to be in DH.  Where the mask is "1"
;** the result of the logical action is used.  Where the mask is "0"
;** the destination is not altered.
;*/

; On entry AL holds the result of the logical action.  Computes
; AL = dest ^ ((AL ^ dest) & DH): bits where DH is 1 keep the new result,
; bits where DH is 0 fall back to the destination.  AH is left holding the
; destination byte.
transparency_template_mono:
        mov     ah,es:[di]              ; AH = destination byte
        xor     al,ah                   ; AL = bits where result differs from dest
        and     al,dh                   ; keep differences only where mask = 1
        xor     al,ah                   ; re-apply them to the destination
LENGTH_TRANSPARENCY_TEMPLATE_MONO       =       $ - transparency_template_mono

;/*
;** the template to be used when fetching an 8bpp pixel from the hw
;** (when the destination is also 8bpp)
;*/

; DX is borrowed as the I/O port register for the duration of the IN and is
; swapped back afterwards, so DX is unchanged on exit and SI keeps the port.
cblt_fetch_hw_8bpp:
        xchg    si,dx           ; save pattern,bgcolor -- load COLOR_1
        in      ax,dx           ; get next pixel(s) from hw
        xchg    si,dx           ; restore pattern,bgcolor
CBLT_FETCH_HW_8BPP_LEN  =       $ - cblt_fetch_hw_8bpp

ifdef   4BPP_USED

;/*
;** the template to be used when fetching a 4bpp pixel from the hw
;** (when the destination is also 4bpp)
;**
;** There is no separate label: these instructions directly follow the 8bpp
;** fetch above and the length below is measured from cblt_fetch_hw_8bpp,
;** so the 4bpp template is the 8bpp fetch plus the nibble packing.
;** NOTE(review): confirm that is intended, and that the assembler accepts
;** a conditional symbol whose name starts with a digit.
;*/

        shl     al,4            ; left pixel to hi nibble
        and     ah,0fh          ; isolate right pixel
        or      al,ah           ; combine both in al
CBLT_FETCH_HW_4BPP_LEN  =       $ - cblt_fetch_hw_8bpp
endif;  4BPP_USED


;/*
;** the template to be used when fetching an 8bpp pixel
;** (when the destination is mono)
;** NOTE: This code is assuming bitmap source and stepping left -- it gets
;**       patched if source is the device and/or stepping right.
;*/


;/*
;** Builds one mono byte in BL from CX source pixels, one bit per pixel, and
;** returns it in AL.  Clobbers AX, BL, CX and the flags.
;**
;** Each POS_xxx symbol is the offset, from the start of the template, of a
;** byte or word that is patched after the template has been copied.  The
;** matching CBLT_FETCH_8BPP_TO_MONO_xxx symbol is the same location as a
;** negative displacement from the END of the template, so the patcher can
;** address it as es:[di][xxx] while DI still points just past the copy.
;** The offsets depend on the exact encoding of every instruction here.
;*/
cblt_fetch_8bpp_to_mono:
        mov     cx,000ffh       ; count of edge pixels
                                ; (patch: low byte of the immediate word)
POS_CBLT_FETCH_8BPP_TO_MONO_COUNT       =       $ - cblt_fetch_8bpp_to_mono - 2
@@:     lodsb                   ; fetch a source byte
                                ; (patch: the opcode byte itself; replaced by
                                ;  IN AX,DX when the source is the device)
POS_CBLT_FETCH_8BPP_TO_MONO_FETCH       =       $ - cblt_fetch_8bpp_to_mono - 1
        xor     al,0ffh         ; AL  = (bgcolor) ? 0 : >0
                                ; (patch: the immediate byte = background color)
POS_CBLT_FETCH_8BPP_TO_MONO_BGCOLOR     =       $ - cblt_fetch_8bpp_to_mono - 1
        add     al,0ffh         ; 'C' = (bgcolor) ? 0 : 1
        sbb     ax,ax           ; AX = ('C' == 1) ? 0ffffh : 0
        not     ah              ; AH = ('C' == 1) ? 0 : 0ffh
                                ; now AL = 0ffh for a non-background pixel,
                                ;     AH = 0ffh for a background pixel
        and     ax,0ffffh       ; isolate result
                                ; (patch: immediate word; low byte = replicated
                                ;  foreground mono bit, high byte = replicated
                                ;  background mono bit)
POS_CBLT_FETCH_8BPP_TO_MONO_MONOBITS    =       $ - cblt_fetch_8bpp_to_mono - 2
        or      al,ah           ; into AL
        shr     al,1            ; now into 'C'
        rcr     bl,1            ; shift into building mono byte
                                ; (patch: second byte of the instruction is
                                ;  XORed with XOR_RCL_TO_RCR to turn this into
                                ;  RCL when not stepping left)
POS_CBLT_FETCH_8BPP_TO_MONO_RCX =       $ - cblt_fetch_8bpp_to_mono - 1
        loop    @B              ; for all source bytes to fetch in edge
        mov     al,bl           ; fetch built mono byte to source register
CBLT_FETCH_8BPP_TO_MONO_LEN     =       $ - cblt_fetch_8bpp_to_mono

; End-relative (negative) forms of the patch offsets defined above.
CBLT_FETCH_8BPP_TO_MONO_COUNT   =       -(CBLT_FETCH_8BPP_TO_MONO_LEN-\
                                          POS_CBLT_FETCH_8BPP_TO_MONO_COUNT)
CBLT_FETCH_8BPP_TO_MONO_FETCH   =       -(CBLT_FETCH_8BPP_TO_MONO_LEN-\
                                          POS_CBLT_FETCH_8BPP_TO_MONO_FETCH)
CBLT_FETCH_8BPP_TO_MONO_BGCOLOR =       -(CBLT_FETCH_8BPP_TO_MONO_LEN-\
                                          POS_CBLT_FETCH_8BPP_TO_MONO_BGCOLOR)
CBLT_FETCH_8BPP_TO_MONO_MONOBITS =      -(CBLT_FETCH_8BPP_TO_MONO_LEN-\
                                          POS_CBLT_FETCH_8BPP_TO_MONO_MONOBITS)
CBLT_FETCH_8BPP_TO_MONO_RCX     =       -(CBLT_FETCH_8BPP_TO_MONO_LEN-\
                                          POS_CBLT_FETCH_8BPP_TO_MONO_RCX)

ifdef PALMGR

;/*
;** template used for palette translation
;**
;** Replaces AL with the byte at GS:[BX+AL], where BX is loaded from the
;** immediate word in the MOV.  BX is preserved across the template.
;** 1000h is presumably a placeholder that gets patched with the real
;** table offset after the template is copied (TODO confirm with the
;** code that uses CBLT_XLAT_LO).
;*/

        CPUMode 386
cblt_palette:
        push    bx                                ; keep caller's BX
        mov     bx,1000h                          ; BX = table offset (immediate word)
cblt_xlat_lo:
        xlat    gs:[bx]                           ; palette transform
        pop     bx                                ; restore caller's BX
CBLT_PALETTE_LEN = $ - cblt_palette
; Negative displacement from the end of the template to the two bytes that
; immediately precede cblt_xlat_lo, i.e. the immediate word of the MOV BX.
CBLT_XLAT_LO = cblt_xlat_lo - $ - 2
        CPUMode 286
endif

        page
;/***************************************************************************
;*
;* FUNCTION NAME = CBLT
;*
;* DESCRIPTION   = Compile a BLT onto the stack.  
;*
;*                 Calls:
;*                       y_update
;*
;* INPUT         = ES:DI --> memory on stack to receive BLT program  
;* OUTPUT        = NONE
;*
;* RETURN-NORMAL = NONE
;* RETURN-ERROR  = NONE
;*
;**************************************************************************/

        assumes ds,nothing
        assumes es,nothing


;/*
;**       Note:   The definition of CBLT below is FAR in order to maintain
;**               the stack frame created for BITBLT, though it is reached
;**               with a NEAR call.
;*/

cProc   CBLT,<FAR,PUBLIC,NODATA>,<si,di>
        include frame.blt
cBegin  <nogen>

        mov     ax,cs                   ;Set data seg to CS so we can access
        mov     ds,ax                   ;  code without overrides
        assumes ds,Code                 ;!!!GSS do they use this?

        mov     fbMore,0                ; clean slate at start

        subttl  Compile - Outer Loop
        page

;/*
;**  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**        Create the outerloop code.  The first part of this code will save
;**        the scan line count register, destination pointer, and the source
;**        pointer (if there is a source and it is not the device).
;** 
;** 
;**        The generated code should look like:
;** 
;**                push    cx              ;Save scan line count
;**                push    di              ;Save destination pointer
;**        <       push    si      >       ;Save source pointer
;**  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/

        mov     bl,fbF0
        mov     ax,I_PUSH_CX_PUSH_DI    ;Save scan line count, destination ptr
        stosw
        test    bl,F0_SRC_PRESENT       ;Is a source needed?
        jz      @F                      ;  No
        test    bl,F0_SRC_IS_DEV        ;Is the source the device?
        jnz     @F                      ;  Yes
        mov     al,I_PUSH_SI            ;  Memory src, save source pointer
        stosb
@@:


        subttl  Compile - Pattern Fetch
        page
;/*
;**- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**  Set up any pattern fetch code that might be needed.
;**  The pattern code has many fixups, so it isn't taken from a
;**  template.  It is just stuffed as it is created.
;**
;**
;**  Entry:  None
;**
;**  Exit:   DH = pattern
;**
;**  Uses:   AX,BX,CX,DH,flags
;**
;**
;**  For color brushes:
;**
;**   *  mov     bx,XXXXh            ;Load segment (immediate) of the brush
;**   *  mov     ax,ds               ;Save DS
;**   *  mov     ds,bx               ;DS:BX --> brush
;**      mov     bx,YYYYh            ;Load offset (immediate) of the brush
;**      mov     dh,7[bx]            ;Get initial brush byte
;**   *  mov     ds,ax               ;Restore DS
;**
;**  For monochrome brushes:
;**  For masks:
;**
;**   +  mov     dl,bgcolor          ;bg color if gray rop
;**   *  mov     bx,XXXXh            ;Load segment (immediate) of the brush
;**   *  mov     ax,ds               ;Save DS
;**   *  mov     ds,bx               ;DS:BX --> brush
;**      mov     bx,YYYYh            ;Load offset (immediate) of the brush
;**      mov     dh,7[bx]            ;Get next brush byte
;**   *  mov     ds,ax               ;Restore DS
;**   =  not     dh                  ;invert fg/bg bits
;**   =  rol     dh,n                ;phase transparency mask
;**
;**      Instructions marked with "*" are not present if there is no
;**      source bitmap or if there is a source bitmap and it is the
;**      device. The bitmap is what DS would otherwise be used for.
;**
;**      Instructions marked with "+" are not present if the rop is not
;**      the special gray ropcode.
;**
;**      Instructions marked with "=" are only present if the ropcode is
;**      the gray ropcode and the destination is a color bitmap.
;**
;**- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/

cblt_pattern_fetch:
        test    bl,F0_PAT_PRESENT       ;Is a pattern needed?
        njz     cblt_initial_byte_fetch ;  No, skip pattern code

;/*
;** The special gray rop will erroneously lead to Color Pat Fetch.        
;** It should be mono fetch of the transparency mask.                     
;*/
                                                                        
        test    fsBlt,BBF_GRAY_ROP                                      
        jz      cblt_not_gray_rop                                       
                                                                        
color_fetch_template:                                                   
        and     fbF0,not F0_COLOR_PAT ; really a mono (1 plane) fetch
        and     bl,not F0_COLOR_PAT           ; really a mono (1 plane) fetch
        add     pBrush.lo,pa_abMask - pa_abColor ;-> Pat Transparency Mask

;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;** Take this opportunity to compile the "color fetch" code needed by the
;** Gray Rop.  This will compile code which will expand the background color
;** to all 0s or all 1s into DL if the destination is mono, else will just
;** stuff the background color into DL for color destinations.
;**
;** For color destinations:
;**
;**   mov dl,03h     ; grab the background color as an immediate value
;**
;** For mono destinations:
;**
;**   mov   dl,{0,0ffh}; 0 or ffh depending on lo bit of background color
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/
        mov     ah,byte ptr ipcBrushBoth[0]                             
        test    bl,F0_DEST_IS_COLOR                                     
        jnz     @F                                                      
        shr     ah,1                                                    
        sbb     ah,ah                                                   
@@:     mov     al,I_MOV_DL_BYTE_I   ; mov dl,PatBackColor(assume color)
        stosw                                                           
cblt_not_gray_rop:                                                      

        test    bl,F0_COLOR_PAT         ; color pattern fetch ?
        jz      @F                      ; no...
        test    fbBrushAccel,PA_SINGLE_CLR; solid brush ?
        jz      @F                      ; no...
        mov     al,I_MOV_DH_BYTE_I      ; else move fgcolor into DH for blt
        mov     ah,byte ptr ipcBrushBoth[0]
        stosw
        and     fbF0,NOT F0_PAT_PRESENT; pattern never needs to be
        and     bl,NOT F0_PAT_PRESENT   ; considered again
        jmp     cblt_pattern_fetch_end
@@:

        mov     dl_addr_pbrush,di       ; save -> brush offset in code

        test    bl,F0_SRC_PRESENT       ;Is there a source?             
        jz      cbltpf_no_source                                        
        test    bl,F0_SRC_IS_DEV        ;Is the source the device?      
        jnz     cbltpf_no_source                                        
                                                                        
        mov     al,I_MOV_BX_WORD_I      ;mov bx,pBrush.hi               
        stosb                                                           
        mov     ax,pBrush.hi                                            
        stosw

        CPUMode 386                                                     
        mov     eax,I_MOV_AX_DS+(I_MOV_DS_BX shl 16)    ;mov ax,ds      
        stosd                                           ;mov ds,bx      
        CPUMode 286

cbltpf_no_source:                                                       

        mov     al,I_MOV_BX_WORD_I      ;mov bx,pBrush.lo
        stosb
        mov     ax,pBrush.lo    ; start offset to brush
        test    bl,F0_COLOR_PAT         ; fetching color pattern ?
        jz      cblt_stuff_brush_addr   ; no...

        sub     dx,dx                   ; assume no middle,last brush fetches
        mov     dl_addr_pbrush_off_m,dx
        mov     dl_addr_pbrush_off_l,dx
        or      fbMore,F1_INNER_ONCE

        mov     dl,yPatRow              ; set initial pattern row
        and     dx,00000111b            ; keep pattern row in range

ifdef   4BPP_USED
        shl     dx,2                    ; assume 4bpp
        test    dl_hwFlags,HW_8_BPP     ; correct ?
        jz      @F                      ; yes...
        shl     dx,1                    ; no -- 8 bytes / pattern row
else
        shl     dx,3                    ; we only use 8bpp mode
endif;  4BPP_USED

@@:     add     ax,dx                   ; -> start of 1st pattern row
        mov     dl_addr_pbrush_off_f,di ; save -> brush offset

cblt_stuff_brush_addr:
ifdef   FIREWALLS
        mov     cx,di                   ; save offset of brush offset
endif
        stosw                           ; stuff initial brush offset

        mov     ax,I_MOV_DH_BX_DISP8    ;mov dh,n[bx]
        stosw
BRUSH_INDEX_LESS_BRUSH_OFF      equ     4
ifdef   FIREWALLS
        add     cx,BRUSH_INDEX_LESS_BRUSH_OFF
        assert  cx,E,di
endif
        mov     npbPatRow,di    ;Save address of the brush index

        mov     al,byte ptr xDst        ; x origin of destination if color pat
        test    bl,F0_COLOR_PAT
        jnz     @F                      ; will be a color pattern fetch...
        mov     al,yPatRow              ; initial pattern row if mono pat
@@:     and     al,00000111b            ;Set brush index mask
        stosb

        test    bl,F0_SRC_PRESENT       ;Is there a source?             
        jz      @F                                                      
        test    bl,F0_SRC_IS_DEV        ;Is the source the device?      
        jnz     @F                                                      
        mov     ax,I_MOV_DS_AX          ;mov ds,ax
        stosw
@@:
;/*
;** gray rop mask needs to be correctly phased for color destinations     
;*/
                                                                        
        test    fsBlt,BBF_GRAY_ROP                                      
        jz      @F                      ; not special gray rop code     
        test    bl,F0_DEST_IS_COLOR                                     
        jz      @F                      ; gray rop, dest is mono        
        mov     dl,byte ptr xDst                                        
        and     dl,00000111b            ; mod PATTERNSIZE               
        jz      @F                      ; no mask phasing needed        
        mov     ax,I_ROL_DH_N           ; rol dh,n                      
        stosw                                                           
        mov     al,dl                                                   
        stosb                                                           
@@:                                                                     

        mov     dl_addr_brush_end,di    ; -> addr 1 beyond brush fetch code
cblt_pattern_fetch_end:


        subttl  Compile - Initial Byte Fetch
        page
        missing_code    <4bpp considerations -- Initial Byte Fetch>
;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**       Create the initial byte code.  This may consist of one or two
;**       initial fetches (if there is a source), followed by the required
;**       logic action.  The code should look something like:
;**
;**       BLTouterloop:
;**       <       mov     bp,mPhase >  ;Load phase mask for entire loop
;**       <       xor     bh,bh       >   ;Clear previous unused bits
;**
;**       ;       Perform first byte fetch
;**
;**       <       lodsb               >   ;Get source byte
;**       <       color<==>mono munge >   ;Color <==> mono conversion
;**       <       phase alignment     >   ;Align bits as needed
;**
;**       ;       If an optional second fetch is needed, perform one
;**
;**       <       lodsb               >   ;Get source byte
;**       <       color to mono munge >   ;Color to mono munging
;**       <       phase alignment     >   ;Align bits as needed
;**
;**               logical action          ;Perform logical action required
;**
;**               mov     ah,es:[di]      ;Get destination
;**               and     ax,cx           ;Saved unaltered bits
;**               or      al,ah           ;  and mask in altered bits
;**               stosb                   ;Save the result
;**
;**
;**       The starting address of the first fetch/logical combination will be
;**       saved so that the code can be copied later instead of recreating it
;**       (if there are two fetches, the first fetch will not be copied)
;**
;**       The length of the code up to the masking for altered/unaltered bits
;**       will be saved so the code can be copied into the inner loop.
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/


cblt_initial_byte_fetch:

        mov     cFetchCode,0    ; unknown fetch code size at start
        mov     npFetchStart,di         ; save starting address of action

        test    bl,F0_SRC_PRESENT       ; is there a source?
        jnz     cblt_src_is_present     ;  yes, generate fetch code
        jmp     cblt_logical_action     ;  no, don't generate fetch code
cblt_src_is_present:

        test    bl,F0_GAG_CHOKE         ; color conversion ?
        jnz     @F                      ; yes -- no phasing needed...
        cmp     iHorzPhase,0            ; is the phase 0? (also get the phase)
        jz      @F                      ; yes -- no phase alignment needed...

        mov     al,I_MOV_BP_WORD_I      ; set up the phase mask
        stosb
        mov     ax,mPhase               ; place the mask into the instruction
        stosw
        mov     ax,I_XOR_BH_BH          ; clear previous unused bits
        stosw
        mov     npFetchStart,di         ; phase mask not part of fetch
@@:

;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**     Generate the required sequence of instructions for a fetch
;**     sequence.  Only the minimum code required is generated.
;**
;**     The code generated will look something like the following:
;**
;**     BLTfetch:
;**     <       lodsb                 > ;Get the next byte
;**     <       color munging         > ;Mono <==> color munging
;**
;**     ;       If the phase alignment isn't zero, then generate the minimum
;**     ;       phase alignment needed.  RORs or ROLs will be generated,
;**     ;       depending on the fastest sequence.  If the phase alignment
;**     ;       is zero, then no phase alignment code will be generated.
;**
;**     <       ror     al,1          > ;Rotate as needed
;**     <       ror     al,1          > ;Rotate as needed
;**     <       ror     al,1          > ;Rotate as needed
;**     <       ror     al,1          > ;Rotate as needed
;**     <       mov     ah,al         > ;Mask used, unused bits
;**     <       and     ax,bp         > ;(BP) = phase mask
;**     <       or      al,bh         > ;Mask in old unused bits
;**     <       mov     bh,ah         > ;Save new unused bits
;**
;**
;**     The nice thing about the above is it is possible for the fetch to
;**     degenerate into a simple LODSB instruction.
;**
;**     If this were an iAPX80286 implementation, it would be faster to
;**     make three or four rotates into a single "ror al,n" instruction.
;**
;**     Currently:      BL = fbF0
;**                     BH = fsBlt[1] (loaded just below; holds the
;**                          BBF_TRANS / BBF_ANTI_TRANS / BBF_TRANS_NO_CC bits)
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;*/

        mov     bh,byte ptr fsBlt[1]     ; We will test BH several times
        .errnz  low BBF_TRANS
        .errnz  low BBF_ANTI_TRANS
        .errnz  low BBF_TRANS_NO_CC
        test    bl,F0_GAG_CHOKE          ; Color conversion?
        jnz     cblt_color_convert       ;   Yes, gag and choke on it

;/*
;**  No Color Conversion.
;**  If it is mono to mono transparent then we want to go filter thru the
;**  first part of the mono_to_color code just to get the
;**  transparency stuff.  Being here means no color conversion and transparent
;**  means mono source, so if it is transparent then it is mono to mono.
;*/

        test    bh,(BBF_TRANS or BBF_ANTI_TRANS) shr 8
        jnz     cblt_gotta_transparency ; it's mono_to_mono transparency
        jmp     cblt_no_color_conversion

cblt_color_convert:
        test    bl,F0_SRC_IS_COLOR
        jnz     cblt_color_to_mono
cblt_gotta_transparency:
        jmp     cblt_mono_to_color


        subttl  Compile - Initial Byte Fetch, Color ==> Mono
        page
cblt_color_to_mono:

;/*
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**       Generate the code to go from color to mono.  Color to mono
;**       should map all colors that are background to 0's (black), and
;**       all colors which aren't background to 1's (white).
;**
;**       The generated code for bitmaps should look something like:
;**
;** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ;
;**  Currently:
;**               DH = phase
;**
;** according to source type slice in the correct code
;**
;** generated code looks as follows...
;**
;**       8bpp device source:
;**       -------------------
;**               xchg    si,dx
;**               mov     cx,edgepixels
;**       @@:     in      ax,dx
;**               xor     al,bgcolor      ; AL  = (bgcolor) ? 0 : >0
;**               add     al,0ffh         ; 'C' = (bgcolor) ? 0 : 1
;**               sbb     ax,ax
;**               not     ah              ; AH = ('C' == 1) ? 0 : 0ffh
;**               and     ax,FgBgRepMono  ; isolate result
;**               or      al,ah           ; into AL
;**               shr     al,1            ; now into 'C'
;**              *rcr     bl,1            ; add mono bit in 'C' into mono byte
;**               loop    @B
;**               mov     al,bl
;**               xchg    si,dx
;**
;**       8bpp bitmap source:
;**       -------------------
;**               mov     cx,nn
;**       @@:     lodsb
;**               xor     al,bgcolor      ; AL  = (bgcolor) ? 0 : >0
;**               add     al,0ffh         ; 'C' = (bgcolor) ? 0 : 1
;**               sbb     ax,ax
;**               not     ah              ; AH = ('C' == 1) ? 0 : 0ffh
;**               and     ax,FgBgRepMono  ; isolate result
;**               or      al,ah           ; into AL
;**               shr     al,1            ; now into 'C'
;**              *rcr     bl,1            ; add mono bit in 'C' into mono byte
;**               loop    @B
;**               mov     al,bl
;**
;** The RCR instruction (denoted with a *) in the code above is for the stepping
;** left case. If stepping right it will become a RCL instruction.
;**
;** NOTE: We expect that initialization code in the blt upper levels has already
;** accounted for the case where the destination dc image attributes foreground
;** and background MONOBITs are equal (ie: both 0 or both 1). In those cases the
;** source fetch can be ignored. The upper level therefore will have swizzled
;** the writing mode to be one of DDx or DDxn, both of which do not involve the
;** source and thus will not fall through to this code!
;*/

        ; Compile the initial-byte fetch for an 8 bpp source and a mono
        ; destination: emit an optional XCHG SI,DX, copy the
        ; cblt_fetch_8bpp_to_mono template to ES:DI, then patch the copied
        ; template in place.  BL is tested for F0_ flag bits throughout.
        test    bl,F0_SRC_IS_DEV        ; is source the device ?
        jz      @F                      ; no...
        mov     ax,I_XCHG_SI_DX         ; yes -- need to get variable data port
        stosw
@@:

        mov     cx,CBLT_FETCH_8BPP_TO_MONO_LEN
        mov     cFetchCode,cx
        mov     si,BitmapOFFSET cblt_fetch_8bpp_to_mono

        ; Copy the template.  The copy sequence is chosen at assembly time
        ; from the template length: dwords first, then a trailing word and/or
        ; byte.
        CPUMode 386
if      CBLT_FETCH_8BPP_TO_MONO_LEN GE 8
        shr     cx,2
        rep     movsd
else
if      CBLT_FETCH_8BPP_TO_MONO_LEN AND 4
        movsd
endif
endif
if      CBLT_FETCH_8BPP_TO_MONO_LEN AND 2
        movsw
endif
if       CBLT_FETCH_8BPP_TO_MONO_LEN AND 1
        movsb
endif
        CPUMode 286

        ; DI is now just past the copied template; every patch below is
        ; addressed as ES:[DI + CBLT_FETCH_8BPP_TO_MONO_xxx].

;/*
;** get bit # in starting byte and convert to # pixels in the starting byte
;*/

        mov     dl,gl_start_bit         ; starting bit #
        inc     dl                      ; assume stepping left
        cmp     iStepDir,STEPLEFT
        je      @F                      ; we assumed correctly...
        neg     dl                      ; else adjust as though stepping right
        add     dl,8+1                  ; the +1 to adj for the STEPLEFT assume
        xor     byte ptr es:[di][CBLT_FETCH_8BPP_TO_MONO_RCX],XOR_RCL_TO_RCR
@@:     sub     dh,dh                   ; zero-extend max pixels in 1st byte
        cmp     dx,cxExt                ; is > width of entire blt ?
        jbe     @F                      ; no -- = 0 or more pixels
        mov     dh,dl
        mov     dl,byte ptr cxExt       ; yes -- truncate pixels to fetch
        sub     dh,dl                   ; normalization rotation adjustment
@@:     mov     es:[di][CBLT_FETCH_8BPP_TO_MONO_COUNT],dl

        ; DL is reused as scratch from here on.  DH (zero, or the rotation
        ; adjustment computed above) must survive until the "or dh,dh" below.
        mov     dl,ipcBkgnd.ipc_bClr    ; fetch source dc background color
        mov     es:[di][CBLT_FETCH_8BPP_TO_MONO_BGCOLOR],dl

        .errnz  high MONO_BIT - 00000001b
        mov     cl,ipcImageFore.ipc_bStatus
        shr     cl,1                    ; isolate fg mono bit in 'C'
        sbb     cl,cl                   ; replicated dest fg MONOBITs
        mov     ch,ipcImageBack.ipc_bStatus
        shr     ch,1                    ; isolate bg mono bit in 'C'
        sbb     ch,ch                   ; replicated dest bg MONOBITs
        mov     es:[di][CBLT_FETCH_8BPP_TO_MONO_MONOBITS],cx ; CL=fg, CH=bg

        test    bl,F0_SRC_IS_DEV        ; is source the device ?
        jz      @F                      ; no...
        mov     byte ptr es:[di][CBLT_FETCH_8BPP_TO_MONO_FETCH],I_IN_AX_DX
        ; AX has not been written since the first XCHG SI,DX was emitted at
        ; the top of this section; the assert checks exactly that.
        assert  ax,E,I_XCHG_SI_DX       ; we expect AX to still == I_XCHG_SI_DX
        stosw                           ; restore original DX via XCHG SI,DX
        add     cFetchCode,4    ; to account for (2) XCHG SI,DX instrs.
        .errnz  (I_XCHG_SI_DX LE 0ffh)
@@:

        ; DH is nonzero only when the pixel count was truncated to cxExt
        ; above; in that case emit ROL/ROR AL,<DH> after the fetch.
        or      dh,dh                   ; only a first byte in blt ?
        jz      cblt_first_not_only     ; no...
        mov     ax,I_ROL_AL_N           ; assume stepping right
        cmp     iStepDir,STEPRIGHT
        je      @F                      ; we are stepping right...
        mov     ah,HIGH I_ROR_AL_N      ; else we are stepping left
        .errnz  LOW I_ROR_AL_N - LOW I_ROL_AL_N
@@:     stosw                           ; stuff instr. to normalize pixels
        mov     al,dh                   ; get rotate count
        stosb
cblt_first_not_only:

        or      fbMore,(F1_COLOR_MONO+F1_INNER_ONCE)
        and     fbFetch,NOT FF_TWO_INIT_FETCHES
        jmp     cblt_logical_action     ; go create logic code...


        subttl  Compile - Initial Byte Fetch, Mono ==> Color
        page

;/*
;**       The conversion is mono to color (and it is complex).
;*/

cblt_mono_to_color:
;/*
;** Entry for a mono source.  A color destination (F0_DEST_IS_COLOR set in
;** BL) is handled at cblt_mono_to_color_color; what falls through below is
;** the mono to mono case.  BH is tested against HIGH BBF_ANTI_TRANS, so it
;** presumably holds the high byte of the blt flags here -- confirm.
;*/

        test    bl,F0_DEST_IS_COLOR
        jnz     cblt_mono_to_color_color; it's really a color dest...

;/*
;** Take care of the mono to mono case which filtered through here just to
;** get the transparency code added.
;*/

        mov     al,I_LODSB              ; fetch the source byte
        stosb

;/*
;** To make transparent rops needed for ImageData we need to do the phase
;** alignment before the color conversion.
;*/

        call    phase_align_generate

;/*
;** For transparency the compiled code must save the source
;** (now that it has been phase aligned) to use as a mask.  It will save
;** it in DH (where the pattern would be if there was one -- transparency
;** is being added for ImageData which does not use a pattern).
;*/

        mov     ax,I_MOV_DH_AL          ; op code for MOV DH,AL
        stosw
        test    bh,HIGH BBF_ANTI_TRANS
        jz      @F
        mov     ax,I_NOT_DH             ; op code for NOT DH
        stosw
        ; phase alignment was already generated above, so rejoin the common
        ; path after the point where it would otherwise be generated
@@:     jmp     already_phase_aligned

cblt_mono_to_color_color:
;/*
;** Mono source to color destination.  If a pattern is present, this block
;** appends the brush index update to the brush fetch code and then moves
;** the brush fetch code up by MONO_8BPP_FETCH_LEN bytes to open a gap for
;** the source fetch.  On exit DI -> start of that gap and SI = number of
;** relocated bytes (SI = 0 when there is no pattern).
;*/

        or      fbMore,(F1_MONO_COLOR+F1_INNER_ONCE)
        and     fbFetch,NOT FF_TWO_INIT_FETCHES

;/*
;** The pattern fetch code must become part of the fetch code because
;** a pattern byte must be fetched for each destination pixel stored. We
;** will effect this on the initial byte fetch by moving up in the instance
;** data area the brush fetch code to allow the insertion of src byte fetch
;** code in addition to code for the load of a loop counter.
;*/

        sub     si,si                   ; assume no pattern

        test    bl,F0_PAT_PRESENT       ; only set if none or non-solid pattern
        jz      cblt_mono_to_color_no_pattern; no pattern...

;/*
;** The presence of the pattern fetch code in this initial byte fetch loop
;** requires that we also update the pattern index for each destination
;** byte processed.
;*/

        mov     ax,(I_SS_OVERRIDE+(I_MOV_AL_MEM*256))
        stosw                           ;mov al,ss:[xxxx]
        ; npbPatRow is advanced by the same amount the brush code is moved
        ; below -- presumably the location it names lies inside that
        ; relocated code; confirm against where npbPatRow is set up.
        mov     si,npbPatRow
        add     si,MONO_8BPP_FETCH_LEN
        mov     npbPatRow,si
        mov     ax,si
        stosw
        mov     al,I_ADD_AL_BYTE_I      ; update brush index
        mov     ah,INCREASE             ; assume STEPRIGHT
        .errnz   INCREASE - 1           ; must be a 1
        .errnz   DECREASE + 1           ; must be a -1
        cmp     iStepDir,STEPRIGHT
        je      @F
        neg     ah                      ; step increment is opposite direction
@@:     stosw

        ; One dword holds two instructions: AND AL,7 followed by the SS:
        ; prefix and opcode of MOV mem,AL; the address operand follows.
        CPUMode 386
        mov     eax,(I_AND_AL_BYTE_I+(7*256))+((I_SS_OVERRIDE+(I_MOV_MEM_AL*256)) shl 16)
        stosd                           ; and al,BrushIndexMask
                                        ;mov ss:[xxxx],al
        CPUMode 286

        mov     ax,si
        stosw

;/*
;** Now we relocate the brush fetch code, vacating space for the initial mono
;** byte fetch, save, and normalization code.
;*/

        mov     cx,di                   ; -> end of brush fetch code
        sub     cx,dl_addr_pbrush       ; now # bytes of code to relocate
        dec     di                      ; we are moving backward
        mov     si,di                   ; -> end of brush fetch code
        mov     ax,MONO_8BPP_FETCH_LEN  ; length of code for mono to 8 bpp fetch
        add     di,ax                   ; -> end of relocated code
        add     dl_addr_pbrush,ax       ; adjust -> start of brush fetch code
        add     dl_addr_brush_end,ax    ; ditto -> end of brush fetch code
        add     dl_addr_pbrush_off_f,ax ; ditto -> first brush fetch offset
        mov     ax,cx                   ; save length of brush fetch code

        ; The destination lies MONO_8BPP_FETCH_LEN bytes above the source in
        ; the same buffer, so the copy runs from the last byte downward.
        std                             ; work backwards
        rep     movs es:byte ptr[di],es:byte ptr[si]; relocate brush fetch code
        cld                             ; ensure working upwards again

        inc     si                      ; now, we are moving forward again
        mov     di,si                   ; DI is -> where to put src byte fetch
        mov     npFetchStart,di         ; this is new start of fetch,logic op
        mov     si,ax                   ; SI is # bytes to add to get to next ip
cblt_mono_to_color_no_pattern:
;/*
;** Emit the MONO_8BPP_FETCH_LEN-byte source fetch at ES:DI:
;**
;**        lodsb                           ; offset 0
;**        mov     ch,al                   ; offsets 1-2
;**        ro{l,r} ch,<count>              ; offsets 3-5 (count at offset 5)
;**        mov     cl,<pixels>             ; offsets 6-7 (pixels at offset 7)
;**
;** On entry SI = number of bytes of relocated brush code that follow the
;** gap (0 if there is no pattern).  The address of the loop top is left on
;** the stack for the store logic to pop.
;*/

ifdef   FIREWALLS
        push    di                      ; save start of fetch code
        mov     cx,di                   ; here too
endif
        mov     al,I_LODSB              ; fetch a src byte of <= 8 mono pixels
        stosb
        mov     ax,I_MOV_CH_AL          ; store fetched byte in CH
        stosw
        mov     ax,I_ROL_CH_N           ; normalize src byte (assume STEPRIGHT)
        mov     dl,byte ptr xSrc
        and     dl,07h                  ; rotate left count
        cmp     iStepDir,STEPRIGHT; assumption correct ?
        je      @F                      ; yes...
        mov     ah,HIGH I_ROR_CH_N      ; no -- convert to opposite rotation
        add     dl,byte ptr cxExt       ; adjust to right end of scan + 1
        neg     dl                      ; adjust for stepping left
        and     dl,7                    ;  by making 8 - stepright count
@@:     stosw
MONO_8BPP_NORMALIZE_OFF equ     5       ; offset to rotation count for norm.
ifdef   FIREWALLS
        add     cx,MONO_8BPP_NORMALIZE_OFF
        assert  cx,E,di
endif
        mov     al,dl
        stosb
        mov     al,I_MOV_CL_BYTE_I      ; stuff loop counter
        neg     dl                      ; 8 - rotation factor = # pixels to out
        add     dl,8
        sub     dh,dh                   ; zero-extend for compare
        cmp     dx,cxExt                ; blt width >= # pixels max to process?
        jbe     @F                      ; yes...
        mov     dl,byte ptr cxExt       ; no -- adjust pixels to process
@@:     mov     ah,dl
        stosw                           ; stuff loop count
MONO_8BPP_LOOP_COUNT_OFF equ    7       ; offset to loop count byte
MONO_8BPP_FETCH_LEN     equ     8
ifdef   FIREWALLS
        add     cx,MONO_8BPP_LOOP_COUNT_OFF - MONO_8BPP_NORMALIZE_OFF + 1
        assert  cx,E,di                 ; check offset of counter byte
        pop     ax                      ; get back -> before fetch code
        add     ax,MONO_8BPP_FETCH_LEN  ; add in length of fetch code
        assert  ax,E,di                 ; should equal current ->
endif
        push    di                      ; save -> top of loop (will be popped
                                        ; off in the store logic)
        add     di,si                   ; get back -> next addr to build code

;/*
;**  The code has now been altered so that it looks somewhat like this:
;** 
;**        <mono byte fetch, store, normalization>
;**   loop:<brush fetch>
;**        <brush index update>
;** 
;**  We have to add code in the loop which pulls mono pixels out of the mono
;**  byte fetched, and converts it to either a foreground and/or background
;**  color index. This must precede the insertion of the logical action code.
;**  There are three cases:
;** 
;**   no transparency:
;**   ----------------
;**        mov     al,<fgcolor>            ; assume a 1 bit in mono src
;**        ro{l,r} ch,1                    ; get next mono pixel to carry
;**        jc      @F                      ; is fg pixel...
;**        mov     al,<bgcolor>            ; else we load a bg pixel index
;**    @@:
;** 
;**   transparency:
;**   -------------
;**        ro{l,r} ch,1                    ; get next mono pixel to carry
;**        jnc     @F                      ; skip <logic op>,<store> if transparent
;**        mov     al,<fgcolor>            ; else load fg pixel index
;** 
;**   anti-transparency:
;**   ------------------
;**        ro{l,r} ch,1                    ; get next mono pixel to carry
;**        jc      @F                      ; skip <logic op>,<store> if transparent
;**        mov     al,<bgcolor>            ; else load bg pixel index
;*/

        mov     dx,ipcImageBoth ; DH = bgcolor, DL = fgcolor
        mov     cl,iStepDir     ; cache step direction
        mov     ch,byte ptr fsBlt[1]    ; CH = high byte of the blt flags
        .errnz  BBF_TRANS and 255
        .errnz  BBF_ANTI_TRANS and 255
        test    ch,HIGH (BBF_TRANS or BBF_ANTI_TRANS)
        jnz     cblt_mono_to_color_trans; do transparency stuff...

;/*
;** The no transparency case --
;*/

        mov     al,I_MOV_AL_BYTE_I      ; MOV AL,<fgcolor>
        mov     ah,dl
        stosw
        mov     ax,I_ROL_CH_1           ; fetch mono pixel (assume STEPRIGHT)
        cmp     cl,INCREASE             ; is that right ?
        je      @F                      ; yes...
        mov     ah,HIGH I_ROR_CH_1      ; no -- rotate the other direction
        .errnz  LOW I_ROL_CH_1 - LOW I_ROR_CH_1
@@:     stosw
        mov     ax,I_JC_P4H             ; JC $+4
        stosw
        mov     al,I_MOV_AL_BYTE_I      ; MOV AL,<bgcolor>
        mov     ah,dh
        stosw
        jmp     cblt_logical_action

cblt_mono_to_color_trans:
;/*
;** Emit the per-pixel test for the transparent and anti-transparent mono
;** to color cases.  On entry DL = fgcolor, DH = bgcolor, CL = step
;** direction and CH = high byte of fsBlt.  The address just past the
;** emitted conditional jump is left on the stack so its displacement can
;** be filled in once the store code has been generated.
;*/

;/*
;** Swizzle colors in case we are doing transparency without color conversion
;** (for ImageData dual pass blts).
;*/

        .errnz  LOW BBF_TRANS_NO_CC
        test    ch,HIGH BBF_TRANS_NO_CC
        jz      @F
        mov     dx,0ffffh               ; both colors become 0ffh
@@:

;/*
;** The transparency or anti-transparency case --
;*/

        mov     ax,I_ROL_CH_1           ; fetch mono pixel (assume STEPRIGHT)
        cmp     cl,INCREASE             ; is that right ?
        je      @F                      ; yes...
        mov     ah,HIGH I_ROR_CH_1      ; no -- rotate the other direction
        .errnz  LOW I_ROL_CH_1 - LOW I_ROR_CH_1
@@:     stosw
        mov     al,I_JNC                 ; assume transparency case
        test    ch,HIGH BBF_TRANS
        jnz     @F                      ; assumption was correct...
        mov     al,I_JC                 ; was anti-transparency case
        mov     dl,dh                   ; so will be using bgcolor
        ; AH still holds the high byte of the rotate opcode; it is stored
        ; only as a placeholder for the jump displacement.
@@:     stosw                           ; offset is filled in at <store> time
        push    di                      ; save -> beyond where offset lives
        mov     al,I_MOV_AL_BYTE_I      ; pixel not transparent
        mov     ah,dl                   ; fetch color to load
        stosw
        jmp     short cblt_logical_action


;/*
;**       There is no conversion. We are going mono to mono or color to
;**       color.
;*/

cblt_no_color_conversion:

;/*
;**  If the source is a color device then the normal fetch sequence becomes
;**  the following:
;** 
;**                xchg    si,dx           ; save pattern,bgcolor -- load COLOR_1
;**                mov     dx,COLOR_1      ; load hw variable data port
;**                in      ax,dx           ; get next pixel(s) from hw
;**                xchg    si,dx           ; restore pattern,bgcolor - save COLOR_1
;**        *       shl     al,4            ; left pixel to hi nibble
;**        *       and     ah,0fh          ; isolate right pixel
;**        *       or      al,ah           ; combine both in al
;** 
;**  Instructions marked with a (*) will not be present if the source is 8 bpp.
;*/

        test    bl,F0_SRC_IS_DEV           ; source is the device ?
        jz      cblt_fetch_mem             ; no...

        mov     si,BitmapOFFSET cblt_fetch_hw_8bpp; common 4/8 bpp fetch logic
        mov     cx,CBLT_FETCH_HW_8BPP_LEN

ifdef   4BPP_USED
        test    dl_hwFlags,HW_8_BPP        ; if 4bpp, extra code is needed
        jnz     @F                         ; it was 8bpp...
        mov     cx,CBLT_FETCH_HW_4BPP_LEN
endif;  4BPP_USED

        ; When 4BPP_USED is not defined nothing jumps to this @@ label.
@@:     mov     cFetchCode,cx              ; save size of fetch code

        ; NOTE(review): the copy sequence below is selected at assembly time
        ; from CBLT_FETCH_HW_8BPP_LEN only.  When CX was replaced by
        ; CBLT_FETCH_HW_4BPP_LEN above, the dword count comes from CX but the
        ; trailing word/byte still comes from the 8 bpp constant (and CX is
        ; ignored entirely in the "else" arm) -- confirm both lengths give the
        ; same tail before enabling 4BPP_USED.
        CPUMode 386
if      CBLT_FETCH_HW_8BPP_LEN GE 8
        shr     cx,2
        rep     movsd
else
if      CBLT_FETCH_HW_8BPP_LEN AND 4
        movsd
endif
endif
if      CBLT_FETCH_HW_8BPP_LEN AND 2
        movsw
endif
if      CBLT_FETCH_HW_8BPP_LEN AND 1
        movsb
endif
        CPUMode 286

        jmp     short cblt_phase_align

;/*
;** Just need to generate the normal fetch sequence (lodsb)
;*/

cblt_fetch_mem:
public cblt_fetch_mem
        ; Memory source with no color conversion: the fetch is a single
        ; LODSB, optionally followed (PALMGR builds) by the cblt_palette
        ; translation template.  Falls through to cblt_phase_align.
        mov     cFetchCode,1            ; size of normal fetch sequence
        mov     al,I_LODSB              ; generate source fetch
        stosb

ifdef PALMGR
;/*
;** any palette translation?
;*/

        mov     ax,pmappal.lo
        or      ax,pmappal.hi
        jz      cblt_phase_align        ; both halves zero -- no translation

        mov     si,BitmapOFFSET cblt_palette  ;yes
        mov     cx,CBLT_PALETTE_LEN
        add     cFetchCode,cx           ; fetch now includes the template

        ; copy sequence chosen at assembly time from CBLT_PALETTE_LEN
        CPUMode 386
if      CBLT_PALETTE_LEN GE 8
        shr     cx,2
        rep     movsd
else
if      CBLT_PALETTE_LEN AND 4
        movsd
endif
endif
if      CBLT_PALETTE_LEN AND 2
        movsw
endif
if      CBLT_PALETTE_LEN AND 1
        movsb
endif
        CPUMode 286

        ; Patch the copied template (addressed relative to DI, which is now
        ; past it).  NOTE(review): only pmappal.lo is stored here; confirm
        ; that pmappal.hi is supplied to the compiled code some other way.
        mov     ax,pmappal.lo
        mov     es:CBLT_XLAT_LO[di],ax
endif

cblt_phase_align:
        ; Append the phase alignment code to the fetch just generated.
        call    phase_align_generate

already_phase_aligned:
        test    fbFetch,FF_TWO_INIT_FETCHES          ; Generate another fetch?
        jz      cblt_logical_action                  ;  No

;/*
;** A second fetch needs to be stuffed.  Copy the one just created.
;*/

        mov     si,di                   ;Get start of fetch logic
        xchg    si,npFetchStart         ;Set new start, get old
        mov     cx,di                   ;Compute how long fetch is
        sub     cx,si                   ;  and move the bytes

        ; Both operands use ES: the first fetch is duplicated inside the
        ; code buffer itself, dwords first and then the 0-3 leftover bytes.
        ; AX is used as scratch for the byte count.
        CPUMode 386
        mov     ax,cx
        shr     cx,2
        rep     movs es:dword ptr [di],es:dword ptr [si]
        mov     cx,ax
        and     cx,3
        rep     movs es:byte ptr [di],es:byte ptr [si]
        CPUMode 286


        subttl  Compile - ROP Generation
        page

;/*
;**       Create the logic action code
;**
;**       The given ROP will be converted into the actual code that
;**       performs the ROP.
;*/


cblt_logical_action:

        ; AX keeps the rop data until cblt_srccopy tests its sign bit; the
        ; template copies below do not write AX.
        mov     ax,usMixData            ; get rop data

;/*
;** The special gray rop will erroneously lead to Color Pat Fetch.
;** It should be mono fetch of the transparency mask.
;*/

        test    fsBlt,BBF_GRAY_ROP
        jz      cblt_non_gray_rop_template

cblt_gray_rop_template:
        test    bl,F0_DEST_IS_COLOR
        jz      cblt_gray_rop_mono

        mov     si,BitmapOFFSET gray_rop_template_8bpp

        ; copy sequence chosen at assembly time from the template length
        CPUMode 386
if      LENGTH_GRAY_ROP_TEMPLATE_8BPP GE 8
        mov     cx,LENGTH_GRAY_ROP_TEMPLATE_8BPP/4
        rep     movsd
else
if      LENGTH_GRAY_ROP_TEMPLATE_8BPP AND 4
        movsd
endif
endif
if      LENGTH_GRAY_ROP_TEMPLATE_8BPP AND 2
        movsw
endif
if      LENGTH_GRAY_ROP_TEMPLATE_8BPP AND 1
        movsb
endif
        CPUMode 286
        jmp     short   cblt_srccopy

cblt_gray_rop_mono:
        mov     si,BitmapOFFSET gray_rop_template_mono

        ; same copy scheme as the 8 bpp template above
        CPUMode 386
if      LENGTH_GRAY_ROP_TEMPLATE_MONO GE 8
        mov     cx,LENGTH_GRAY_ROP_TEMPLATE_MONO/4
        rep     movsd
else
if      LENGTH_GRAY_ROP_TEMPLATE_MONO AND 4
        movsd
endif
endif
if      LENGTH_GRAY_ROP_TEMPLATE_MONO AND 2
        movsw
endif
if      LENGTH_GRAY_ROP_TEMPLATE_MONO AND 1
        movsb
endif
        CPUMode 286
        jmp     short   cblt_srccopy


cblt_non_gray_rop_template:
        ; AX = rop data.  The ROPLength field (bits 10-12) indexes a byte
        ; table at the start of ropcode that gives the template length; the
        ; ROPOffset field is the template's offset within ropcode.  An
        ; offset of zero means source copy, which has no template.
        xchg    ah,al                   ; swap rop hi/lo bytes
        mov     si,ax                   ; get count of number of bits to move
        and     si,HIGH ROPLength
        shr     si,2
        .errnz   ROPLength - 0001110000000000b
        mov     cl,byte ptr ropcode[si] ; get length into cx
        xor     ch,ch

        xchg    ah,al                   ; restore rop hi/lo bytes
        mov     si,ax                   ; get offset of the template
        and     si,ROPOffset
        jz      cblt_srccopy            ; source copy
        lea     si,ropcode[si]          ; --> the template

        ; Copy CX bytes: dwords first, then the 0-3 leftover bytes.  DX is
        ; used as scratch for the byte count and is not restored.
        CPUMode 386
cblt_copy_template:
        mov     dx,cx
        shr     cx,2
        rep     movsd                   ; Move the template
        mov     cx,dx
        and     cx,3
        rep     movsb                   ; Move the template
        CPUMode 286

cblt_srccopy:
        ; AX still holds the rop data loaded at cblt_logical_action; its top
        ; bit (NEGATE_NEEDED) requests a trailing NOT AL.
        or      ah,ah                   ; Generate a negate?
        .errnz  (NEGATE_NEEDED AND 8000h) - 8000h
        jns     cblt_no_NOT             ;  No
        mov     ax,I_NOT_AL
        stosw
cblt_no_NOT:

;/*
;** Add transparency mask if needed:
;** only for a mono destination with BBF_TRANS or BBF_ANTI_TRANS set.
;*/

        test    fsBlt,BBF_TRANS or BBF_ANTI_TRANS
        jz      cblt_transparency_not_needed
        test    bl,F0_DEST_IS_COLOR
        jnz     cblt_transparency_not_needed

        mov     si,BitmapOFFSET transparency_template_mono

        ; copy sequence chosen at assembly time from the template length
        CPUMode 386
if      LENGTH_TRANSPARENCY_TEMPLATE_MONO GE 8
        mov     cx,LENGTH_TRANSPARENCY_TEMPLATE_MONO/4
        rep     movsd
else
if      LENGTH_TRANSPARENCY_TEMPLATE_MONO AND 4
        movsd
endif
endif
if      LENGTH_TRANSPARENCY_TEMPLATE_MONO AND 2
        movsw
endif
if      LENGTH_TRANSPARENCY_TEMPLATE_MONO AND 1
        movsb
endif
        CPUMode 286

cblt_transparency_not_needed:

        mov     npFetchEnd,di           ; Save end of fetch/logic operation


        subttl  Compile - Mask And Save
        page

;/*
;**     Generate code to mask and save the result.  If the destination
;**     isn't in a register, it will be loaded from ES:[DI] first.  The
;**     mask operation will then be performed, and the result stored.
;**
;**     For a mono to color blt this section pops what the fetch code
;**     pushed: first (transparent cases only) the address just past the
;**     JC/JNC whose displacement is still to be filled in, then the address
;**     of the top of the per-pixel loop.
;*/

        test    bl,F0_DEST_IS_COLOR     ; color destination?
        jz      cblt_mask_store_needed  ; no...

        mov     al,I_STOSB              ; assume color to color or mono
                                        ; to color with no transparency

        test    bl,F0_GAG_CHOKE         ; converting mono to color ?
        jnz     @F                      ; yes...

;/*
;** Color to color blt just requires a straight 8bpp stuff. We don't yet
;** handle the 4bpp case.
;*/

        stosb
        jmp     short cblt_no_mask_store_needed

;/*
;** Mono to color conversion requires a more complex store operation because
;** of the looping logic required.
;*/

@@:     test    fsBlt,BBF_TRANS or BBF_ANTI_TRANS
        jnz     @F                      ; handle transparency below...

        stosb                           ; store pixel in non-transparent case
        jmp     short cblt_store_loop_logic; continue with looping logic...

        ; Transparent cases: store with MOV ES:[dest],AL and step DI with a
        ; separate INC/DEC, so that the JC/JNC emitted by the fetch code can
        ; skip the store and land on the INC/DEC.
@@:     mov     al,I_ES_OVERRIDE        ; store to destination
        stosb
        mov     ax,I_MOV_DEST_AL
        stosw
        pop     si                      ; -> 1 beyond JC or JNC instruction
        mov     ax,di
        sub     ax,si                   ; now offset to the INC DI
        mov     es:[si][-1],al          ; stuff relative jump offset
        mov     al,I_INC_DI             ; assume increment destination pointer
        cmp     iStepDir,INCREASE       ; going right?
        je      @F                      ; yes...
        mov     al,I_DEC_DI             ; make sure we will be stepping left
@@:     stosb

cblt_store_loop_logic:

        mov     ax,I_DEC_CL             ; decrement pixel in byte counter
        stosw
        pop     ax                      ; -> top of loop (pushed in fetch code)
        sub     ax,di                   ; calculate # bytes back from current ->
        sub     al,2                    ; account for JNZ instruction
        mov     ah,al                   ; only the low byte is kept as the
                                        ; displacement of the JNZ
        mov     al,I_JNZ                ; jump back to top of loop if more
        stosw
        mov     npFetchEnd,di           ; fetch,logic stuff includes all now
        mov     ax,di                   ; calculate fetch code size
        sub     ax,npFetchStart
        mov     cFetchCode,ax           ; and store for last byte processing
        jmp     short cblt_no_mask_store_needed

;/*
;** Mono destinations require that the result of the fetch,logic operation
;** be masked, with unused portions being pipelined to ensuing operations, and
;** only used portions being combined with the current destination.
;*/

cblt_mask_store_needed:

        mov     al,I_ES_OVERRIDE        ;Load destination in AH
        stosb
        mov     ax,I_MOV_AH_DEST
        stosw

        ; The template is copied as one dword plus one word; the .errnz
        ; fails the assembly if its length ever stops being six bytes.
        CPUMode 386
        mov     si,BitmapOFFSET masked_store_mono; add masked store template
        movsd
        movsw
        .errnz  MASKED_STORE_LEN_MONO - 6; must be six bytes long
        CPUMode 286

        ; Patch the mask word in the copied template (addressed relative to
        ; DI, which is now past it) with mStart, bytes swapped.
        mov     ax,mStart               ; stuff start mask into the template
        xchg    ah,al
        mov     es:MASKED_STORE_MASK_MONO[di],ax

cblt_no_mask_store_needed:

        mov     npFetchEnds,di          ; Save end of fetch/logic/store operation


        subttl  Compile - Inner Loop Generation
        page

;/*
;**       Now for the hard stuff; The inner loop (said with a "gasp!").
;**
;**       If there is no innerloop, then no code will be generated
;**       (now that's fast!).
;*/


cblt_5000:
        mov     dx,cInnerByte           ; Get the loop count
        or      dx,dx                   ; If the count is null
        njz     cblt_inner_loop_done    ; don't generate any code.

;/*
;**     We have something for a loop count.  If this just happens to be
;**     a source copy (S) with a phase of zero, then the innerloop degenerates
;**     to a repeated MOVSB instruction.  This little special case is
;**     worth checking for and handling!
;**
;**     Also, if this is one of the special cases {P, Pn, DDx, DDxn}, then it
;**     will also be special cased since these are all pattern fills (pattern,
;**     not pattern, 0, 1).
;**
;**     The same code can be shared for these routines, with the exception
;**     that patterns use a STOSx instruction instead of a MOVSx instruction
;**     and need a value loaded in AX
;**
;**     For the special cases {P, Pn, DDx, DDxn}, color conversion is
;**     not possible, so ignore it for them.
;*/

        ; BH is reloaded here with the low byte of usMix and compared
        ; against the BB_ROP_ codes below.
        mov     bh,byte ptr usMix       ; Get the raster op

        cmp     bh,BB_ROP_S
        je      cblt_rop_s

        mov     ax,((0ffh shl 8) + I_MOV_AX_WORD_I); assume all 1s fill
        cmp     bh,BB_ROP_DDxn
        je      @F

        not     ah                      ; assumes all 0s fill
        cmp     bh,BB_ROP_DDx
        jne     cblt_not_0s_1s

        ; stosw emits the opcode and the low immediate byte; the stosb that
        ; follows emits the same fill byte again as the high immediate byte.
@@:     stosw                           ; MOV AX,{0ffffh,00000h}
        mov     al,ah
        stosb
        mov     si,I_STOSB              ; set up for repeated code processor
        jmp     short cblt_check_length
        
cblt_not_0s_1s:

        cmp     bh,BB_ROP_P
        je      @F                      ; can be special-cased
        cmp     bh,BB_ROP_Pn
        njne     cblt_cannot_rep        ; cannot special case it
@@:     test    bl,F0_COLOR_PAT         ; using color pattern fetch ?
        njnz     cblt_cannot_rep        ; yes -- then cannot use fast code...

        mov     si,I_STOSB              ; Set up for repeated code processor

        ; one dword holds two 2-byte instructions: MOV AL,DH then MOV AH,AL
        CPUMode 386
        mov     eax,I_MOV_AL_DH+(I_MOV_AH_AL shl 16) ; fetch pattern byte
        stosd                                        ; and replicate it in AX
        CPUMode 286

        cmp     bh,BB_ROP_P             ; writing just pattern
        je      cblt_check_length

        mov     ax,I_NOT_AX             ; writing inverse pattern
        stosw

;/*
;**   Now, we have the fill in AX; we still need to duplicate it to
;**   make EAX = AX:AX if necessary.
;**
;**   DX is the inner loop byte count.  It is an unsigned quantity (it is
;**   shifted right by two to form the dword count), so the "fewer than
;**   four bytes" test is an unsigned compare.  A signed JL would treat
;**   counts of 8000h and above as negative and skip the EAX replication
;**   even though a dword store is generated for them.
;*/

cblt_check_length:
        cmp     dx,4                    ; If we are going to use STOSD
        jb      cblt_5080               ; we need to fill in EAX

        .386P

        ; The compiled code gets ROR EAX,8 / MOV AH,AL twice.  EAX holds the
        ; 4-byte ROR EAX,8 encoding and CX the 2-byte MOV AH,AL opcode; the
        ; two are swapped between stores.
        mov     cx,I_MOV_AH_AL
        mov     eax, (08h shl 24) + (I_ROR_AX_N shl 8) + I_USE_386
        stosd                           ; ROR EAX,8 => EAX = AL:X:X:AL
        xchg    eax,ecx
        stosw                           ; MOV AH,AL => EAX = AL:X:AL:AL
        xchg    eax,ecx
        stosd                           ; ROR EAX,8 => EAX = AL:AL:X:AL
        xchg    eax,ecx
        stosw                           ; MOV AH,AL => EAX = AL:AL:AL:AL

        .286P

        jmp     short cblt_5080


cblt_rop_s:

;/*
;** Several circumstances prevent us from utilizing the rep code. These are:
;**
;**       a) Device source (ie: we set up the hardware in byte mode which
;**          means that insb with or without rep will not work properly).
;**
;**       b) Color conversion because too much other work must go on for us
;**          to be able to use the more efficient code.
;**
;**       c) Nonzero phase because masking must occur.
;*/

ifdef PALMGR
;/*
;**       d) there needs to be palette translation
;*/
endif

ifdef PALMGR
        mov     ax,pmappal.lo           ; any palette translation?
        or      ax,pmappal.hi
        njnz    cblt_cannot_rep
endif

        test    bl,(F0_SRC_IS_DEV or F0_GAG_CHOKE); first two conditions?
        njnz    cblt_cannot_rep         ; either one gets us out of here...
        cmp     iHorzPhase,0            ; is horizontal phase zero?
        njne    cblt_cannot_rep         ; no -- can't condense source copy...

        ; SI carries the string opcode to emit (MOVSB here, STOSB on the
        ; pattern fill paths) into the shared code that follows.
        mov     si,I_MOVSB              ; Set register for moving bytes

cblt_can_use_rep:

;/*
;**     This is a source copy or pattern fill.  Process any leading odd
;**     bytes with MOVSB or STOSB, then process the rest of the bytes with
;**     a REP MOVSD or a REP STOSD.  If the REP isn't needed, leave it out.
;**
;**     Don't get caught on this like I did!  If the direction of the
;**     BLT is from right to left (decrementing addresses), then both
;**     the source and destination pointers must be decremented by three
;**     so that the next four bytes are processed, not the next byte and
;**     the bytes just processed.  Also, after all dwords have been
;**     processed, the source and destination pointers must be incremented
;**     by three to point to the last byte (since the last MOVSD or STOSD
;**     would have decremented both pointers by 4).
;**
;**     If the target machine is an 8086, then it would be well worth the
;**     extra logic to align the fields on word boundaries before the MOVSxs
;**     if at all possible.
;**
;**     The generated code should look something like:
;**
;**     WARP8:                                         ;This code for moving left to right
;**             movsb                                  ;Dword alignment
;**             ld      cx,cInnerByte/4 ;Set word count
;**             rep                                    ;If a count, then repeat is needed
;**             movsd                                  ;Move words until done
;**             movsb                                  ;Process odd byte
;**
;**
;**     WARP8:                                         ;This code for moving right to left
;**             movsb                                  ;Dword alignment
;**             dec     si                             ;adjust pointer for moving words
;**             dec     di
;**             dec     si                             ;adjust pointer for moving words
;**             dec     di
;**             dec     si                             ;adjust pointer for moving words
;**             dec     di
;**             ld      cx,cInnerByte/4 ;Set word count
;**             rep                                    ;If a count, then repeat is needed
;**             movsd                                  ;Move words until done
;**             inc     si                             ;adjust since words were moved
;**             inc     di
;**             inc     si                             ;adjust since words were moved
;**             inc     di
;**             inc     si                             ;adjust since words were moved
;**             inc     di
;**             movsb                                  ;Process odd byte
;**
;**
;**     Of course, if any part of the above routine isn't needed, it isn't
;**     generated (i.e. the generated code might just be a single MOVSB)
;*/

;/*
;** The following piece of code is used to speed up the source blt for the
;** width of more than 4 bytes by using MOVSD. However, if we want to
;** fully utilize its speed, we have to start fetching from the DWORD
;** boundary.
;**
;** On entry DX = inner loop byte count and SI = the byte string opcode to
;** emit.  DX is an unsigned count, so the "fewer than four bytes" test is
;** an unsigned compare; a signed JL would treat counts of 8000h and above
;** as negative and send them down the short path.
;**
;** BL is overwritten with the step increment below, so the F0_ flag bits
;** it carried up to this point are no longer available afterwards.
;*/


        .386P

        cmp     dx,4
        jb      cblt_5130               ;Less than 4 bytes then use MOVSB
        mov     ax,si                   ;We want to start MOVSD at a DWORD
        mov     ah,al                   ;boundary, so we may do some prefetch
                                        ;(AL = AH = opcode, so a stosw emits
                                        ; the byte instruction twice)

cblt_dword_alignment:
        mov     cx,devSrc.lp_bits.lo    ;Get the source's starting point
        mov     bl,iStepDir             ;See which direction we are going
        .errnz  STEPLEFT                ;Get the right increment in BL, i.e.
        .errnz  STEPRIGHT-1             ;   BL = +1 for left-to-right
        shl     bl,1                    ;   BL = -1 for right-to-left
        dec     bl
        add     cl,bl                   ;The first byte is already fetched
        test    fbFetch,FF_TWO_INIT_FETCHES
        jz      @F
        add     cl,bl                   ;Yes, update to the current point
@@:
        or      bl,bl                   ;Get the offset from the next DWORD
        js      @F                      ;boundary!!
        not     cl                      ;For StepRight, the offset equals
        add     cl,4                    ;(4-current position) mod 4
@@:                                     ;For StepLeft, the offset equals
        inc     cl                      ;(current position + 1) mod 4
        shr     cl,1                    ;bit 0 of the offset -> carry
        jnc     @F                      ;If there is an odd byte,
        stosb                           ;fetch it first
        dec     dx
@@:
        shr     cl,1                    ;bit 1 of the offset -> carry
        jnc     cblt_5080               ;If there is an odd word,
        stosw                           ;fetch it once more
        dec     dx
        dec     dx

cblt_5080:
        mov     ax,si
        mov     cx,dx                   ;Preserve it for future use
        shr     cx,2                    ;Check the number of DWORD to blt

cblt_5090:                              ; This is being used as a double jmp pt for jz's
        jz      cblt_5130               ;No more bytes to move
        xor     bx,bx                   ;Flag as stepping from left to right
        cmp     bl,iStepDir             ;Moving from the right to the left?
        errnz   STEPLEFT                ;  (left direction must be zero)
        jnz     cblt_5100               ;  No
        mov     eax,I_DEC_SI_DEC_DI + (I_DEC_SI_DEC_DI shl 16)
        stosw                           ;  Yes, decrement both pointers
        stosd
        mov     ebx,I_INC_SI_INC_DI  + (I_INC_SI_INC_DI shl 16)
                                        ;Set up to increment the pointers later

cblt_5100:
        cmp     cx,1                    ;Move one word or many words?
        jz      cblt_5120               ;  Only one word
        mov     al,I_MOV_CX_WORD_I      ;  Many words, load count
        mov     ah,cl
        stosw
        mov     al,ch                   ;Set MSB of count
        mov     ah,I_REP                ;  and a repeat instruction
        stosw

cblt_5120:
        mov     ax,si                   ;Set the word instruction
        inc     ax
        or      ax,I_USE_386 shl 8      ;Prefix the 16-bit data transfer instr.
        xchg    ah,al                   ;with 66 to make it become 32-bit
        stosw

        .errnz   I_MOVSW - I_MOVSB - 1  ; The word form of the instruction
        .errnz   I_STOSW - I_STOSB - 1  ;   must be the byte form + 1

        or      bx,bx                   ; Need to increment the pointers?
        jz      cblt_5130               ;   No
        mov     eax,ebx                 ;   Yes, increment both pointers
        stosd
        stosw

cblt_5130:
        mov     ax,si
        shr     dx,1                    ;Byte count / 2 for words
        jnc     @F                      ;  No odd byte to move
        stosb

@@:                                     ; This is being used as a double jmp pt for jz's
        shr     dx,1                    ;move another two odd bytes if needed
        jnc     cblt_5140
        mov     ah,al                   ;We use two 8-bit transfer because we
        stosw                           ;don't want to adjust SI and DI

        .286P

cblt_no_inner_loop_code:
cblt_5140:
        jmp     cblt_inner_loop_done    ; Done setting up the innerloop
        page

;/*
;**     There is some count for the innerloop of the BLT.  Generate the
;**     required BLT. Two or four copies of the BLT will be placed on the
;**     stack.   This allows the LOOP instruction at the end to be distributed
;**     over two or four bytes instead of 1, saving 11 or 12 clocks for each
;**     byte (for 4).  Multiply 12 clocks by ~ 16K and you save a lot of
;**     clocks!
;**
;**     If there are less than four (two) bytes to be BLTed, then no looping
;**     instructions will be generated.  If there are more than four (two)
;**     bytes, then there is the possibility of an initial jump instruction
;**     to enter the loop to handle the modulo n result of the loop count.
;**
;**     The innerloop code will look something like:
;**
;**
;**     <       mov     cx,loopcount/n> ;load count if >n innerloop bytes
;**     <       jmp     short ???     > ;If a first jump is needed, do one
;**
;**     BLTloop:
;**             replicate initial byte BLT code up to n times
;**
;**     <       loop    BLTloop >       ;Loop until all bytes processed
;*/


cblt_cannot_rep:

;/*
;** Inner loop that cannot be expressed as a single REP string instruction.
;** From here down to cblt_inner_multiple, BL is tested against the F0_
;** flags and BH (loaded just below) against the F1_ flags; BL is not
;** loaded in this section, so it is presumably set by earlier code.
;**
;** If inner loop once code, use CX as loop counter but must push it for
;** ensuing pop also.
;*/

        mov     bh,fbMore       ; will be used a few times
        test    bh,F1_INNER_ONCE        ; only 1 fetch copy in loop ?
        jz      @F                      ; no...

        mov     al,I_MOV_CX_WORD_I      ; store loop count in CX
        stosb
        mov     ax,dx                   ; fetch inner loop byte count
        stosw

        mov     dx,di                   ; save offset of loop start (DX is
                                        ; moved to SI before cblt_5590)

        mov     al,I_PUSH_CX            ; save loop count
        stosb
@@:

;/*
;** If fetching a non-solid color pattern then a pattern fetch must occur
;** for each byte processed.
;*/

        test    bl,F0_PAT_PRESENT       ; is pattern being used ?
        jz      cblt_inner_no_pat       ; no...
        test    bl,F0_COLOR_PAT         ; color pattern fetch ?
        jz      cblt_inner_no_pat       ; no...
        test    bh,F1_MONO_COLOR        ; converting mono to color?
        jnz     cblt_inner_no_pat       ; yes -- ignore this code...

;/*
;** DI points to the beginning of the next pattern fetch code, and we want
;** dl_addr_pbrush_off_m to point to the brush offset within that code.
;*/

        BRUSH_OFF_FROM_START_FETCH equ 1

        mov     ax,BRUSH_OFF_FROM_START_FETCH

;/*
;** In the case of a memory source, code that loads the brush selector is
;** inserted ahead of the fetch, so compensate for its length if required.
;*/

        BRUSH_SELECTOR_LOAD equ 7

        test    bl,F0_SRC_PRESENT       ;Is there a source?
        jz      @F
        test    bl,F0_SRC_IS_DEV        ;Is the source the device?
        jnz     @F                      ;we prefix some code
        add     ax,BRUSH_SELECTOR_LOAD  ;for memory source
@@:
        add     di,ax                   ; point to where brush offset is
        mov     dl_addr_pbrush_off_m,di ; store for later reference
        sub     di,ax                   ; restore current code offset

        mov     ax,npbPatRow    ; -> the 7 of 7[BX]             ;
        mov     si,dl_addr_pbrush       ; fetch brush code from here
        sub     ax,si                   ; offset to the index
        add     ax,di                   ; now -> where index will live

;/*
;** Copy the brush fetch code as dwords plus a 0-3 byte tail.  ROR ECX,2
;** leaves count/4 in CX and parks the low two bits of the count at the top
;** of ECX; ROL ECX,2 brings them back as the byte count once CX has run
;** down to zero.
;*/

        CPUMode 386
        xor     ecx,ecx
        mov     cx,dl_addr_brush_end    ; calculate # bytes to move
        sub     cx,si
        ror     ecx,2
        rep     movs es:dword ptr [di],es:dword ptr [si]
        rol     ecx,2
        rep     movs es:byte ptr [di],es:byte ptr [si]
        CPUMode 286

        mov     si,ax                   ; point to index
        mov     ch,iDir         ; set brush index               ;
        add     es:byte ptr [si],ch     ; point to next brush byte
        .errnz   INCREASE - 1           ; must be a 1
        .errnz   DECREASE + 1           ; must be a -1
        and     es:byte ptr [si],7      ; keep mod 8

;/*
;** Emit code that advances the stored index by the same step each time the
;** generated loop runs:
;**     mov al,ss:[index] / add al,step / and al,7 / mov ss:[index],al
;*/

        mov     ax,(I_SS_OVERRIDE+(I_MOV_AL_MEM*256))
        stosw                           ;mov al,ss:[xxxx]
        mov     ax,si
        stosw
        mov     al,I_ADD_AL_BYTE_I
        mov     ah,ch                   ; set brush index
        .errnz   INCREASE - 1           ; must be a 1
        .errnz   DECREASE + 1           ; must be a -1
        stosw

        CPUMode 386
        mov     eax,(I_AND_AL_BYTE_I+(7*256))+((I_SS_OVERRIDE+(I_MOV_MEM_AL*256)) shl 16)
                                        ; and al,BrushIndexMask
        stosd                           ;mov ss:[xxxx],al
        CPUMode 286

        mov     ax,si                   ; address operand of the store
        stosw
cblt_inner_no_pat:
                                                                         
;/*
;** Certain circumstances (ie: color conversion, color pattern fetch)
;** are too unwieldy to replicate in the inner loop, because of number
;** of fixups and/or size of fetch code. These cases will be isolated
;** here and the inner loop will be just one copy.
;*/
                                                                         
;/*
;** Single-copy inner loop: copy the fetch/logic code once, patch it for
;** color conversion if needed, then join the common loop-closing code at
;** cblt_5590.  DX was set to the loop start when F1_INNER_ONCE was first
;** tested under cblt_cannot_rep.
;*/

        test    bh,F1_INNER_ONCE        ; 1 inner loop copy ?
        jz      cblt_inner_multiple     ; no...

        CPUMode 386
        xor     ecx,ecx
        mov     cx,npFetchEnd           ; get size of the fetch code
        mov     si,npFetchStart         ; and load -> to it also
        sub     cx,si
        mov     ax,di                   ; keep copy to start of fetch
        ror     ecx,2                   ; CX = dwords; low 2 bits parked high
        rep     movs es:dword ptr [di],es:dword ptr [si]
        rol     ecx,2                   ; CX = leftover byte count (0-3)
        rep     movs es:byte ptr [di],es:byte ptr [si]
        CPUMode 286

        test    bh,F1_COLOR_MONO
        jz      cblt_inner_not_color_mono

        mov     si,ax                   ; ES:SI is -> fetch code start
        test    bl,F0_SRC_IS_DEV        ; if the source is the device
        jz      @F                      ; we must offset into fetch code
        add     si,2                    ; by the size of XCHG SI,DX
        .errnz  (I_XCHG_SI_DX LE 0ffh)
                                        ; patch the copied fetch code's count
                                        ; field to 8 for full inner bytes
@@:     mov     es:word ptr[si][POS_CBLT_FETCH_8BPP_TO_MONO_COUNT],8
cblt_inner_not_color_mono:

        test    bh,F1_MONO_COLOR        ; converting mono to color ?
        jz      cblt_inner_not_mono_color; no...

        mov     si,ax                   ; ES:SI is -> fetch code start

        test    bl,F0_PAT_PRESENT       ; pattern to account for ?
        jz      @F                      ; no...
        mov     ch,es:[si][MONO_8BPP_LOOP_COUNT_OFF]; # initial pixels
        mov     ax,npbPatRow    ; -> initial brush index -      ;
        sub     ax,dl_addr_pbrush       ; -> brush start is index offset
        add     si,ax                   ; offset to it less fetch code
        mov     cl,es:[si][MONO_8BPP_FETCH_LEN]; fetch brush index
        add     cl,ch                   ; update brush index
        and     cl,SIZE_PATTERN-1       ; keep mod SIZE_PATTERN
        mov     es:[si][MONO_8BPP_FETCH_LEN],cl; store new brush index
        lea     cx,es:[si][MONO_8BPP_FETCH_LEN-BRUSH_INDEX_LESS_BRUSH_OFF]
        mov     dl_addr_pbrush_off_m,cx ; save -> inner loop brush offset
        sub     si,ax                   ; rewind fetch code start ->
@@:

        mov     es:byte ptr[si][MONO_8BPP_NORMALIZE_OFF],0
        mov     es:byte ptr[si][MONO_8BPP_LOOP_COUNT_OFF],8; full bytes
        jmp     short @F                ; skip store logic...
cblt_inner_not_mono_color:

        mov     al,I_STOSB              ; store logic
        stosb

@@:     mov     al,I_POP_CX             ; recover fetch counter
        stosb

        mov     si,dx                   ; SI is -> start of loop
        jmp     short cblt_5590         ; to common code below...
cblt_inner_multiple:

;/*
;** Replicated inner loop.  One logic sequence is the fetch code plus one
;** STOSB; it is replicated four times if shorter than 32 bytes, otherwise
;** twice.  DX is the inner loop byte count on entry.
;*/

        mov     bx,npFetchEnd           ; compute size of the fetch code
        sub     bx,npFetchStart
        inc     bx                      ; a stosb will be appended
        mov     si,4                    ; assume replication 4 times
        mov     cl,2                    ;  (shift count two bits left)
        cmp     bx,32                   ; small enough for 4 times?
        jc      cblt_5520               ;  yes, replicate 4 times
        shr     si,1                    ;  no,  replicate 2 times
        dec     cx                      ;  (CL becomes 1; only CL is used)

cblt_5520:
        cmp     dx,si                   ;Generate a loop?
        jle     cblt_5540               ;  No, just copy code
        mov     al,I_MOV_CX_WORD_I
        stosb                           ;mov cx,loopcount/n
        mov     ax,dx                   ;Compute loop count
        shr     ax,cl
        stosw
        shl     ax,cl                   ;See if loopcount MOD n is 0
        sub     ax,dx                   ;AX = -(loopcount MOD n)
        jz      cblt_5540               ;Zero, no odd count to handle

;/*
;**     There is an odd portion of bytes to be processed.  Increment
;**     the loop counter for the odd pass through the loop and then
;**     compute the displacement for entering the loop.
;**
;**     To compute the displacement, subtract the number of odd bytes
;**     from the modulus being used  (i.e. 4-3=1).  This gives the
;**     number of bytes to skip over the first time through the loop.
;**
;**     Multiply this by the number of bytes for a logic sequence,
;**     and the result will be the displacement for the jump.
;*/

        inc     word ptr es:-2[di]      ;Not zero, adjust for partial loop
        add     ax,si                   ;Compute where to enter the loop at
        mul     bl                      ;AX = AL * BL (8-bit multiply)
        mov     cx,ax
        mov     al,I_JMP_NEAR           ;Stuff jump instruction
        stosb
        mov     ax,cx                   ;Stuff displacement for jump
        stosw

;/*
;**       Currently:      DX = loop count
;**                       SI = loop modulus
;**                       BX = size of one logic operation
;**                       DI --> next location in the loop
;*/

cblt_5540:
        mov     cx,bx                   ;Set move count
        mov     bx,dx                   ;Set maximum for move
        cmp     bx,si                   ;Is the max > what's left?
        jle     cblt_5560               ;  No, just use what's left
        mov     bx,si                   ;  Yes, copy the max

cblt_5560:
        sub     dx,si                   ;If dx > 0, then loop logic needed
        mov     si,npFetchStart         ;--> fetch code to copy

        CPUMode 386
        mov     ax,cx                   ;Save a copy of fetch length
        shr     cx,2
        rep     movs es:dword ptr [di],es:dword ptr [si];Move fetch code and stuff stosb
        mov     cx,ax
        and     cx,3
        rep     movs es:byte ptr [di],es:byte ptr [si];Move fetch code and stuff stosb
        CPUMode 286

        mov     si,di                   ;--> new source (and top of loop)
        sub     si,ax
        mov     byte ptr es:-1[di],I_STOSB ;Overwrite the extra copied byte
        dec     bl                      ;One copy has been made
        mul     bl                      ;Compute # bytes left to move

;/*
;** Replicate the sequence just written.  Source (SI) and destination (DI)
;** overlap on purpose: DI leads SI by exactly one sequence length, so the
;** forward copy reproduces the first copy as many times as needed.
;** NOTE(review): with dword moves this presumably relies on the sequence
;** length being at least 4 bytes, since a shorter one would read bytes not
;** yet written -- confirm the minimum fetch/logic length.
;*/

        CPUMode 386
        mov     cx,ax                   ;Set move count
        shr     cx,2
        rep     movs es:dword ptr [di],es:dword ptr [si];Move fetch code and stuff stosb
        mov     cx,ax
        and     cx,3
        rep     movs es:byte ptr [di],es:byte ptr [si];Move fetch code and stuff stosb
        CPUMode 286

        sub     si,ax                   ;Restore pointer to start of loop

;/*
;**     The innermost BLT code has been created and needs the looping
;**     logic added to it.  If there is any looping to be done, then
;**     generate the loop code.  The code within the innerloop may be
;**     greater than 126 bytes, so a LOOP instruction may not be used
;**     in this case.
;**
;**     Currently:
;**             DX      =       # bytes remaining to process
;**             SI      =       -> start of loop
;**             DI      =       -> next place to compile code
;*/

cblt_5580:
        or      dx,dx                   ; need a loop?
        jle     cblt_inner_loop_done    ;   no, don't generate one

cblt_5590:
        mov     ax,si                   ; compute offset of loop
        sub     ax,di                   ;   (negative: the loop top is behind DI)
        cmp     ax,-125                 ; can this be a short label?
        jc      cblt_5600               ;   no, must make it a near jmp

        sub     al,2                    ; bias offset by length of LOOP inst.
        mov     ah,al                   ; AH = displacement, AL = opcode
        mov     al,I_LOOP
        stosw                           ; set the loop instruction
        jmp     short cblt_inner_loop_done; go process the last byte code

cblt_5600:

        CPUMode 386
        mov     si,BitmapOFFSET jmp_cx_nz ; move in the dec CX jnz code
        movsd                           ; copy the 4-byte template
        errnz   JMP_CX_NZ_LEN-4         ; must be four bytes long
        sub     ax,6                    ; adjust jump bias (4 template bytes
                                        ;  plus the 2-byte displacement)
        stosw                           ;  and store it into jump
        CPUMode 286


cblt_inner_loop_done:


        subttl  Compile - Last Byte Processing
        page

;/*
;**     All the innerloop stuff has been processed.  Now generate the code for
;**     the final byte if there is one.  This code is almost identical to the
;**     code for the first byte except there will only be one fetch (if a
;**     fetch is needed at all).
;**
;**     The code generated will look something like:
;**
;**     <       fetch           >       ; Get source byte
;**     <       align           >       ; Align source if needed
;**             action                  ; Perform desired action
;**             mask and store
;*/

        mov     bh,fbF0                 ; will be used a lot
        mov     bl,fbMore               ; ditto

        public  last_byte_processing    
last_byte_processing:                   

        mov     dx,mLast                ; get last byte mask
        or      dl,dl                   ; is there a last byte to be processed?
        njz     cblt_no_last_byte       ; no...


        mov     cl,bh                                   
        and     cl,F0_DEST_IS_COLOR or F0_SRC_IS_COLOR  
        xor     cl,F0_DEST_IS_COLOR or F0_SRC_IS_COLOR  
        njz     cblt_no_last_byte       

;/*
;** A non-solid color pattern fetch must occur for last byte processed.   
;*/
                                                                         
        test    bh,F0_PAT_PRESENT       ; is pattern being used ?        
        jz      cblt_last_no_pat        ; no...                          
        test    bh,F0_COLOR_PAT         ; color pattern fetch ?          
        jz      cblt_last_no_pat        ; no...                          
        test    bl,F1_MONO_COLOR        ; color converting?              
        jnz     cblt_last_no_pat        ; yes -- don't update brush yet  
                                                                         
;/*
;** We have DI points to the beginning of the next pattern fetch code, and
;** we want to have dl_addr_pbrush_off_l points to the brush offset within
;** this code.
;*/
                                                                         
        BRUSH_OFF_FROM_START_FETCH equ 1                                 
                                                                         
        mov     ax,BRUSH_OFF_FROM_START_FETCH                            
                                                                         
;/*
;** Remember? in case of a memory source we insert the brush selector's
;** loading code. So compensate that offset if required.
;*/
                                                                         
        BRUSH_SELECTOR_LOAD equ 7                                        
                                                                         
        test    bl,F0_SRC_PRESENT       ;Is there a source?              
        jz      @F                                                       
        test    bl,F0_SRC_IS_DEV        ;Is the source the device?       
        jnz     @F                      ;we prefix some code             
        add     ax,BRUSH_SELECTOR_LOAD  ;for memory source               
@@:                                                                      
        add     di,ax                   ; point to where brush offset is 
        mov     dl_addr_pbrush_off_l,di ; store for later reference      
        sub     di,ax                   ; restore current code offset    
                                                                        
        mov     ax,npbPatRow    ; -> the 7 of 7[BX]                   
        mov     si,dl_addr_pbrush       ; fetch brush code from here     
        sub     ax,si                   ; offset to the index            
        add     ax,di                   ; now -> where index will live   
                                                                         
        CPUMode 386                                                      
        xor     ecx,ecx                                                  
        mov     cx,dl_addr_brush_end    ; calculate # bytes to move      
        sub     cx,si                                                    
        ror     ecx,2                                                    
        rep     movs es:dword ptr [di],es:dword ptr [si]                 
        rol     ecx,2                                                    
        rep     movs es:byte ptr [di],es:byte ptr [si]                   
        CPUMode 286                                                      
                                                                         
        mov     si,ax                   ; ES:SI is -> index              
        mov     al,es:byte ptr [si]     ; fetch starting index           
        add     al,byte ptr cInnerByte; offset by bytes output  
        and     al,7                    ; keep mod pattern size          
        mov     es:byte ptr [si],al     ; store back                     
cblt_last_no_pat:                                                        

;/*
;** Decide whether the last byte needs its own source fetch and, if so,
;** copy the fetch code.  BH = fbF0 and BL = fbMore here.  On every exit
;** SI points at the code that follows the fetch (the logic/store code).
;*/

        mov     si,npFetchStart         ; -> start of fetch,logic code
        test    bh,F0_SRC_PRESENT       ; was there a source ?
        jnz     @F                      ; yes...
cblt_was_no_fetch_relay:
        jmp     cblt_was_no_fetch       ; no fetch if no source...
@@:

        mov     cx,cFetchCode   ; size of the fetch code only

        test    bl,F1_COLOR_CONVERSION  ; test for color conversion
        jnz     cblt_include_fetch      ; yes -- always include the fetch...
        test    fbFetch,FF_NO_LAST_FETCH
        jz      cblt_include_fetch

        add     si,cx                   ; assume skipping fetch
        cmp     iHorzPhase,0            ; phase zero case is not combined
                                        ; into innerloop as it should be.
                                        ; if the final byte is full then we
                                        ; better not remove the lodsb ( i.e.
        jne     cblt_was_no_fetch_relay ; 0-0=0 would make us think we could)
        sub     si,cx                   ; will be including fetch

cblt_include_fetch:

        mov     ax,di                   ; save -> start of fetch code

        CPUMode 386
        shr     cx,2                    ; whole dwords first
        rep     movs es:dword ptr[di],es:dword ptr[si]; copy fetch code
        mov     cx,cFetchCode           ; size of the fetch code only
        and     cx,3                    ; then the 0-3 leftover bytes
        rep     movs es:byte ptr[di],es:byte ptr[si]; copy fetch code
        CPUMode 286

;/*
;** If color converting from mono to color we have some fixups to do.
;*/
                                                                        
;/*
;** Mono -> color last byte fixups.  AX still holds the start of the fetch
;** code just copied for the last byte.
;*/

        test    bl,F1_MONO_COLOR        ; converting mono -> color ?
        jz      cblt_last_not_mono_color; no...

        mov     si,ax                   ; ES:SI is -> last byte fetch

        test    bh,F0_PAT_PRESENT       ; pattern to account for ?
        jz      @F                      ; no...
        mov     ch,es:[si][MONO_8BPP_LOOP_COUNT_OFF]; # initial pixels
        mov     cl,byte ptr cInnerByte; # inner loop bytes
        shl     cl,3                    ; now # inner loop pixels
        add     ch,cl                   ; now # pixels up to last byte
        mov     ax,npbPatRow    ; -> initial brush index -
        sub     ax,dl_addr_pbrush       ; -> brush start is index offset
        add     si,ax                   ; offset to it less fetch code
        mov     cl,es:[si][MONO_8BPP_FETCH_LEN]; fetch brush index
        add     cl,ch                   ; update brush index
        and     cl,SIZE_PATTERN-1       ; keep mod SIZE_PATTERN
        mov     es:[si][MONO_8BPP_FETCH_LEN],cl; store new brush index
        lea     cx,es:[si][MONO_8BPP_FETCH_LEN-BRUSH_INDEX_LESS_BRUSH_OFF]
        mov     dl_addr_pbrush_off_l,cx ; save -> last byte brush offset
        sub     si,ax                   ; rewind fetch code start ->
@@:

        mov     es:byte ptr[si][MONO_8BPP_NORMALIZE_OFF],0
        mov     dl,byte ptr xSrc        ; assume stepping left
        and     dl,7                    ; get pixels in left end
        neg     dl                      ; DL = 8 - (xSrc mod 8)
        add     dl,8
        cmp     iStepDir,STEPLEFT; step left correct ?
        je      @F                      ; yes...
        mov     dl,byte ptr xSrc        ; get pixels in right end byte
        add     dl,byte ptr cxExt       ; now last pixel addr + 1
        dec     dl                      ; now just last pixel addr
        and     dl,7                    ; keep mod 8
        inc     dl                      ; absolute # pixels
@@:     mov     es:byte ptr[si][MONO_8BPP_LOOP_COUNT_OFF],dl

        jmp     short cblt_last_done    ; skip remainder...
cblt_last_not_mono_color:

;/*
;** If color converting from color to mono we have some fixups to do.
;*/
                                                                        
;/*
;** Color -> mono last byte fixups: patch the pixel count into the fetch
;** code just copied, then emit a rotate that normalizes the partial byte.
;*/

        test    bl,F1_COLOR_MONO        ; converting color -> mono ?
        jz      cblt_last_not_color_mono; no...

;/*
;** get bit # in last byte and convert to # pixels in the last byte
;*/

        mov     bl,gl_last_bit          ; last bit # (BL no longer = fbMore)
        inc     bl                      ; assume stepping right
        cmp     iStepDir,STEPRIGHT                              ;
        je      @F                      ; we assumed correctly...
        neg     bl                      ; else adjust for stepping left
        add     bl,8+1                  ; the +1 to adj for the
@@:                                     ;       STEPRIGHT assume

        sub     ax,ax                   ; assume src is bitmap
        test    bh,F0_SRC_IS_DEV
        jz      @F
        mov     ax,2                    ; no -- src is device
        .errnz  (I_XCHG_SI_DX LE 0ffh)
@@:     sub     di,ax                   ; adjust for XCHG SI,DX ?
        mov     es:[di][CBLT_FETCH_8BPP_TO_MONO_COUNT],bl
        add     di,ax                   ; reverse adj. for XCHG SI,DX ?
cblt_last_color_mono:

        neg     bl                      ; 8 - # pixels = normalization rot
        add     bl,8
        mov     ax,I_ROL_AL_N           ; assume norm. to left end byte
        cmp     iStepDir,STEPRIGHT                              ;
        je      @F                      ; stepping right...
        mov     ah,HIGH I_ROR_AL_N      ; must norm. to right end byte
        .errnz  LOW I_ROR_AL_N - LOW I_ROL_AL_N
@@:     stosw                           ; emit the rotate opcode bytes
        mov     al,bl                   ; normalization rotation count
        stosb                           ; emit the rotate count
cblt_last_not_color_mono:

cblt_was_no_fetch:

        mov     cx,npFetchEnds          ; calculate bytes in logic,store code
        sub     cx,npFetchStart
        sub     cx,cFetchCode

        CPUMode 386
        mov     ax,cx
        shr     cx,2
        rep     movs es:dword ptr[di],es:dword ptr[si]; copy logic,store code
        mov     cx,ax
        and     cx,3
        rep     movs es:byte ptr[di],es:byte ptr[si]; copy logic,store code
        CPUMode 286

        mov     es:MASKED_STORE_MASK_MONO[di],dx; stuff last byte mask
cblt_8bpp_to_8bpp:           ;!!!GS   Do we want the above instruction if
cblt_last_done:              ;!!!GS   the dest is color? (but src mono)


        subttl  Compile - Looping Logic
        page

;/*
;**     Looping logic.
;**
;**     The looping logic must handle monochrome bitmaps, color bitmaps,
;**     huge bitmaps, the device, the presence or absence of a source
;**     or pattern, and mono <==> color interactions.
;**
;**     The type of looping logic is always based on the destination.
;*/

;/*
;**       Get saved parameters off of the stack.
;**
;**       <       pop     si            > ;Get source pointer
;**               pop     di              ;Get destination pointer
;**               pop     cx              ;Get loop count
;*/

cblt_no_last_byte:

        test    bh,F0_SRC_PRESENT       ;Is a source needed?
        jz      @F                      ;  No
        test    bh,F0_SRC_IS_DEV        ;Is it the device?
        jnz     @F                      ;  Yes

        mov     al,I_POP_SI             ;  Memory src, get source pointer
        stosb
@@:

        mov     ax,I_POP_DI_POP_CX      ;Get destination pointer
        stosw                           ;Get loop count


        subttl  Looping Logic - Brush Update
        page

;/*
;**  If a pattern was involved in the blt, two actions must be accounted
;**  for at the end of each scanline depending on whether or not we had a
;**  mono or color pattern fetch.
;** 
;**  Mono pattern fetches require that we update the index into the pattern
;**  at the end of each scanline. This is done here very simply.
;** 
;**  Color pattern fetches are a bit more complicated. We advanced the pattern
;**  index for each byte we processed in the blt along the scanline. The
;**  starting index is the same for each scanline. HOWEVER, the scan of the
;**  pattern that we index into must be updated. This is what we do at this
;**  time.
;*/

        test    bh,F0_PAT_PRESENT       ; is a pattern involved?
        jnz     @F
        jmp     cblt_6300               ; no...
@@:

        test    bh,F0_COLOR_PAT         ; is it a color pattern fetch ?
        jnz     @F                      ; yes...

;/*
;** mono or mask brushes just need to be updated to the next byte in the brush:
;*/

        mov     ax,I_SS_OVERRIDE+(I_MOV_AL_MEM*256)
        stosw                           ;mov al,ss:[xxxx]
        mov     dx,npbPatRow
        mov     ax,dx
        stosw
        mov     al,I_ADD_AL_BYTE_I
        mov     ah,iDir         ; add al,bias
        errnz   INCREASE-1              ; must be a 1
        errnz   DECREASE+1              ; must be a -1
        stosw

        CPUMode 386
        mov     eax,0700h+I_AND_AL_BYTE_I+((I_SS_OVERRIDE+(I_MOV_MEM_AL*256)) shl 16)
                                        ;and al,00000111b
        stosd                           ;mov ss:[xxxx],al
        CPUMode 286

        mov     ax,dx
        stosw
        jmp     cblt_6300

;/*
;**  color brushes are more complicated. We must update to the next row of the
;**  brush (ie: change the offset in the MOV BX,YYYY instructions) in each of the
;**  possible three places in the compiled code where the color pattern fetch may
;**  occur. The first fetch is always present, while the inner loop fetch and the
;**  last byte fetch may not.
;** 
;**  In addition, we may possibly have to reset the indices for the first,middle,
;**  and last brush fetches back to their initial values. (This is because the
;**  middle brush fetch is within a loop that advances the index to the next
;**  brush byte; this may also be the case for the first and last brush fetches
;**  if we are doing mono to color conversion).
;*/

        missing_code    <add in code for brush fetch index reset>

@@:     

;/*
;** Adjust initial brush fetch starting offset:
;*/

        mov     ax,(I_SS_OVERRIDE+(I_MOV_AX_MEM*256))
        stosw
        mov     si,dl_addr_pbrush_off_f ; -> first brush offset
        mov     ax,si
        stosw
        mov     al,I_ADD_AX_WORD_I
        mov     dh,iDir
        or      dh,dh
        jns     @F
        mov     al,I_SUB_AX_WORD_I
@@:     stosb
        mov     ax,SIZE_PATTERN
        stosw
        mov     al,I_CMP_AX_WORD_I
        stosb
        mov     cx,pBrush.off
        mov     ax,cx
        or      dh,dh
        js      @F
        add     ax,SIZE_PATTERN * SIZE_PATTERN
@@:     stosw
        mov     ax,I_JC_P5H
        or      dh,dh
        jns     @F
        mov     ax,I_JNC_P5H
@@:     stosw
        mov     al,I_MOV_AX_WORD_I
        stosb
        mov     ax,cx
        or      dh,dh
        jns     @F
        add     ax,(SIZE_PATTERN * SIZE_PATTERN) - SIZE_PATTERN
@@:     stosw
        mov     ax,(I_SS_OVERRIDE+(I_MOV_MEM_AX*256))
        stosw
        mov     ax,si
        stosw

;/*
;** Reinitialize first brush fetch starting index:
;*/

        mov     al,I_SS_OVERRIDE
        stosb
        mov     ax,I_MOV_MEM_BYTE_I
        stosw
        add     si,BRUSH_INDEX_LESS_BRUSH_OFF
        mov     ax,si
        stosw
        mov     al,es:[si]
        stosb                           ; initial brush starting index

        mov     si,dl_addr_pbrush_off_m
        or      si,si
        jz      @F                      ; no inner loop pattern fetch...

;/*
;** Adjust middle brush fetch starting offset:
;*/

        mov     ax,(I_SS_OVERRIDE+(I_MOV_MEM_AX*256))
        stosw
        mov     ax,si
        stosw

;/*
;** Reinitialize middle brush fetch starting index:
;*/

        mov     al,I_SS_OVERRIDE
        stosb
        mov     ax,I_MOV_MEM_BYTE_I
        stosw
        add     si,BRUSH_INDEX_LESS_BRUSH_OFF
        mov     ax,si
        stosw
        mov     al,es:[si]
        stosb                           ; initial brush starting index
@@:

        mov     si,dl_addr_pbrush_off_l
        or      si,si
        jz      @F                      ; no last byte pattern fetch...

;/*
;** Adjust last brush fetch starting offset:
;*/

        mov     ax,(I_SS_OVERRIDE+(I_MOV_MEM_AX*256))
        stosw
        mov     ax,si
        stosw

;/*
;** Reinitialize last brush fetch starting index:
;*/

        mov     al,I_SS_OVERRIDE
        stosb
        mov     ax,I_MOV_MEM_BYTE_I
        stosw
        add     si,BRUSH_INDEX_LESS_BRUSH_OFF
        mov     ax,si
        stosw
        mov     al,es:[si]
        stosb                           ; initial brush starting index
@@:

cblt_6300:


        subttl  Looping Logic - Scan Line Update
        page

;/*
;**     Generate the next scanline code.  The next scan line code must
;**     handle monochrome bitmaps, the device, huge bitmaps, the presence
;**     or absence of a source.
;**
;**     Also color bitmaps, and mono <==> color interactions.
;**
;**     <       add si,devSrc.next_scan> ;Normal source scan line update
;**     <       Huge Bitmap Update     > ;>64K source update code
;**             add di,devDst.next_scan  ;Normal destination scan line update
;**     <       Huge Bitmap Update     > ;>64K destination update code
;*/

        mov     ch,iDir         ;Load this for YUpdate code

        test    bh,F0_SRC_PRESENT       ;Is there a source?
        jz      cblt_6340               ;  No, skip source processing
        test    bh,F0_SRC_IS_DEV        ;  Yes, skip if it is the device
        jnz     cblt_6340

        mov     dx,I_ADD_SI_WORD_I      ;add si,increment
        mov     bx,((HIGH I_MOV_SI_AX)*256)+(HIGH I_LEA_AX_SI_DISP16)
        mov     cl,HIGH I_MOV_AX_DS
        push    bp
        lea     bp,devSrc
        call    y_update                ;Generate the Y scan line update code
        pop     bp                      ;Restore frame pointer

cblt_6340:
        mov     dx,I_ADD_DI_WORD_I      ;add reg,increment
        mov     bx,((HIGH I_MOV_DI_AX)*256)+(HIGH I_LEA_AX_DI_DISP16)
        mov     cl,HIGH I_MOV_AX_ES
        push    bp
        lea     bp,devDst               ;--> destination data
        call    y_update                ;Generate the Y scan line update code
        pop     bp                      ;Restore frame pointer

;/*
;**       Compile the scan line loop.  The code simply jumps to the start
;**       of the outer loop if more scans exist to be processed.
;*/

cblt_6380:
        mov     ax,pfnBlt.off   ;Compute relative offset of
        sub     ax,di                   ;  start of loop
        cmp     ax,-125                 ;Can this be a short label?
        jc      cblt_6400               ;  No, must make it a near jmp
        sub     al,2                    ;Bias offset by length of LOOP inst.
        mov     ah,al
        mov     al,I_LOOP
        stosw                           ;Set the loop instruction
        jmp     short cblt_6420

cblt_6400:
        mov     si,BitmapOFFSET jmp_cx_nz ;Move in the dec CX jnz code

        CPUMode 386
        movsd
        CPUMode 286

        errnz   JMP_CX_NZ_LEN-4         ;Must be four bytes long
        sub     ax,6                    ;Adjust jump bias
        stosw                           ;  and store it into jump

cblt_6420:
        mov     al,I_RET_FAR            ;Stuff the far return instruction
        stosb

;/*
;**------ ret --------
;*/
        db      I_RET_NEAR      ; near return
cEnd    <nogen>


        subttl  Phase Align Code Generation
        page
;/***************************************************************************
;*
;* FUNCTION NAME = phase_align_generate 
;*
;* DESCRIPTION   = Generate the phase alignment if any.                     
;*                                                                          
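;*                 Registers Destroyed:
;*                       AX,CX,SI,flags
;*
;* INPUT         = iHorzPhase = horizontal phase alignment (0 = none needed)
;*                 fbFetch    = FF_ONLY_1_SRC_BYTE / FF_ONLY_1_DEST_BYTE flags
;*                 ES:DI --> where to generate the code
;*                 (DH is not read by this routine)
;* OUTPUT        = ES:DI --> first byte past the generated code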
;*
;* RETURN-NORMAL = NONE
;* RETURN-ERROR  = NONE
;*
;**************************************************************************/

        assumes ds,nothing
        assumes es,nothing

        public  phase_align_generate
phase_align_generate        proc    near

;/*
;** Emit the phase alignment code at ES:DI.  Nothing is emitted when
;** iHorzPhase is zero.  Otherwise a rotate-AL-by-immediate instruction
;** is emitted, followed (unless both "only 1 byte" fetch flags are set)
;** by a copy of the PHASE_ALIGN_LEN byte template at cs:phase_align.
;*/

        mov     cl,iHorzPhase           ; fetch horizontal phase
        or      cl,cl                   ; any phase alignment ?
        jz      cblt_phase0             ; no, so skip alignment...

;/*
;** Pick the shorter rotation: phases 1..4 become ROL AL,n and
;** phases 5 and up become ROR AL,8-n.
;*/

        mov     ax,I_ROL_AL_N           ; assume rotate left n times
        cmp     cl,5                    ; 4 or less rotates?
        jc      @F                      ;  yes, keep the ROL and count as is
        neg     cl                      ;  no, compute ROR count = 8 - phase
        add     cl,8
        mov     ah,HIGH I_ROR_AL_N      ; only the high opcode byte differs
        .errnz   (LOW I_ROL_AL_N) - (LOW I_ROR_AL_N)
@@:     stosw                           ; emit the two rotate opcode bytes
        mov     al,cl                   ;  then the immediate rotate count
        stosb

;/*
;** Do not generate phase masking if there is only 1 src AND only 1 dest byte.
;** This is not just an optimization, see comments where these flags are set.
;*/

        mov     al,fbFetch
        and     al,FF_ONLY_1_SRC_BYTE or FF_ONLY_1_DEST_BYTE ; isolate both flags
        xor     al,FF_ONLY_1_SRC_BYTE or FF_ONLY_1_DEST_BYTE ; zero only if both set
        jz      @F                      ; both set, skip the masking template

        mov     si,BitmapOFFSET phase_align ; cs:si -> phase masking template

;/*
;** Copy exactly PHASE_ALIGN_LEN bytes of template to ES:DI.  The copy is
;** unrolled at assembly time: whole dwords first (REP MOVS when there
;** are two or more of them, which uses CX), then the remaining word
;** and/or byte.
;*/

        .386P
if      PHASE_ALIGN_LEN GE 8
        mov     cx,(PHASE_ALIGN_LEN SHR 2)      ; number of whole dwords
        rep     movs    dword ptr es:[di],dword ptr cs:[si]
else
if      PHASE_ALIGN_LEN AND 4
        movs    dword ptr es:[di],dword ptr cs:[si] ; the single whole dword
endif
endif
if      PHASE_ALIGN_LEN AND 2
        movs    word ptr es:[di],word ptr cs:[si]   ; leftover word
endif
if      PHASE_ALIGN_LEN AND 1
        movs    byte ptr es:[di],byte ptr cs:[si]   ; leftover byte
endif
        .286P

@@:
cblt_phase0:
        ret
phase_align_generate        endp


        subttl  Scan Line Update Generation
        page
;/***************************************************************************
;*
;* FUNCTION NAME = y_update 
;*
;* DESCRIPTION   = 
;*                 
;*     Generate Y update code.
;*    
;*    
;*     The Y update code is generated as follows:
;*    
;*     For the display, small bitmaps, and huge bitmaps where the BLT
;*     doesn't span a segment boundary, all that need be done is add
;*     next_scan to the offset portion of the bits pointer. next_scan
;*     is a 2's complement if the BLT is Y-, so an addition can always
;*     be done.
;*    
;*         < add   si,next_scan >
;*           add   di,next_scan
;*    
;*    
;*     For huge bitmaps where the BLT spans a segment boundary, the
;*     above update must be performed, and the overflow/underflow
;*     detected.  This isn't too hard to detect.
;*    
;*     For any huge bitmap, there can be a maximum of Planes*bmWidthBytes-1
;*     unused bytes in a 64K segment.  The minimum is 0.  The scan line
;*     update always updates to the first plane of the next (previous) scan.
;*    
;*    
;*     When the BLT is Y+, if the new offset is anywhere within the
;*     unused bytes of a segment, or in the first scan of a segment,
;*     then overflow must have occurred:
;*    
;*           -bmFillBytes <= offset < Planes*bmWidthBytes
;*    
;*     Since the update is always made to the first plane of a scan,
;*     Planes in the above equation can be thrown out.  Also, if
;*     bmFillBytes is added to both sides of the equation:
;*    
;*           0 <= offset < bmWidthBytes+bmFillBytes   (unsigned compare)
;*    
;*     will be true if overflow occurs.  The Y+ overflow check will
;*     look like:
;*    
;*    
;*         lea ax,bmFillBytes[si]                      ;Adjust for fill bytes now
;*         cmp ax,bmWidthBytes+bmFillBytes             ;Overflow occur?
;*         jnc NoOverflow                              ;  No
;*         cmp cx,2                                    ;Any more scans?
;*         jc  NoOverflow                              ;  No, don't update selector
;*         add si,bmFillBytes                          ;Step over fill bytes
;*         mov ax,ds                                   ;Compute new selector
;*         add ax,bmSegmentIndex
;*         mov ds,ax
;*    
;*       NoOverflow:
;*    
;*    
;*    
;*     For Y- BLTs, the test is almost the same.  The equation becomes
;*    
;*        -(Planes*bmWidthBytes) > offset             (unsigned compare)
;*    
;*     then underflow occurs.  Planes in the above equation cannot be
;*     thrown out.  The Y- underflow check will look like:
;*    
;*         mov ax,si
;*         cmp ax,-(Planes*bmWidthBytes)               ;Overflow occur?
;*         jc  NoOverflow                              ;  No
;*         cmp cx,2                                    ;Any more scans?
;*         jc  NoOverflow                              ;  No, don't update selector
;*         add si,bmFillBytes                          ;Step over fill bytes
;*         mov ax,ds                                   ;Compute new selector
;*         add ax,bmSegmentIndex
;*         mov ds,ax
;*    
;*     bmFillBytes and bmSegmentIndex will already be in 2's complement
;*     form by now if the BLT is Y-.
;*
;*                  Registers Preserved: 
;*                        DX,SI          
;*                  Registers Destroyed: 
;*                        AX,DI,flags    
;*
;* INPUT         = SS:BP --> source or destination data                 
;*                 SS:DI --> where to generate the code                 
;*                 DX     =  update register (add si,wordI & mov ax,si) 
;*                 BL     =  lea register (SI or DI)                    
;*                 BH     =  mov si,ax   or   mov di,ax register        
;*                 CL     =  segment register (DS or ES)                
;*                 CH     =  Direction                                  
;*
;* OUTPUT        = SS:BP --> source or destination data         
;*                 SS:DI --> where to generate the code         
;*                 BL     =  lea register (SI or DI)            
;*                 BH     =  mov si,ax   or   mov di,ax register
;*                 CL     =  segment register (DS or ES)        
;*                 CH     =  Direction                          
;*
;* RETURN-NORMAL = NONE
;* RETURN-ERROR  = NONE
;*
;**************************************************************************/

        assumes ds,nothing
        assumes es,nothing

        public  y_update
y_update        proc    near

;/*
;**       Generate the end-of-scan pointer update for one surface (the
;**       caller selects source or destination through BP and the
;**       opcode registers).
;**
;**       Entry:  SS:BP --> surface data (next_scan, dev_flags, fill_bytes,
;**                         comp_value, comp_test, seg_index)
;**               ES:DI --> where to generate the code
;**               DX     =  "add si,imm16" or "add di,imm16" opcode word
;**               BL     =  second byte of the matching "lea ax,disp16[reg]"
;**               CL     =  second byte of "mov ax,ds" or "mov ax,es"
;**               CH     =  Y direction (compared against DECREASE)
;**       Exit:   ES:DI --> first byte past the generated code
;**               DX is unchanged on every path; AX and flags are destroyed
;*/

;/*
;**       Stuff the scan line increment for the source or destination
;**
;**       <   add     si,1234h    >       ;Update source
;**       <   add     di,9ABCh    >       ;Update destination
;*/


        mov     ax,[bp].next_scan       ;Get the increment
        or      ax,ax                   ;If zero, don't generate the code
        jz      y_update_10
        xchg    ax,dx                   ;Set opcode (increment parked in DX)
        stosw
        xchg    ax,dx                   ;Set increment (opcode back in DX)
        stosw

y_update_10:
        test    [bp].dev_flags,DEV_SPANS_SEG
        jnz     spans_a_segment
        ret                             ;No segment crossing possible, all done

;/*
;**       The BLT spans a segment.  The code to detect when the segment is
;**       crossed must be generated, as given above.
;*/

spans_a_segment:

;/*
;**       Load AX with the offset to be tested:
;**
;**       Y-:     mov     ax,si/di
;**       Y+:     lea     ax,fill_bytes[si/di]
;*/

        mov     ah,dh                   ;Set register for MOV
        errnz   <(HIGH I_ADD_SI_WORD_I) - (HIGH I_MOV_AX_SI)>
        errnz   <(HIGH I_ADD_DI_WORD_I) - (HIGH I_MOV_AX_DI)>

        mov     al,LOW I_MOV_AX_SI      ;Assume Y- BLT
        errnz   <(LOW I_ADD_SI_WORD_I) - (LOW I_ADD_DI_WORD_I)>

        cmp     ch,DECREASE             ;Y- BLT?
        je      y_update_30             ;  Yes

        mov     ah,bl                   ;lea reg, bmFillBytes
        mov     al,LOW I_LEA_AX_SI_DISP16
        errnz   <(LOW I_LEA_AX_SI_DISP16) - (LOW I_LEA_AX_DI_DISP16)>

        stosw                           ;Y+: lea opcode bytes, disp16 follows
        mov     ax,fill_bytes[bp]

y_update_30:
        stosw                           ;Y-: "mov ax,reg"   Y+: lea disp16

;/*
;**       cmp     ax,comp_value
;**       jc/jnc  NoOverflow              ;first opcode byte is comp_test
;*/

        mov     al,I_CMP_AX_WORD_I
        stosb
        mov     ax,comp_value[bp]
        stosw

        mov     al,comp_test[bp]        ;Jump opcode chosen when data was set up
        mov     ah,HIGH I_JC_P12H       ;Second byte is the same for JC and JNC
        stosw
        errnz   <(HIGH I_JC_P12H) - (HIGH I_JNC_P12H)>

;/*
;**       cmp     cx,2
;**       jc      NoOverflow
;**
;**       I_CMP_CX_2 supplies the first two bytes of the compare.  Its
;**       immediate (2) is stored paired with the first byte of the JC.
;*/

        mov     ax,I_CMP_CX_2
        stosw

        mov     ax,2+((LOW I_JC_P0DH)*256)
        stosw

        mov     al,(HIGH I_JC_P0DH)
        stosb
        errnz   <(LOW I_MOV_SI_AX)-(LOW I_MOV_DI_AX)>

;/*
;**       add     si/di,fill_bytes        ;Step over fill bytes
;**
;**       The opcode is copied, not exchanged, out of DX so that DX still
;**       holds the caller's opcode word when this routine returns.
;*/

        mov     ax,dx                   ;Get add si, or add di,
        stosw
        mov     ax,fill_bytes[bp]
        stosw

;/*
;**       mov     ax,ds/es
;**       add     ax,seg_index
;**       mov     ds/es,ax
;*/

        mov     al,LOW I_MOV_AX_DS
        mov     ah,cl
        stosw
        errnz   <(LOW I_MOV_AX_DS)-(LOW I_MOV_AX_ES)>

        mov     al,I_ADD_AX_WORD_I
        stosb

        mov     ax,seg_index[bp]
        stosw

        mov     al,LOW I_MOV_DS_AX      ;mov SegmentReg,ax
        mov     ah,cl                   ;Same second byte as the mov ax,SegmentReg
        stosw
        errnz   <(LOW I_MOV_DS_AX)-(LOW I_MOV_ES_AX)>
        errnz   <(HIGH I_MOV_DS_AX)-(HIGH I_MOV_AX_DS)>
        errnz   <(HIGH I_MOV_ES_AX)-(HIGH I_MOV_AX_ES)>

y_update_40:
        ret

y_update        endp

        include cblt.pub

sEnd    Bitmap
end
