;
; AMD64-ABI Assembler versions of the darkness routines
;
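; These routines take their arguments in RDI, RSI, RDX and RCX (System V
; AMD64 calling convention). They will need modification to run on Windows,
; which uses the Microsoft x64 convention (arguments in RCX, RDX, R8, R9).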
;
[BITS 64]
[SECTION .text]
; Declare public symbols
GLOBAL _darken_asm_32
GLOBAL darken_asm_32
GLOBAL _darken_asm_32s
GLOBAL darken_asm_32s
GLOBAL _darken_asm_16
GLOBAL darken_asm_16
GLOBAL _darken_asm_16s
GLOBAL darken_asm_16s
GLOBAL _darken_asm_blit16
GLOBAL darken_asm_blit16
GLOBAL _darken_asm_blit32
GLOBAL darken_asm_blit32
;-----------------------------------------------------------------------
; darken_asm_32 - darken a run of 32-bit pixels using a per-pixel darkmap
; ABI:   System V AMD64
; In:    RDI = dest (32-bit pixels, modified in place)
;        RSI = src  (one 8-bit darkness level per dest pixel)
;        RDX = len  (number of pixels, read as a full 64-bit count;
;                    0 is allowed and does nothing)
; Out:   no return value is set up; RAX holds scratch data on return
; Clobb: RAX, RDX, RSI, RDI, XMM0, XMM1, flags
;
; For every pixel the darkness level is subtracted from each of the four
; bytes of the pixel (top byte included) with unsigned saturation, so a
; byte stops at 0 instead of wrapping round.
;
; The subtraction is done in XMM registers rather than MMX registers so
; that the x87/MMX state is never touched and no EMMS is needed before
; returning to the caller.
;-----------------------------------------------------------------------
_darken_asm_32:
darken_asm_32:
        test    rdx, rdx
        jz      .done                   ; empty run: touch nothing

.pixel:
        movzx   eax, byte [rsi]         ; eax = darkness level for this pixel
        imul    eax, eax, 0x01010101    ; copy the level into all four bytes
        movd    xmm1, eax
        movd    xmm0, dword [rdi]       ; xmm0 = dest pixel
        psubusb xmm0, xmm1              ; per-byte saturating subtract
        movd    dword [rdi], xmm0       ; write the darkened pixel back

        add     rdi, 4                  ; next dest pixel
        inc     rsi                     ; next darkmap byte
        dec     rdx
        jnz     .pixel

.done:
        ret
;;
;; Single colour (not using a darkmap)
;;
;-----------------------------------------------------------------------
; darken_asm_32s - darken a run of 32-bit pixels by one fixed level
; ABI:   System V AMD64
; In:    RDI = dest (32-bit pixels, modified in place)
;        RSI = src  (the darkness level itself, NOT an address;
;                    only the low 8 bits are used)
;        RDX = len  (number of pixels, read as a full 64-bit count;
;                    0 is allowed and does nothing)
; Out:   no return value is set up; RAX holds scratch data on return
; Clobb: RAX, RCX, RDX, RDI, XMM0, XMM1, flags
;
; The level is subtracted, with unsigned saturation, from the three low
; bytes of every pixel; the top byte is left as it was.
;
; We use colour separation in the roof projector, and 0 is transparent.
; So a pixel that was non-zero must not become 0 by being darkened: when
; that happens 0x01000000 is written instead. A pixel that was already 0
; stays 0.
;
; XMM registers are used instead of MMX registers so the x87/MMX state
; is never touched and no EMMS is needed before returning.
;-----------------------------------------------------------------------
_darken_asm_32s:
darken_asm_32s:
        test    rdx, rdx
        jz      .done                   ; empty run: touch nothing

        ; Build the subtrahend once: level in bytes 0..2, zero in byte 3.
        movzx   eax, sil
        imul    eax, eax, 0x00010101
        movd    xmm1, eax

.pixel:
        mov     ecx, [rdi]              ; ecx = original dest pixel
        movd    xmm0, ecx
        psubusb xmm0, xmm1              ; per-byte saturating subtract
        movd    eax, xmm0               ; eax = darkened pixel

        test    eax, eax
        jnz     .store                  ; still visible: store as-is
        test    ecx, ecx
        jz      .store                  ; was transparent: keep it 0
        mov     eax, 0x01000000         ; went to 0 by darkening: keep non-zero

.store:
        mov     [rdi], eax
        add     rdi, 4                  ; next dest pixel
        dec     rdx
        jnz     .pixel

.done:
        ret
;;
;; 16bpp darkness code
;;
;-----------------------------------------------------------------------
; darken_asm_16 - darken a run of 16-bit pixels using a per-pixel darkmap
; ABI:   System V AMD64
; In:    RDI = dest (16-bit pixels, modified in place)
;        RSI = src  (one 8-bit darkness level per dest pixel)
;        RDX = len  (number of pixels, read as a full 64-bit count;
;                    0 is allowed and does nothing)
;        RCX = LUT address (array of 16-bit entries)
; Out:   no return value is set up; RAX holds scratch data on return
; Clobb: RAX, RDX, RSI, RDI, R8, flags
;
; Each pixel is replaced by a lookup-table entry:
;     level5 = level >> 3                   (top 5 bits of the darkmap byte)
;     *dest  = LUT[(level5 << 16) + *dest]
; The largest index that can be formed is 0x1FFFFF, so the table must
; hold 32 * 65536 16-bit entries.
;
; The LUT pointer stays in RCX, so no callee-saved register is needed.
;-----------------------------------------------------------------------
_darken_asm_16:
darken_asm_16:
        test    rdx, rdx
        jz      .done                   ; empty run: touch nothing

.pixel:
        movzx   eax, byte [rsi]         ; eax = darkness level
        and     eax, 0xf8               ; drop the low 3 bits...
        shl     eax, 13                 ; ...so this is (level >> 3) << 16
        movzx   r8d, word [rdi]         ; r8d = dest pixel
        add     eax, r8d                ; eax = table index
        movzx   eax, word [rcx + rax*2] ; 16-bit entries, hence the *2
        mov     [rdi], ax               ; write the darkened pixel back

        add     rdi, 2                  ; next dest pixel
        inc     rsi                     ; next darkmap byte
        dec     rdx
        jnz     .pixel

.done:
        ret
; Single colour (not using a darkmap)
;-----------------------------------------------------------------------
; darken_asm_16s - darken a run of 16-bit pixels by one fixed level
; ABI:   System V AMD64
; In:    RDI = dest (16-bit pixels, modified in place)
;        RSI = src  (the darkness level itself, NOT an address;
;                    only bits 3..7 are used)
;        RDX = len  (number of pixels, read as a full 64-bit count;
;                    0 is allowed and does nothing)
;        RCX = LUT address (array of 16-bit entries)
; Out:   no return value is set up; RAX holds scratch data on return
; Clobb: RAX, RCX, RDX, RSI, RDI, flags
;
; Each pixel is replaced by a lookup-table entry:
;     level5 = level >> 3
;     *dest  = LUT[(level5 << 16) + *dest]
; Because the level is the same for the whole run, the address of
; LUT[level5 << 16] is worked out once and the loop only adds the pixel.
;-----------------------------------------------------------------------
_darken_asm_16s:
darken_asm_16s:
        test    rdx, rdx
        jz      .done                   ; empty run: touch nothing

        ; shl 13 after masking with 0xf8 is (level >> 3) << 16
        and     esi, 0xf8
        shl     esi, 13
        lea     rcx, [rcx + rsi*2]      ; rcx = &LUT[level5 << 16]

.pixel:
        movzx   eax, word [rdi]         ; eax = dest pixel
        movzx   eax, word [rcx + rax*2] ; 16-bit entries, hence the *2
        mov     [rdi], ax               ; write the darkened pixel back

        add     rdi, 2                  ; next dest pixel
        dec     rdx
        jnz     .pixel

.done:
        ret
; Bitmap combining
; 16bpp
;-----------------------------------------------------------------------
; darken_asm_blit16 - copy the non-zero 16-bit pixels of src over dest
; ABI:   System V AMD64
; In:    RDI = dest (16-bit pixels, modified in place)
;        RSI = src  (address of the 16-bit source pixels)
;        RDX = len  (number of pixels, read as a full 64-bit count;
;                    0 is allowed and does nothing)
; Out:   no return value is set up; RAX holds scratch data on return
; Clobb: RAX, RDX, RSI, RDI, flags
;
; A source pixel of 0 leaves the matching dest pixel untouched; any
; other value overwrites it.
;
; The source is read with an ordinary load, so the routine does not
; depend on the state of the direction flag.
;-----------------------------------------------------------------------
_darken_asm_blit16:
darken_asm_blit16:
        test    rdx, rdx
        jz      .done                   ; empty run: touch nothing

.pixel:
        movzx   eax, word [rsi]         ; eax = source pixel
        add     rsi, 2
        test    eax, eax
        jz      .skip                   ; 0 in the source: keep dest pixel
        mov     [rdi], ax
.skip:
        add     rdi, 2                  ; next dest pixel
        dec     rdx
        jnz     .pixel

.done:
        ret
; 32bpp
;-----------------------------------------------------------------------
; darken_asm_blit32 - copy the non-zero 32-bit pixels of src over dest
; ABI:   System V AMD64
; In:    RDI = dest (32-bit pixels, modified in place)
;        RSI = src  (address of the 32-bit source pixels)
;        RDX = len  (number of pixels, read as a full 64-bit count;
;                    0 is allowed and does nothing)
; Out:   no return value is set up; RAX holds scratch data on return
; Clobb: RAX, RDX, RSI, RDI, flags
;
; A source pixel of 0 leaves the matching dest pixel untouched; any
; other value overwrites it.
;
; The source is read with an ordinary load, so the routine does not
; depend on the state of the direction flag.
;-----------------------------------------------------------------------
_darken_asm_blit32:
darken_asm_blit32:
        test    rdx, rdx
        jz      .done                   ; empty run: touch nothing

.pixel:
        mov     eax, [rsi]              ; eax = source pixel
        add     rsi, 4
        test    eax, eax
        jz      .skip                   ; 0 in the source: keep dest pixel
        mov     [rdi], eax
.skip:
        add     rdi, 4                  ; next dest pixel
        dec     rdx
        jnz     .pixel

.done:
        ret