; Source: RISCI_ATOM/harvey -- mirror of https://github.com/Harvey-OS/harvey.git
;
;    Copyright (C) 1989, 1992, 1993 Aladdin Enterprises.  All rights reserved.
; 
; This file is part of AFPL Ghostscript.
; 
; AFPL Ghostscript is distributed with NO WARRANTY OF ANY KIND.  No author or
; distributor accepts any responsibility for the consequences of using it, or
; for whether it serves any particular purpose or works at all, unless he or
; she says so in writing.  Refer to the Aladdin Free Public License (the
; "License") for full details.
; 
; Every copy of AFPL Ghostscript must include a copy of the License, normally
; in a plain ASCII text file named PUBLIC.  The License grants you the right
; to copy, modify and redistribute AFPL Ghostscript, but only under certain
; conditions described in the License.  Among other things, the License
; requires that the copyright notice and this notice be preserved on all
; copies.

; $Id: iutilasm.asm,v 1.2 2000/09/19 19:00:46 lpd Exp $
; iutilasm.asm
; Assembly code for Ghostscript interpreter on MS-DOS systems

	ifdef	FOR80386

; For an 80386 build, let the assembler accept 80286 instructions.
; The 32-bit operations further down never use 80386 mnemonics: they are
; 16-bit instructions preceded by a hand-emitted 66h prefix (OP32), or
; raw opcode bytes emitted with db.
	.286c

	endif

; Everything in this file is assembled into this one word-aligned,
; public code segment.
utilasm_TEXT	SEGMENT	WORD PUBLIC 'CODE'
	ASSUME	CS:utilasm_TEXT


	ifdef	FOR80386

; Macro for 32-bit operand prefix.
; Emits the single byte 66h (operand-size override).  It must be placed
; immediately before the 16-bit instruction it modifies; that instruction
; then operates on the corresponding 32-bit registers / memory operand
; (e.g. OP32 followed by "mov ax,dx" behaves as "mov eax,edx").
OP32	macro
	db	66h
	endm

	endif					; FOR80386

; Zero a register.
; Expands to a single "xor target,target", so the flags are affected
; the same way as by any other xor.

clear	macro	target
	xor	target,target
	endm


	ifdef	FOR80386

; Replace the multiply and divide routines in the Turbo C library
; if we are running on an 80386.

; Macro to swap the halves of a 32-bit register.
; Unfortunately, masm won't allow a shift instruction with a count of 16,
; so we have to code it in hex.
; regno is the numeric encoding of the register (one of the regax..regbx
; equates below); it is added into the low bits of the second opcode byte.
; The OP32 prefix makes the rotate operate on the full 32-bit register.
swap	macro	regno
	  OP32
	db	0c1h,0c0h+regno,16		; rol regno,16
	endm
; Register numbers to pass to swap.
regax	equ	0
regcx	equ	1
regdx	equ	2
regbx	equ	3


; Multiply (dx,ax) by (cx,bx) to (dx,ax).
; Each 16-bit register pair is first packed into one 32-bit register
; (edx and ecx), a single 32-bit imul forms the product, and the low
; 32 bits of the product are then split back into dx (high word) and
; ax (low word).  The upper halves of eax, ecx and edx are overwritten.
; LXMUL@ and F_LXMUL@ are two names for the same far routine.
	PUBLIC	LXMUL@
	PUBLIC	F_LXMUL@
F_LXMUL@ proc	far
LXMUL@	proc	far
; Move the high word (dx) into the upper half of edx ...
	swap	regdx
	mov	dx,ax				; ... edx = (dx,ax)
; Same for the second operand: ecx = (cx,bx).
	swap	regcx
	mov	cx,bx
	  OP32
	db	0fh,0afh,0d1h			; imul dx,cx
	  OP32
	mov	ax,dx				; eax = product, so ax = low word
; Bring the high word of the product down into dx.
	swap	regdx
	ret
LXMUL@	endp
F_LXMUL@ endp


; Divide two stack operands, leave the result in (dx,ax).

; setup32 -- common entry sequence of the 32-bit divide/modulo routines.
;   bx  <- sp as it was on entry, so the far return address is at
;          ss:[bx], the dividend at ss:[bx+4] and the divisor at ss:[bx+8]
;   eax <- dividend (16-bit mov widened by the OP32 prefix)
; A DEBUG build also establishes a bp frame.  bx is captured before bp is
; pushed, so the operand offsets are the same in both builds.
setup32	macro
	mov	bx,sp
	ifdef	DEBUG
	push	bp
	mov	bp,sp
	endif
	  OP32
	mov	ax,ss:[bx+4]			; dividend
	endm

; ret32 argbytes -- matching exit sequence: drop the DEBUG frame, if one
; was built, then return and remove argbytes bytes of arguments.
ret32	macro	argbytes
	ifdef	DEBUG
	mov	sp,bp
	pop	bp
	endif
	ret	argbytes
	endm

	PUBLIC	LDIV@, LUDIV@, LMOD@, LUMOD@
	PUBLIC	F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@
; 32-bit divide and modulo, signed and unsigned, for the 80386.
; Stack on entry (far call): 32-bit dividend at ss:[sp+4], 32-bit divisor
; at ss:[sp+8].  The result is returned in (dx,ax); the 8 bytes of
; arguments are removed by the routine itself (ret 8).
; Each routine is one hardware divide.  The "word ptr" memory operands
; are really 32 bits wide because of the OP32 prefix in front of them.
; The upper halves of eax and edx are overwritten.
; Each X@ / F_X@ pair names the same far routine.

; Signed quotient.
F_LDIV@	proc	far
LDIV@	proc	far
	setup32
	  OP32
	cwd					; with OP32: sign-extend eax into edx
	  OP32
	idiv	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	dx,ax				; edx = quotient (ax = low word)
; dx = high word of the quotient.
	swap	regdx
	ret32	8
LDIV@	endp
F_LDIV@	endp
; Unsigned quotient.
F_LUDIV@ proc	far
LUDIV@	proc	far
	setup32
	  OP32
	xor	dx,dx				; with OP32: edx = 0 (zero-extend)
	  OP32
	div	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	dx,ax				; edx = quotient (ax = low word)
; dx = high word of the quotient.
	swap	regdx
	ret32	8
LUDIV@	endp
F_LUDIV@ endp
; Signed remainder.
F_LMOD@	proc	far
LMOD@	proc	far
	setup32
	  OP32
	cwd					; with OP32: sign-extend eax into edx
	  OP32
	idiv	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	ax,dx				; eax = remainder (ax = low word)
; dx = high word of the remainder.
	swap	regdx
	ret32	8
LMOD@	endp
F_LMOD@	endp
; Unsigned remainder.
F_LUMOD@ proc	far
LUMOD@	proc	far
	setup32
	  OP32
	xor	dx,dx				; with OP32: edx = 0 (zero-extend)
	  OP32
	div	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	ax,dx				; eax = remainder (ax = low word)
; dx = high word of the remainder.
	swap	regdx
	ret32	8
LUMOD@	endp
F_LUMOD@ endp

	else					; !FOR80386

; Replace the divide routines in the Turbo C library,
; which do the division one bit at a time (!).

	PUBLIC	LDIV@, LMOD@, LUDIV@, LUMOD@
	PUBLIC	F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@

; negbp disp -- negate, in place, the 32-bit value in the bp frame whose
; low word is at [bp+disp] and whose high word is at [bp+disp+2].
; The order of the three instructions matters: negating the low word
; leaves CF set exactly when that word was non-zero, and the sbb then
; takes that borrow out of the already negated high word.
negbp	macro	disp
	neg	word ptr [bp+disp+2]		; high word first
	neg	word ptr [bp+disp]		; low word, CF = (low != 0)
	sbb	word ptr [bp+disp+2],0		; propagate the borrow
	endm

; negr -- the same 32-bit negation applied to the register pair (dx,ax).
negr	macro
	neg	dx
	neg	ax
	sbb	dx,0
	endm

; Divide two unsigned longs on the stack.
; Leave either the quotient or the remainder in (dx,ax).
; Operand offsets assume that bp (and only bp) has been pushed.
; With that frame, [bp+0] is the saved bp and [bp+2] the far return
; address, so the first long argument starts at [bp+6].
nlo	equ	6		; numerator, low word
nhi	equ	8		; numerator, high word
dlo	equ	10		; denominator, low word
dhi	equ	12		; denominator, high word

; We use an offset in bx to distinguish div from mod,
; and to indicate whether the result should be negated.
; The values are byte offsets into four-entry tables of word addresses
; (the jump tables that select the exit code).
odiv	equ	0		; quotient
omod	equ	2		; remainder
odivneg	equ	4		; quotient, negated
omodneg	equ	6		; remainder, negated
; Signed remainder.
; Replaces both stack operands by their absolute values, picks omod or
; omodneg (remainder negated when the numerator was negative), and then
; continues at udiv, the shared unsigned divide code.
; The label negnum is also jumped to from LDIV@, with bx already set.
F_LMOD@	proc	far
LMOD@	proc	far
	push	bp
	mov	bp,sp
	mov	bx,omod
			; Take abs of denominator
	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
	jge	modpd				; top byte >= 0: already positive
	negbp	dlo
modpd:			; Negate mod if numerator < 0
	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
	jge	udiv
	mov	bx,omodneg
negnum:	negbp	nlo
	jmp	udiv
LMOD@	endp
F_LMOD@	endp
; Unsigned remainder.
; Selects omod and joins the shared code at udpush, which builds the bp
; frame and performs the unsigned division.
F_LUMOD@ proc	far
LUMOD@	proc	far
	mov	bx,omod
	jmp	udpush
LUMOD@	endp
F_LUMOD@ endp
; Signed quotient.
; Picks odiv or odivneg from the signs of the operands (quotient negated
; when they differ), replaces both stack operands by their absolute
; values, and then continues at udiv, the shared unsigned divide code.
F_LDIV@	proc	far
LDIV@	proc	far
	push	bp
	mov	bp,sp
	mov	bx,odiv
			; Negate quo if num^den < 0
	mov	ax,[bp+nhi]
	xor	ax,[bp+dhi]			; sign bit set if the signs differ
	jge	divabs
	mov	bx,odivneg
divabs:			; Take abs of denominator
	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
	jge	divpd
	negbp	dlo
divpd:			; Take abs of numerator
	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
	jge	udiv
	jmp	negnum				; negate numerator, then go to udiv
LDIV@	endp
F_LDIV@	endp
; Unsigned quotient, plus the divide core shared by LMOD@, LUMOD@ and
; LDIV@.
;
; Entry points:
;   LUDIV@  far call; selects odiv and falls into udpush.
;   udpush  bx = selector; builds the bp frame.
;   udiv    bx = selector, bp frame already built, both stack operands
;           treated as unsigned.
; The selector (odiv / omod / odivneg / omodneg) is saved on the stack
; during the division and finally used to index a jump table that picks
; the exit: quotient or remainder, optionally negated.  Every exit
; returns the result in (dx,ax) and removes the two long arguments
; (ret 8).
; Three cases are distinguished: 32/16 with a 16-bit quotient, 32/16 with
; a 32-bit quotient, and a true 32/32 division.
F_LUDIV@ proc	far
LUDIV@	proc	far
	mov	bx,odiv
udpush:	push	bp
	mov	bp,sp
udiv:	push	bx				; odiv, omod, odivneg, omodneg
	mov	ax,[bp+nlo]
	mov	dx,[bp+nhi]
	mov	bx,[bp+dlo]
	mov	cx,[bp+dhi]
; Now we are dividing dx:ax by cx:bx.
; Check to see whether this is really a 32/16 division.
	or	cx,cx
	jnz	div2
; 32/16, check for 16- vs. 32-bit quotient
; (a single div would overflow unless dx < bx).
	cmp	dx,bx
	jae	div1
; 32/16 with 16-bit quotient, just do it.
	div	bx				; ax = quo, dx = rem
	pop	bx				; selector
	pop	bp
	jmp	cs:xx1[bx]
	even
; Exit table for a 16-bit quotient in ax and remainder in dx.
xx1	dw	offset divx1
	dw	offset modx1
	dw	offset divx1neg
	dw	offset modx1neg
modx1:	mov	ax,dx				; result = remainder
divx1:	xor	dx,dx				; high word of result is 0
	ret	8
modx1neg: mov	ax,dx				; result = remainder
divx1neg: xor	dx,dx				; high word is 0 before negation
; rneg: negate (dx,ax) and return; shared by all the negated exits.
rneg:	negr
	ret	8
; 32/16 with 32-bit quotient, do in 2 parts.
div1:	mov	cx,ax				; save lo num
	mov	ax,dx
	xor	dx,dx
	div	bx				; ax = hi quo
	xchg	cx,ax				; save hi quo, get lo num
	div	bx				; ax = lo quo, dx = rem
	pop	bx				; selector
	pop	bp
	jmp	cs:xx1a[bx]
	even
; Exit table for quotient in (cx,ax) and remainder in dx.  The remainder
; exits are the same as for the 16-bit-quotient case.
xx1a	dw	offset divx1a
	dw	offset modx1
	dw	offset divx1aneg
	dw	offset modx1neg
divx1a:	mov	dx,cx				; hi quo
	ret	8
divx1aneg: mov	dx,cx				; hi quo
	jmp	rneg
; This is really a 32/32 bit division.
; (Note that the quotient cannot exceed 16 bits.)
; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2
; (first edition).
; Start by normalizing the numerator and denominator.
; Both (dx,ax) and (cx,bx) are shifted right by the same amount until
; the denominator fits in bx alone (cx = 0).
div2:	or	ch,ch
	jz	div21				; ch == 0, but cl != 0
; Do 8 steps all at once.
; (byte moves shift both register pairs right by 8 bits)
	mov	bl,bh
	mov	bh,cl
	mov	cl,ch
	xor	ch,ch
	mov	al,ah
	mov	ah,dl
	mov	dl,dh
	xor	dh,dh
; The rol leaves the top bit of bx in CF, so the rcr at div2a restores bx
; unchanged; this lets us fall into the loop instead of jumping to div21.
	rol	bx,1				; faster than jmp
div2a:	rcr	bx,1				; finish previous shift
div21:	shr	dx,1
	rcr	ax,1
	shr	cx,1				; bit shifted out goes to CF for bx
	jnz	div2a
	rcr	bx,1
; Now we can do a 32/16 divide.
; The quotient in ax is only a trial value; the correction loop at div3
; can only decrease it.
div2x:	div	bx				; ax = quo, dx = rem
; Multiply by the denominator, and correct the result.
	mov	cx,ax				; save quotient
	mul	word ptr [bp+dhi]
	mov	bx,ax				; save lo part of hi product
	mov	ax,cx
	mul	word ptr [bp+dlo]
	add	dx,bx
; Now cx = trial quotient, (dx,ax) = cx * denominator.
; Negate the product: ax = -ax, dx = ~dx + (ax == 0).
	not	dx
	neg	ax
	cmc
	adc	dx,0				; double-precision neg
; A carry out of the adc means the product was 0.
	jc	divz				; zero quotient
						; requires special handling
; (dx,ax) = numerator - product; carry set means the difference is >= 0.
	add	ax,[bp+nlo]
	adc	dx,[bp+nhi]
	jc	divx
; Quotient is too large, adjust it.
; Add the denominator back until the remainder becomes non-negative.
div3:	dec	cx
	add	ax,[bp+dlo]
	adc	dx,[bp+dhi]
	jnc	div3
; All done.  (dx,ax) = remainder, cx = lo quotient.
divx:	pop	bx				; selector
	pop	bp
	jmp	cs:xx3[bx]
	even
; Exit table for quotient in cx and remainder in (dx,ax).
xx3	dw	offset divx3
	dw	offset modx3
	dw	offset divx3neg
	dw	offset modx3neg
divx3:	mov	ax,cx
	xor	dx,dx
modx3:	ret	8
divx3neg: mov	ax,cx
	xor	dx,dx
modx3neg: jmp	rneg
; Handle zero quotient specially.
; Only the selector is popped here; bp stays valid because the remainder
; exits still read (and possibly negate) the numerator in the frame.
divz:	pop	bx
	jmp	cs:xxz[bx]
	even
; Exit table for a zero quotient.  Both quotient selectors share divxz,
; since -0 = 0; on this path (dx,ax) already holds 0.
xxz	dw	offset divxz
	dw	offset modxz
	dw	offset divxz
	dw	offset modxzneg
divxz:	pop	bp
	ret	8
; The remainder equals the numerator; negate it in the frame first if the
; selector asks for a negated remainder.
modxzneg: negbp	nlo
modxz:	mov	ax,[bp+nlo]
	mov	dx,[bp+nhi]
	pop	bp
	ret	8
LUDIV@	endp
F_LUDIV@ endp

	endif					; FOR80386


	ifdef	NOFPU

; See gsmisc.c for the C version of this code.

; /*
;  * Floating multiply with fixed result, for avoiding floating point in
;  * common coordinate transformations.  Assumes IEEE representation,
;  * 16-bit short, 32-bit long.  Optimized for the case where the first
;  * operand has no more than 16 mantissa bits, e.g., where it is a user space
;  * coordinate (which are often integers).
;  *
;  * The assembly language version of this code is actually faster than
;  * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking
;  * a trap on every FPU operation).  If there is no FPU, the assembly
;  * language version of this code is over 10 times as fast as the
;  * emulated FPU.
;  */
; fixed
; fmul2fixed_(long /*float*/ a, long /*float*/ b)
; {

	PUBLIC	_fmul2fixed_

; fixed fmul2fixed_(long /*float*/ a, long /*float*/ b)
;
; Multiply two floats, passed as their raw 32-bit IEEE bit patterns, and
; return the product as a 32-bit fixed-point value in (dx,ax).
;
; Stack arguments (far call, bp frame):
;	[bp+6]   a	(alo = low word, ahi = high word)
;	[bp+10]  b	(blo = low word, bhi = high word)
; Returns with a plain far ret; the arguments are left on the stack.
; si, di and bp are preserved; ax, bx, cx, dx and the flags are not.
;
; Method (the C statements being implemented are quoted as comments):
;   e  = 260 + 12 - (exponent field of a + exponent field of b)
;   ma = bits 8..23 of a with the top bit forced on; mb likewise for b
;   p  = ma * mb + (low-order partial products >> 8)
;   result = p >> e  (p << -e if e is negative, 0 if e >= 32),
;            negated if the sign bits of a and b differ.
;
; The label fmf is a second entry point: _dfmul2fixed_ jumps to it with
; the same frame built, si and di pushed, and dx = [bp+ahi].
_fmul2fixed_ proc far
	push	bp
	mov	bp,sp
a	equ	6
alo	equ	a
ahi	equ	a+2
b	equ	10
blo	equ	b
bhi	equ	b+2
	push	si		; will hold ma
	push	di		; will hold mb

; 	int e = 260 + _fixed_shift - ((
; 		(((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80)
; 	  ) >> 7);

	mov	dx,[bp+ahi]
; dfmul2fixed enters here
fmf:	mov	cx,260+12
	mov	ax,[bp+bhi]
	and	ax,7f80h	; exponent field of b, still shifted left by 7
	and	dx,7f80h	; exponent field of a, likewise
	add	ax,dx
; The low 7 bits of ax are zero, so rotating left by 8 + 1 = 9 bits is
; the same as shifting right by 7.
	xchg	ah,al		; ror ax,7 without using cl
	rol	ax,1
	sub	cx,ax
	push	cx		; e

; 	ulong ma = (ushort)(a >> 8) | 0x8000;
; 	ulong mb = (ushort)(b >> 8) | 0x8000;

	mov	si,[bp+alo+1]	; unaligned
	clear	ax
	mov	di,[bp+blo+1]	; unaligned
	or	si,8000h
	or	di,8000h

; 	ulong p1 = ma * (b & 0xff);

	mov	al,[bp+blo]
	mul	si		; (dx,ax) = p1

;			(Do this later:)
; 	ulong p = ma * mb;

; 	if ( (byte)a )		/* >16 mantissa bits */

	cmp	byte ptr [bp+alo],0
	je	mshort

; 	{	ulong p2 = (a & 0xff) * mb;
; 		p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8;

	mov	cx,dx
	mov	bx,ax		; (cx,bx) = p1
	clear	ax
	mov	al,[bp+alo]
	clear	dx
	mov	dl,[bp+blo]
	mul	dx		; ax = (byte)a * (byte)b, dx = 0
	mov	dl,ah		; dx = that product >> 8
; Add the byte product (in dx) to p1.  Adding cx here instead would add
; the high word of p1 to itself and drop the byte product altogether.
	add	bx,dx
	adc	cx,0		; (cx,bx) = p1 + (byte product >> 8)
	clear	ax
	mov	al,[bp+alo]
	mul	di		; (dx,ax) = p2
	add	ax,bx
	adc	dx,cx		; (dx,ax) = (byte product >> 8) + p1 + p2

; 	}

mshort:

; 	else
; 		p += p1 >> 8;

; Here (dx,ax) is the low-order sum (just p1 on the short path).
	mov	bl,ah		; set (cx,bx) = (dx,ax) >> 8
	mov	bh,dl
	clear	cx
	mov	cl,dh
	mov	ax,si
	mul	di		; (dx,ax) = ma * mb
	add	ax,bx
	adc	dx,cx		; (dx,ax) = p

; 	if ( (uint)e < 32 )		/* e = -1 is possible */
; The assembly splits this range in two: 0 <= e < 16 is handled at shr1,
; 16 <= e < 32 just below by moving dx into ax first.

	pop	cx		; e
	cmp	cx,16
	jb	shr1		; unsigned compare: 0 <= e < 16

; 	else if ( e >= 32 )		/* also detects a=0 or b=0 */

	cmp	cx,0
	jl	eneg		; e < 0: shift left instead
	sub	cx,16
	cmp	cx,16
	jge	shr0		; e >= 32: result is 0
	mov	ax,dx		; 16 <= e < 32: p >> 16 ...
	clear	dx
	shr	ax,cl		; ... then >> (e - 16)
	jmp	ex

; 		return fixed_0;
; (control still passes through ex; negating 0 leaves 0)

shr0:	clear	ax
	clear	dx
	jmp	ex

; 	else
; 		p <<= -e;

	even
eneg:	neg	cx
	shl	dx,cl
	mov	bx,ax
	shl	ax,cl
	rol	bx,cl
	xor	bx,ax		; bx = the bits shifted out of the top of ax
	add	dx,bx		; move them into the bottom of dx
	jmp	ex

; 		p >>= e;

	even
shr1:	shr	ax,cl
	mov	bx,dx
	shr	dx,cl
	ror	bx,cl
	xor	bx,dx		; bx = the bits shifted out of the bottom of dx
	add	ax,bx		; move them into the top of ax

ex:

; 	return ((a ^ b) < 0 ? -p : p);

	mov	cx,[bp+ahi]
	xor	cx,[bp+bhi]	; sign bit set if the signs differ
	jge	pos
	neg	dx		; (dx,ax) = -(dx,ax)
	neg	ax
	sbb	dx,0
pos:

; }

retu:	pop	di
	pop	si
	mov	sp,bp
	pop	bp
	ret

_fmul2fixed_ ENDP

; The same routine with the first argument a double rather than a float.
; The argument is split into two pieces to reduce data movement.
;
; Stack arguments (far call, bp frame):
;	[bp+6]   xalo	low 32 bits of the double a
;	[bp+10]  b	the float operand, in the slot _fmul2fixed_ uses
;	[bp+14]  xahi	high 32 bits of the double a
; The double is reduced to a 32-bit float-style bit pattern, which is
; written over the xalo slot (that is, into [bp+alo] / [bp+ahi]).
; Control then jumps to fmf inside _fmul2fixed_ with dx = the new high
; word; that code computes the result and returns to our caller.

	PUBLIC	_dfmul2fixed_
_dfmul2fixed_ proc far
	push	bp
	mov	bp,sp
xalo	equ	6
;b	equ	10
xahi	equ	14
	push	si		; overlap this below
	push	di		; ditto

; Shuffle the arguments and then use fmul2fixed.

; Squeeze 3 exponent bits out of the top 35 bits of a.
; The top two bits of a are kept in place (bx); the next three bits are
; dropped; the 11 bits left in dx, followed by ax and the top bits of cx,
; are shifted left by 3 to close the gap.

	mov	dx,[bp+xahi+2]	; top word of a
	mov	bx,0c000h
	mov	ax,[bp+xahi]
	and	bx,dx		; bx = top two bits of a
	mov	cx,[bp+xalo+2]
	and	dx,7ffh		; get rid of discarded bits
; First of the three 1-bit shifts of (dx,ax,cx).  CF from this add is
; still pending when either path below executes its first adc.
	add	cx,cx		; faster than shl!
	jz	cz		; detect common case
	adc	ax,ax		; faster than rcl!
	adc	dx,dx
	add	cx,cx
	adc	ax,ax
	adc	dx,dx
	add	cx,cx
	adc	ax,ax
	mov	[bp+alo],ax	; mov leaves CF intact for the adc below
	adc	dx,dx
	or	dx,bx		; put the two kept top bits back
	mov	[bp+ahi],dx
	jmp	fmf
	even
; Common case: cx is zero after the first shift, so the remaining two
; shifts bring in only zero bits and cx need not be shifted again.
cz:	adc	ax,ax
	adc	dx,dx
	add	ax,ax
	adc	dx,dx
	add	ax,ax
	mov	[bp+alo],ax	; mov leaves CF intact for the adc below
	adc	dx,dx
	or	dx,bx		; put the two kept top bits back
	mov	[bp+ahi],dx
	jmp	fmf

_dfmul2fixed_ ENDP

	endif					; NOFPU


; Transpose an 8x8 bit matrix.  See gsmisc.c for the algorithm in C.
;
; Stack arguments (far call); offsets are relative to sp after the three
; register pushes at the top of the routine:
;	+10  inp	far pointer to the first input row
;	+14  line_size	byte step between input rows
;	+16  outp	far pointer to the first output row
;	+20  dist	byte step between output rows
; ds, si and di are preserved; ax, bx, cx and dx are scratch.
; Returns with a plain far ret, leaving the arguments on the stack.

; nextin dst -- step si by the stride in di and load one row byte.
nextin	macro	dst
	add	si,di
	mov	dst,[si]
	endm

; nextout src -- step si by the stride in di and store one row byte.
nextout	macro	src
	add	si,di
	mov	[si],src
	endm

; bitswap ra,rb,cnt,bits -- transposition step, see C code for explanation.
; The bits of ra selected by (bits shl cnt) trade places with the bits
; of rb selected by bits.  si is used as scratch.
bitswap	macro	ra,rb,cnt,bits
	mov	si,ra
	shr	si,cnt
	xor	si,rb
	and	si,bits
	xor	rb,si
	shl	si,cnt
	xor	ra,si
	endm

	PUBLIC	_memflip8x8
_memflip8x8 proc far
	push	ds
	push	si
	push	di
		; Fetch the input stride and pointer.  The ss: overrides are
		; needed because si-based addressing defaults to ds.
	mov	si,sp
	mov	di,ss:[si+14]			; line_size
	lds	si,ss:[si+10]			; inp
		; Read the eight rows A..H, one byte each.
		; Register layout after loading:
		;   ax = AB, cx = CD, bx = EF, dx = GH.
		; Layout wanted for the later steps:
		;   ax = AE, bx = BF, cx (or di) = CG, dx = DH.
	mov	ah,[si]
	nextin	al
	nextin	ch
	nextin	cl
	nextin	bh
	nextin	bl
	nextin	dh
	nextin	dl
		; Exchange 4x4 blocks.  cl must hold the shift count, so
		; CD is parked in di meanwhile.
	mov	di,cx
	mov	cl,4
	bitswap	bx,ax,cl,0f0fh
	bitswap	dx,di,cl,0f0fh
		; Re-pair the rows: B <-> E and D <-> G.
	xchg	al,bh
	mov	cx,di
	xchg	cl,dh
		; Exchange 2x2 blocks, again with cx parked in di.
	mov	di,cx
	mov	cl,2
	bitswap	di,ax,cl,3333h
	bitswap	dx,bx,cl,3333h
	mov	cx,di				; cl is free again from here on
		; Exchange single bits.
	bitswap	bx,ax,1,5555h
	bitswap	dx,cx,1,5555h
		; Write the eight result rows.
	mov	si,sp
	mov	di,ss:[si+20]			; dist
	lds	si,ss:[si+16]			; outp
	mov	[si],ah
	nextout	bh
	nextout	ch
	nextout	dh
	nextout	al
	nextout	bl
	nextout	cl
	nextout	dl
		; Restore the saved registers and return.
	pop	di
	pop	si
	pop	ds
	ret
_memflip8x8 ENDP


utilasm_TEXT ENDS
	END