iutilasm.asm 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
  1. ; Copyright (C) 1989, 1992, 1993 Aladdin Enterprises. All rights reserved.
  2. ;
  3. ; This file is part of AFPL Ghostscript.
  4. ;
  5. ; AFPL Ghostscript is distributed with NO WARRANTY OF ANY KIND. No author or
  6. ; distributor accepts any responsibility for the consequences of using it, or
  7. ; for whether it serves any particular purpose or works at all, unless he or
  8. ; she says so in writing. Refer to the Aladdin Free Public License (the
  9. ; "License") for full details.
  10. ;
  11. ; Every copy of AFPL Ghostscript must include a copy of the License, normally
  12. ; in a plain ASCII text file named PUBLIC. The License grants you the right
  13. ; to copy, modify and redistribute AFPL Ghostscript, but only under certain
  14. ; conditions described in the License. Among other things, the License
  15. ; requires that the copyright notice and this notice be preserved on all
  16. ; copies.
  17. ; $Id: iutilasm.asm,v 1.2 2000/09/19 19:00:46 lpd Exp $
  18. ; iutilasm.asm
  19. ; Assembly code for Ghostscript interpreter on MS-DOS systems
  ; ------------------------------------------------------------------
  ; Assembler setup and the two small macros used throughout the file.
  ; ------------------------------------------------------------------
  ifdef FOR80386
          .286c
  ; OP32: emit the 32-bit operand-size prefix byte (66h) in front of
  ; the instruction that follows the macro call.
  OP32    macro
          db      66h
          endm
  endif   ; FOR80386

  utilasm_TEXT SEGMENT WORD PUBLIC 'CODE'
          ASSUME  CS:utilasm_TEXT

  ; clear: set a register to zero by xor-ing it with itself.
  clear   macro   dst
          xor     dst,dst
          endm
  ifdef FOR80386
  ; 80386 build: the multiply and divide routines of the Turbo C library
  ; are replaced by the 32-bit versions that follow.

  ; Register numbers accepted by the swap macro.
  regax   equ     0
  regcx   equ     1
  regdx   equ     2
  regbx   equ     3

  ; swap: exchange the two 16-bit halves of a 32-bit register.
  ; The argument is one of the register numbers above.  masm will not
  ; accept a shift instruction with a count of 16, so the instruction
  ; is emitted as raw bytes behind the operand-size prefix.
  swap    macro   rn
          OP32
          db      0c1h,0c0h+rn,16         ; rol <register rn>,16
          endm
  ; LXMUL@ / F_LXMUL@ -- long multiply helper.
  ; In:   (dx,ax) = first operand, (cx,bx) = second operand
  ; Out:  (dx,ax) = product
  ; Far return, nothing is taken from the stack.
  ; Each operand is packed into one 32-bit register, the two are
  ; multiplied with a single 32-bit imul, and the product is unpacked.
  ; cx is overwritten, as are the upper halves of eax, ecx and edx.
          PUBLIC  LXMUL@
          PUBLIC  F_LXMUL@
  F_LXMUL@ proc far
  LXMUL@  proc    far
          swap    regcx                   ; old cx -> upper half of ecx
          mov     cx,bx                   ; ecx = (cx,bx)
          swap    regdx                   ; old dx -> upper half of edx
          mov     dx,ax                   ; edx = (dx,ax)
          OP32
          db      0fh,0afh,0d1h           ; imul dx,cx (32-bit via prefix)
          OP32
          mov     ax,dx                   ; low half of product -> ax
          swap    regdx                   ; high half of product -> dx
          ret
  LXMUL@  endp
  F_LXMUL@ endp
  ; Divide two stack operands, leave the result in (dx,ax).
  ;
  ; setup32: common entry sequence of the divide/remainder routines.
  ;   bx  <- sp on entry, so the operands are addressed as ss:[bx+4]
  ;          (dividend) and ss:[bx+8] (divisor)
  ;   eax <- dividend
  ; When DEBUG is defined a bp frame is built as well.
  setup32 macro
          mov     bx,sp
  ifdef DEBUG
          push    bp
          mov     bp,sp
  endif
          OP32
          mov     ax,ss:[bx+4]            ; dividend
          endm

  ; ret32: matching exit sequence; returns and removes nbytes of
  ; arguments.  When DEBUG is defined the bp frame is taken down first.
  ret32   macro   nbytes
  ifdef DEBUG
          mov     sp,bp
          pop     bp
  endif
          ret     nbytes
          endm
  ; 32-bit divide and remainder helpers (80386 build).
  ; In:   far call, dividend and divisor on the stack (see setup32)
  ; Out:  (dx,ax) = quotient (LDIV@, LUDIV@) or remainder (LMOD@, LUMOD@)
  ; The 8 bytes of arguments are removed on return.
          PUBLIC  LDIV@, LUDIV@, LMOD@, LUMOD@
          PUBLIC  F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@

  ; sdiv32: signed divide of eax by the divisor on the stack.
  sdiv32  macro
          OP32
          cwd                             ; with prefix: sign-extend eax into edx
          OP32
          idiv    word ptr ss:[bx+8]      ; divisor
          endm

  ; udiv32: unsigned divide of eax by the divisor on the stack.
  udiv32  macro
          OP32
          xor     dx,dx                   ; with prefix: edx = 0
          OP32
          div     word ptr ss:[bx+8]      ; divisor
          endm

  ; quo32: hand back the quotient (eax) in (dx,ax).
  quo32   macro
          OP32
          mov     dx,ax
          swap    regdx
          endm

  ; rem32: hand back the remainder (edx) in (dx,ax).
  rem32   macro
          OP32
          mov     ax,dx
          swap    regdx
          endm

  ; Signed quotient.
  F_LDIV@ proc far
  LDIV@   proc    far
          setup32
          sdiv32
          quo32
          ret32   8
  LDIV@   endp
  F_LDIV@ endp

  ; Unsigned quotient.
  F_LUDIV@ proc far
  LUDIV@  proc    far
          setup32
          udiv32
          quo32
          ret32   8
  LUDIV@  endp
  F_LUDIV@ endp

  ; Signed remainder.
  F_LMOD@ proc far
  LMOD@   proc    far
          setup32
          sdiv32
          rem32
          ret32   8
  LMOD@   endp
  F_LMOD@ endp

  ; Unsigned remainder.
  F_LUMOD@ proc far
  LUMOD@  proc    far
          setup32
          udiv32
          rem32
          ret32   8
  LUMOD@  endp
  F_LUMOD@ endp
  else    ; !FOR80386
  ; Replace the divide routines in the Turbo C library,
  ; which do the division one bit at a time (!).
  ;
  ; Entry points:
  ;   LDIV@  / F_LDIV@    signed quotient
  ;   LMOD@  / F_LMOD@    signed remainder (negated when the numerator
  ;                       is negative)
  ;   LUDIV@ / F_LUDIV@   unsigned quotient
  ;   LUMOD@ / F_LUMOD@   unsigned remainder
  ; In:   far call; numerator (long) and denominator (long) on the
  ;       stack, numerator at the lower address
  ; Out:  (dx,ax) = result; the 8 argument bytes are removed (ret 8)
  ; Uses: ax, bx, cx, dx, flags.  The signed entry points replace their
  ;       stack operands by the absolute values.
          PUBLIC  LDIV@, LMOD@, LUDIV@, LUMOD@
          PUBLIC  F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@

  ; Negate a long on the stack.
  negbp   macro   ofs
          neg     word ptr [bp+ofs+2]     ; high part
          neg     word ptr [bp+ofs]       ; low part
          sbb     word ptr [bp+ofs+2],0
          endm

  ; Negate a long in (dx,ax).
  negr    macro
          neg     dx
          neg     ax
          sbb     dx,0
          endm

  ; Divide two unsigned longs on the stack.
  ; Leave either the quotient or the remainder in (dx,ax).
  ; Operand offsets assume that bp (and only bp) has been pushed.
  nlo     equ     6
  nhi     equ     8
  dlo     equ     10
  dhi     equ     12

  ; We use an offset in bx to distinguish div from mod,
  ; and to indicate whether the result should be negated.
  ; The value indexes the jump tables xx1, xx1a, xx3 and xxz.
  odiv    equ     0
  omod    equ     2
  odivneg equ     4
  omodneg equ     6

  F_LMOD@ proc far
  LMOD@   proc    far
          push    bp
          mov     bp,sp
          mov     bx,omod
          ; Take abs of denominator
          cmp     byte ptr [bp+dhi+1],bh  ; bh = 0
          jge     modpd
          negbp   dlo
  modpd:  ; Negate mod if numerator < 0
          cmp     byte ptr [bp+nhi+1],bh  ; bh = 0
          jge     udiv
          mov     bx,omodneg
  negnum: negbp   nlo                     ; also reached from LDIV@
          jmp     udiv
  LMOD@   endp
  F_LMOD@ endp

  F_LUMOD@ proc far
  LUMOD@  proc    far
          mov     bx,omod
          jmp     udpush
  LUMOD@  endp
  F_LUMOD@ endp

  F_LDIV@ proc far
  LDIV@   proc    far
          push    bp
          mov     bp,sp
          mov     bx,odiv
          ; Negate quo if num^den < 0
          mov     ax,[bp+nhi]
          xor     ax,[bp+dhi]
          jge     divabs
          mov     bx,odivneg
  divabs: ; Take abs of denominator
          cmp     byte ptr [bp+dhi+1],bh  ; bh = 0
          jge     divpd
          negbp   dlo
  divpd:  ; Take abs of numerator
          cmp     byte ptr [bp+nhi+1],bh  ; bh = 0
          jge     udiv
          jmp     negnum
  LDIV@   endp
  F_LDIV@ endp

  F_LUDIV@ proc far
  LUDIV@  proc    far
          mov     bx,odiv
  udpush: push    bp
          mov     bp,sp
  udiv:   push    bx                      ; odiv, omod, odivneg, omodneg
          mov     ax,[bp+nlo]
          mov     dx,[bp+nhi]
          mov     bx,[bp+dlo]
          mov     cx,[bp+dhi]
          ; Now we are dividing dx:ax by cx:bx.
          ; Check to see whether this is really a 32/16 division.
          or      cx,cx
          jnz     div2
          ; 32/16, check for 16- vs. 32-bit quotient
          cmp     dx,bx
          jae     div1
          ; 32/16 with 16-bit quotient, just do it.
          div     bx                      ; ax = quo, dx = rem
          pop     bx
          pop     bp
          jmp     cs:xx1[bx]
          even
  xx1     dw      offset divx1
          dw      offset modx1
          dw      offset divx1neg
          dw      offset modx1neg
  modx1:  mov     ax,dx
  divx1:  xor     dx,dx
          ret     8
  modx1neg: mov   ax,dx
  divx1neg: xor   dx,dx
  rneg:   negr
          ret     8

          ; 32/16 with 32-bit quotient, do in 2 parts.
  div1:   mov     cx,ax                   ; save lo num
          mov     ax,dx
          xor     dx,dx
          div     bx                      ; ax = hi quo
          xchg    cx,ax                   ; save hi quo, get lo num
          div     bx                      ; ax = lo quo, dx = rem
          pop     bx
          pop     bp
          jmp     cs:xx1a[bx]
          even
  xx1a    dw      offset divx1a
          dw      offset modx1
          dw      offset divx1aneg
          dw      offset modx1neg
  divx1a: mov     dx,cx                   ; hi quo
          ret     8
  divx1aneg: mov  dx,cx
          jmp     rneg

          ; This is really a 32/32 bit division.
          ; (Note that the quotient cannot exceed 16 bits.)
          ; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2
          ; (first edition).
          ; Start by normalizing the numerator and denominator.
  div2:   or      ch,ch
          jz      div21                   ; ch == 0, but cl != 0
          ; Do 8 steps all at once.
          mov     bl,bh
          mov     bh,cl
          mov     cl,ch
          xor     ch,ch
          mov     al,ah
          mov     ah,dl
          mov     dl,dh
          xor     dh,dh
          rol     bx,1                    ; faster than jmp
  div2a:  rcr     bx,1                    ; finish previous shift
  div21:  shr     dx,1
          rcr     ax,1
          shr     cx,1
          jnz     div2a
          rcr     bx,1
          ; Now we can do a 32/16 divide.
  div2x:  div     bx                      ; ax = quo, dx = rem
          ; Multiply by the denominator, and correct the result.
          mov     cx,ax                   ; save trial quotient
          ; The product cx * denominator must be checked for overflow
          ; past 32 bits: a trial quotient that is too large can push
          ; it to 2^32 or more (e.g. 0FFFFFFFFh / 55557FFFh), and a
          ; wrapped product would make the correction test below
          ; accept the wrong quotient.  On overflow the trial quotient
          ; is certainly too large, so step it down and multiply again.
  div2m:  mov     ax,cx
          mul     word ptr [bp+dhi]
          jc      div2o                   ; cx * dhi needs more than 16 bits
          mov     bx,ax                   ; save lo part of hi product
          mov     ax,cx
          mul     word ptr [bp+dlo]
          add     dx,bx
          jnc     div2p                   ; product fits in 32 bits
  div2o:  dec     cx
          jmp     div2m
  div2p:
          ; Now cx = trial quotient, (dx,ax) = cx * denominator.
          not     dx
          neg     ax
          cmc
          adc     dx,0                    ; double-precision neg
          jc      divz                    ; zero quotient
                                          ; requires special handling
          add     ax,[bp+nlo]
          adc     dx,[bp+nhi]
          jc      divx
          ; Quotient is too large, adjust it.
  div3:   dec     cx
          add     ax,[bp+dlo]
          adc     dx,[bp+dhi]
          jnc     div3
          ; All done. (dx,ax) = remainder, cx = lo quotient.
  divx:   pop     bx
          pop     bp
          jmp     cs:xx3[bx]
          even
  xx3     dw      offset divx3
          dw      offset modx3
          dw      offset divx3neg
          dw      offset modx3neg
  divx3:  mov     ax,cx
          xor     dx,dx
  modx3:  ret     8
  divx3neg: mov   ax,cx
          xor     dx,dx
  modx3neg: jmp   rneg

          ; Handle zero quotient specially.
          ; Here (dx,ax) = 0, bp is still on the stack, and the
          ; remainder is the numerator itself.
  divz:   pop     bx
          jmp     cs:xxz[bx]
          even
  xxz     dw      offset divxz
          dw      offset modxz
          dw      offset divxz
          dw      offset modxzneg
  divxz:  pop     bp
          ret     8
  modxzneg: negbp nlo                     ; undo the earlier abs
  modxz:  mov     ax,[bp+nlo]
          mov     dx,[bp+nhi]
          pop     bp
          ret     8
  LUDIV@  endp
  F_LUDIV@ endp
  endif   ; FOR80386
  ifdef NOFPU
  ; See gsmisc.c for the C version of this code.
  ; /*
  ; * Floating multiply with fixed result, for avoiding floating point in
  ; * common coordinate transformations. Assumes IEEE representation,
  ; * 16-bit short, 32-bit long. Optimized for the case where the first
  ; * operand has no more than 16 mantissa bits, e.g., where it is a user space
  ; * coordinate (which are often integers).
  ; *
  ; * The assembly language version of this code is actually faster than
  ; * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking
  ; * a trap on every FPU operation). If there is no FPU, the assembly
  ; * language version of this code is over 10 times as fast as the
  ; * emulated FPU.
  ; */
  ;
  ; In:   far call; a at [bp+6], b at [bp+10], each the 32-bit image
  ;       of a float passed as a long
  ; Out:  (dx,ax) = product as a fixed
  ; Plain far ret: the arguments are left on the stack.
  ; si, di and bp are saved and restored; ax, bx, cx, dx are used.
  ;
  ; fixed
  ; fmul2fixed_(long /*float*/ a, long /*float*/ b)
  ; {
          PUBLIC  _fmul2fixed_
  _fmul2fixed_ proc far
          push    bp
          mov     bp,sp
  a       equ     6
  alo     equ     a
  ahi     equ     a+2
  b       equ     10
  blo     equ     b
  bhi     equ     b+2
          push    si                      ; will hold ma
          push    di                      ; will hold mb
  ; int e = 260 + _fixed_shift - ((
  ; (((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80)
  ; ) >> 7);
          mov     dx,[bp+ahi]
  ; dfmul2fixed enters here, with dx = [bp+ahi] and si, di already pushed
  fmf:    mov     cx,260+12
          mov     ax,[bp+bhi]
          and     ax,7f80h
          and     dx,7f80h
          add     ax,dx
          xchg    ah,al                   ; ror ax,7 without using cl
          rol     ax,1
          sub     cx,ax
          push    cx                      ; e
  ; ulong ma = (ushort)(a >> 8) | 0x8000;
  ; ulong mb = (ushort)(b >> 8) | 0x8000;
          mov     si,[bp+alo+1]           ; unaligned
          clear   ax
          mov     di,[bp+blo+1]           ; unaligned
          or      si,8000h
          or      di,8000h
  ; ulong p1 = ma * (b & 0xff);
          mov     al,[bp+blo]
          mul     si
  ; (Do this later:)
  ; ulong p = ma * mb;
  ; if ( (byte)a ) /* >16 mantissa bits */
          cmp     byte ptr [bp+alo],0
          je      mshort
  ; { ulong p2 = (a & 0xff) * mb;
  ; p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8;
          mov     cx,dx
          mov     bx,ax                   ; (cx,bx) = p1
          clear   ax
          mov     al,[bp+alo]
          clear   dx
          mov     dl,[bp+blo]
          mul     dx                      ; ax = (byte)a * (byte)b, dx = 0
          mov     dl,ah                   ; dx = that product >> 8
          ; Add the byte product term to p1.  (This used to add cx,
          ; the high word of p1, and dropped the term just computed.)
          add     bx,dx
          adc     cx,0
          clear   ax
          mov     al,[bp+alo]
          mul     di                      ; (dx,ax) = p2
          add     ax,bx
          adc     dx,cx                   ; (dx,ax) = byte term + p1 + p2
  ; }
  mshort:
  ; else
  ; p += p1 >> 8;
          mov     bl,ah                   ; set (cx,bx) = (dx,ax) >> 8
          mov     bh,dl
          clear   cx
          mov     cl,dh
          mov     ax,si
          mul     di                      ; (dx,ax) = ma * mb
          add     ax,bx
          adc     dx,cx
  ; if ( (uint)e < 32 ) /* e = -1 is possible */
          pop     cx                      ; e
          cmp     cx,16
          jb      shr1                    ; 0 <= e < 16
  ; else if ( e >= 32 ) /* also detects a=0 or b=0 */
          cmp     cx,0
          jl      eneg                    ; e < 0
          sub     cx,16
          cmp     cx,16
          jge     shr0                    ; e >= 32
          mov     ax,dx                   ; 16 <= e < 32: shift by words,
          clear   dx                      ; then by e-16 bits
          shr     ax,cl
          jmp     ex
  ; return fixed_0;
  shr0:   clear   ax
          clear   dx
          jmp     ex
  ; else
  ; p <<= -e;
          even
  eneg:   neg     cx
          shl     dx,cl
          mov     bx,ax
          shl     ax,cl
          rol     bx,cl
          xor     bx,ax                   ; bx = bits shifted out of ax
          add     dx,bx
          jmp     ex
  ; p >>= e;
          even
  shr1:   shr     ax,cl
          mov     bx,dx
          shr     dx,cl
          ror     bx,cl
          xor     bx,dx                   ; bx = bits shifted out of dx
          add     ax,bx
  ex:
  ; return ((a ^ b) < 0 ? -p : p);
          mov     cx,[bp+ahi]
          xor     cx,[bp+bhi]
          jge     pos
          neg     dx
          neg     ax
          sbb     dx,0
  pos:
  ; }
  retu:   pop     di
          pop     si
          mov     sp,bp
          pop     bp
          ret
  _fmul2fixed_ ENDP
  ; The same routine with the first argument a double rather than a float.
  ; The argument is split into two pieces to reduce data movement.
  ;
  ; Stack layout after the bp push: xalo (long) at [bp+6], b (long) at
  ; [bp+10], xahi (long) at [bp+14].  The top bits of the double are
  ; repacked, the result is stored over xalo (that is, at alo/ahi),
  ; and control passes to fmf inside _fmul2fixed_ with dx = [bp+ahi]
  ; and si, di pushed.
          PUBLIC  _dfmul2fixed_

  ; dfstore: last step of the 3-bit left shift, then merge the kept
  ; top bits (bx), store the repacked value and continue at fmf.
  ; Expects CF = carry out of the preceding shift of ax.
  dfstore macro
          adc     dx,dx
          or      dx,bx
          mov     [bp+alo],ax
          mov     [bp+ahi],dx
          jmp     fmf
          endm

  _dfmul2fixed_ proc far
          push    bp
          mov     bp,sp
  xalo    equ     6
  ;b      equ     10
  xahi    equ     14
          push    si                      ; overlap this below
          push    di                      ; ditto
  ; Shuffle the arguments and then use fmul2fixed.
  ; Squeeze 3 exponent bits out of the top 35 bits of a.
          mov     dx,[bp+xahi+2]
          mov     ax,[bp+xahi]
          mov     cx,[bp+xalo+2]
          mov     bx,dx
          and     bx,0c000h               ; top two bits are kept in place
          and     dx,7ffh                 ; get rid of discarded bits
          add     cx,cx                   ; faster than shl!
          jz      dfshort                 ; detect common case
          ; General case: shift (dx,ax,cx) left by 3 bits in all.
          adc     ax,ax                   ; faster than rcl!
          adc     dx,dx
          add     cx,cx
          adc     ax,ax
          adc     dx,dx
          add     cx,cx
          adc     ax,ax
          dfstore
          even
          ; cx is zero after its first shift, so only (dx,ax) needs
          ; the remaining two shifts.
  dfshort: adc    ax,ax
          adc     dx,dx
          add     ax,ax
          adc     dx,dx
          add     ax,ax
          dfstore
  _dfmul2fixed_ ENDP
  endif   ; NOFPU
  ; Transpose an 8x8 bit matrix. See gsmisc.c for the algorithm in C.
  ;
  ; Far procedure.  Once ds, si and di have been pushed, the parameters
  ; are found at these offsets from sp:
  ;   byte *inp = 10, int line_size = 14, byte *outp = 16, int dist = 20
  ; Eight bytes are read from inp at a stride of line_size and eight
  ; bytes are written to outp at a stride of dist.
  ; ds, si and di are restored before the plain far ret.
  mf_inp  equ     10
  mf_line equ     14
  mf_outp equ     16
  mf_dist equ     20

  ; ldrow: step si to the next input row and load one byte from it.
  ldrow   macro   r8
          add     si,di
          mov     r8,[si]
          endm

  ; strow: step si to the next output row and store one byte to it.
  strow   macro   r8
          add     si,di
          mov     [si],r8
          endm

  ; bitswp: transposition step, see C code for explanation.
  ; si is used as scratch.
  bitswp  macro   ra,rb,cnt,msk
          mov     si,ra
          shr     si,cnt
          xor     si,rb
          and     si,msk
          xor     rb,si
          shl     si,cnt
          xor     ra,si
          endm

          PUBLIC  _memflip8x8
  _memflip8x8 proc far
          push    ds
          push    si
          push    di
          mov     si,sp
          mov     di,ss:[si+mf_line]      ; line_size
          lds     si,ss:[si+mf_inp]       ; inp
  ; We assign variables to registers as follows:
  ; ax = AE, bx = BF, cx (or di) = CG, dx = DH.
  ; Load the input data. Initially we assign
  ; ax = AB, bx = EF, cx (or di) = CD, dx = GH.
          mov     ah,[si]
          ldrow   al
          ldrow   ch
          ldrow   cl
          ldrow   bh
          ldrow   bl
          ldrow   dh
          ldrow   dl
  ; Do 4x4 transpositions
          mov     di,cx                   ; we need cl for the shift count
          mov     cl,4
          bitswp  bx,ax,cl,0f0fh
          bitswp  dx,di,cl,0f0fh
  ; Swap B/E, D/G
          xchg    al,bh
          mov     cx,di
          xchg    cl,dh
  ; Do 2x2 transpositions
          mov     di,cx                   ; need cl again
          mov     cl,2
          bitswp  di,ax,cl,3333h
          bitswp  dx,bx,cl,3333h
          mov     cx,di                   ; done shifting >1
  ; Do 1x1 transpositions
          bitswp  bx,ax,1,5555h
          bitswp  dx,cx,1,5555h
  ; Store result
          mov     si,sp
          mov     di,ss:[si+mf_dist]      ; dist
          lds     si,ss:[si+mf_outp]      ; outp
          mov     [si],ah
          strow   bh
          strow   ch
          strow   dh
          strow   al
          strow   bl
          strow   cl
          strow   dl
  ; All done
          pop     di
          pop     si
          pop     ds
          ret
  _memflip8x8 ENDP

  utilasm_TEXT ENDS
          END