Difference between revisions of "Z80 Routines:Math:Division"

From WikiTI
Jump to: navigation, search
(Fixed 24/8 and 32/8.)
(Added an optimized 32/16)
 
(2 intermediate revisions by one other user not shown)
Line 154: Line 154:
 
djnz Div32By16_Loop
 
djnz Div32By16_Loop
 
ret
 
ret
 +
</nowiki>
 +
 +
== 32/16 division, Speed Optimized ==
 +
 +
At the cost of 30 bytes (exactly double the size), we can save 600cc on average, a 19.5% speed up. However, the inputs are slightly different here!
 +
 +
<nowiki>
 +
div32_16:
 +
;DEIX/BC -> HLIX remainder DE
 +
;170+4*div32_16_sub8
 +
;min: 2182cc
 +
;max: 2790cc
 +
;avg: 2462cc
 +
;60 bytes
 +
 +
; Negate BC to allow add instead of sbc
 +
  xor a      ; 4
 +
; Need to set HL to 0 anyways, so save 2cc and a byte
 +
  ld h,a    ; 4
 +
  ld l,a    ; 4
 +
  sub c      ; 4
 +
  ld c,a    ; 4
 +
  sbc a,a    ; 4
 +
  sub b      ; 4
 +
  ld b,a    ; 4
 +
 +
 +
  ld a,d              ; 4
 +
  call div32_16_sub8  ; 17
 +
  rla                ; 4
 +
  ld d,a              ; 4
 +
 +
  ld a,e              ; 4
 +
  call div32_16_sub8  ; 17
 +
  rla                ; 4
 +
  ld e,a              ; 4
 +
 +
  ld a,ixh            ; 8
 +
  call div32_16_sub8  ; 17
 +
  rla                ; 4
 +
  ld ixh,a            ; 8
 +
 +
  ld a,ixl            ; 8
 +
  call div32_16_sub8  ; 17
 +
  rla                ; 4
 +
  ld ixl,a            ; 8
 +
 +
  ex de,hl  ; 4
 +
  ret        ; 10
 +
 +
div32_16_sub8:
 +
;119+8*div32_16_sub
 +
;min: 503cc
 +
;max: 655cc
 +
;avg: 573cc
 +
  call +_
 +
_:
 +
;17+2(17+2(div32_16_sub)))
 +
  call +_
 +
_:
 +
;17+2(div32_16_sub)
 +
  call div32_16_sub
 +
div32_16_sub:
 +
;48+{8,0+{0,19}}
 +
;min: 48cc
 +
;max: 67cc
 +
;avg: 56.75cc
 +
  rla        ; 4
 +
  adc hl,hl  ; 15
 +
  jr c,+_    ;12/7
 +
  add hl,bc  ; 11
 +
  ret c      ;11/5
 +
  sbc hl,bc  ; 15
 +
  ret        ; 10
 +
_:
 +
  add hl,bc  ; 11
 +
  scf        ; 4
 +
  ret        ; 10
 
  </nowiki>
 
  </nowiki>
  
Line 168: Line 246:
 
   add hl, hl
 
   add hl, hl
 
   rla
 
   rla
 +
  jr c, $+5
 
   cp c
 
   cp c
 
   jr c, $+4
 
   jr c, $+4

Latest revision as of 15:18, 30 September 2019


Introduction

All these routines use the restoring division algorithm, adapted to the z80 architecture to maximize speed. They can easily be unrolled to gain some speed.

8/8 division

The following routine divides d by e and places the quotient in d and the remainder in a

div_d_e:
   xor	a
   ld	b, 8

_loop:
   sla	d
   rla
   cp	e
   jr	c, $+4
   sub	e
   inc	d
   
   djnz	_loop
   
   ret
 

16/8 division

The following routine divides hl by c and places the quotient in hl and the remainder in a

div_hl_c:
   xor	a
   ld	b, 16

_loop:
   add	hl, hl
   rla
   jr	c, $+5
   cp	c
   jr	c, $+4

   sub	c
   inc	l
   
   djnz	_loop
   
   ret
 

16/16 division

The following routine divides ac by de and places the quotient in ac and the remainder in hl

div_ac_de:
   ld	hl, 0
   ld	b, 16

_loop:
   sll	c
   rla
   adc	hl, hl
   sbc	hl, de
   jr	nc, $+4
   add	hl, de
   dec	c
   
   djnz	_loop
   
   ret
 

24/8 division

The following routine divides ehl by d and places the quotient in ehl and the remainder in a

div_ehl_d:
   xor	a
   ld	b, 24

_loop:
   add	hl, hl
   rl	e
   rla
   jr	c, $+5
   cp	d
   jr	c, $+4

   sub	d
   inc	l
   
   djnz	_loop
   
   ret
 

32/8 division

The following routine divides dehl by c and places the quotient in dehl and the remainder in a

div_dehl_c:
   xor	a
   ld	b, 32

_loop:
   add	hl, hl
   rl	e
   rl	d
   rla
   jr	c, $+5
   cp	c
   jr	c, $+4

   sub	c
   inc	l
   
   djnz	_loop
   
   ret
 

32/16 division

The following routine divides acix by de and places the quotient in acix and the remainder in hl

Div32By16:
; IN:	ACIX=dividend, DE=divisor
; OUT:	ACIX=quotient, DE=divisor, HL=remainder, B=0
	ld	hl,0
	ld	b,32
Div32By16_Loop:
	add	ix,ix
	rl	c
	rla
	adc	hl,hl
	jr	c,Div32By16_Overflow
	sbc	hl,de
	jr	nc,Div32By16_SetBit
	add	hl,de
	djnz	Div32By16_Loop
	ret
Div32By16_Overflow:
	or	a
	sbc	hl,de
Div32By16_SetBit:
	.db	$DD,$2C		; inc ixl, change to inc ix to avoid undocumented
	djnz	Div32By16_Loop
	ret
 

32/16 division, Speed Optimized

At the cost of 30 bytes (exactly double the size), we can save 600cc on average, a 19.5% speed up. However, the inputs are slightly different here!

div32_16:
;DEIX/BC -> HLIX remainder DE
;170+4*div32_16_sub8
;min: 2182cc
;max: 2790cc
;avg: 2462cc
;60 bytes

; Negate BC to allow add instead of sbc
  xor a      ; 4
; Need to set HL to 0 anyways, so save 2cc and a byte
  ld h,a     ; 4
  ld l,a     ; 4
  sub c      ; 4
  ld c,a     ; 4
  sbc a,a    ; 4
  sub b      ; 4
  ld b,a     ; 4


  ld a,d              ; 4
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld d,a              ; 4

  ld a,e              ; 4
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld e,a              ; 4

  ld a,ixh            ; 8
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld ixh,a            ; 8

  ld a,ixl            ; 8
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld ixl,a            ; 8

  ex de,hl   ; 4
  ret        ; 10

div32_16_sub8:
;119+8*div32_16_sub
;min: 503cc
;max: 655cc
;avg: 573cc
  call +_
_:
;17+2(17+2(div32_16_sub)))
  call +_
_:
;17+2(div32_16_sub)
  call div32_16_sub
div32_16_sub:
;48+{8,0+{0,19}}
;min: 48cc
;max: 67cc
;avg: 56.75cc
  rla        ; 4
  adc hl,hl  ; 15
  jr c,+_    ;12/7
  add hl,bc  ; 11
  ret c      ;11/5
  sbc hl,bc  ; 15
  ret        ; 10
_:
  add hl,bc  ; 11
  scf        ; 4
  ret        ; 10
 

Rounded 16/8 division

The following routine divides hl by c and places the rounded quotient in hl and twice the prerounded remainder in a.

RoundHL_Div_C:
   xor	a
   ld	b, 16

_loop:
   add	hl, hl
   rla
   jr	c, $+5
   cp	c
   jr	c, $+4
   sub	c
   inc	l   
   djnz	_loop
;This part is the rounding
   add a,a
   cp	c
   ret	c
   inc hl
   ret