Difference between revisions of "Z80 Routines:Math:Square root"

From WikiTI
Jump to: navigation, search
(Reorganized and added a few routines including 32-bit integer square root)
 
(5 intermediate revisions by 3 users not shown)
Line 22: Line 22:
 
   ret </nowiki>
 
   ret </nowiki>
  
 +
== Small and Fast ==
  
==Speed Optimization==
+
This code was found on z80 bits and has the advantage of being both faster than all three versions above and smaller than the last two (it runs in under 720 T-states (under 640 if fully unrolled) and takes a mere 29 bytes). On the other hand it takes a somewhat unconventional input... It computes the square root of the 16bit number formed by la and places the result in d.
This version uses the high school method of finding a square root and so it is much faster, running at about ~850 tstates.  Unfortunately it requires 180 bytes and is quite obfuscated.
+
<nowiki>;-------------------------------
+
;Square Root
+
;Inputs:
+
;DE = number to be square rooted
+
;Outputs:
+
;A  = square root
+
 
+
sqrt:
+
    xor a
+
    ld h,a
+
    ld l,a
+
    ld b,a
+
    rl d
+
    rl l
+
    rl d
+
    rl l
+
    ld c,1
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl d
+
    rl l
+
    rl d
+
    rl l
+
    ld c,a
+
    scf
+
    rl c
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl d
+
    rl l
+
    rl d
+
    rl l
+
    ld c,a
+
    scf
+
    rl c
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl d
+
    rl l
+
    rl d
+
    rl l
+
    ld c,a
+
    scf
+
    rl c
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl e
+
    adc hl,hl
+
    rl e
+
    adc hl,hl
+
    ld c,a
+
    scf
+
    rl c
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl e
+
    adc hl,hl
+
    rl e
+
    adc hl,hl
+
    ld c,a
+
    scf
+
    rl c
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl e
+
    adc hl,hl
+
    rl e
+
    adc hl,hl
+
    ld c,a
+
    scf
+
    rl c
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    add a,a
+
    rl e
+
    adc hl,hl
+
    rl e
+
    adc hl,hl
+
    ld c,a
+
    scf
+
    rl c
+
    rl b
+
    sbc hl,bc
+
    jp c,$+3+2+1
+
    sbc hl,bc
+
    inc a
+
    add hl,bc
+
    ret</nowiki>
+
 
+
 
+
==Balanced Optimization==
+
This version is a balance between speed and size. It also uses the high school method and runs under 1200 tstates. It only costs 41 bytes.
+
<nowiki>;-------------------------------
+
;Square Root
+
;Inputs:
+
;DE = number to be square rooted
+
;Outputs:
+
;A  = square root
+
 
+
Sqrt:
+
    ld hl,0
+
    ld c,l
+
    ld b,h
+
    ld a,8
+
Sqrtloop:
+
    sla e
+
    rl d
+
    adc hl,hl
+
    sla e
+
    rl d
+
    adc hl,hl
+
    scf              ;Can be optimised
+
    rl c              ;with SL1 instruction
+
    rl b
+
    sbc hl,bc
+
    jr nc,Sqrtaddbit
+
    add hl,bc
+
    dec c
+
Sqrtaddbit:
+
    inc c
+
    res 0,c
+
    dec a
+
    jr nz,Sqrtloop
+
    ld a,c
+
    rr b
+
    rra
+
    ret</nowiki>
+
 
+
== Presumably the best ==
+
 
+
This code was found on z80 bits and has the advantage of being both faster than all three versions above and smaller than the last two (it runs in under 720 T-states (under 640 if fully unrolled) and takes a mere 29 bytes). On the other hand it takes a somewhat unconventionnal input... It computes the square root of the 16bit number formed by la and places the result in d.
+
 
  <nowiki>
 
  <nowiki>
 
sqrt_la:
 
sqrt_la:
Line 191: Line 33:
 
 
 
; need to clear the carry beforehand
 
; need to clear the carry beforehand
xor a
+
or a
 
 
 
_loop:
 
_loop:
Line 213: Line 55:
 
  </nowiki>
 
  </nowiki>
  
 +
==Speed Optimization==
 +
This 92 byte version is an optimized unrolled loop taking at most 379 t-states. Each iteration is optimized for its location in the algorithm.
  
 +
<nowiki>
 +
; fast 16 bit isqrt by John Metcalf
 +
; 92 bytes, 344-379 cycles (average 362)
 +
; v2 - saved 3 cycles with a tweak suggested by Russ McNulty
 +
 +
; call with hl = number to square root
 +
; returns    a = square root
 +
; corrupts hl, de
 +
 +
; ----------
 +
 +
  ld a,h        ; 4
 +
  ld de,0B0C0h  ; 10
 +
  add a,e      ; 4
 +
  jr c,sq7      ; 12 / 7
 +
  ld a,h        ; 4
 +
  ld d,0F0h    ; 7
 +
sq7:
 +
 +
; ----------
 +
 +
  add a,d      ; 4
 +
  jr nc,sq6    ; 12 / 7
 +
  res 5,d      ; 8
 +
  db 254        ; 7
 +
sq6:
 +
  sub d        ; 4
 +
  sra d        ; 8
 +
 +
; ----------
 +
 +
  set 2,d      ; 8
 +
  add a,d      ; 4
 +
  jr nc,sq5    ; 12 / 7
 +
  res 3,d      ; 8
 +
  db 254        ; 7
 +
sq5:
 +
  sub d        ; 4
 +
  sra d        ; 8
 +
 +
; ----------
 +
 +
  inc d        ; 4
 +
  add a,d      ; 4
 +
  jr nc,sq4    ; 12 / 7
 +
  res 1,d      ; 8
 +
  db 254        ; 7
 +
sq4:
 +
  sub d        ; 4
 +
  sra d        ; 8
 +
  ld h,a        ; 4
 +
 +
; ----------
 +
 +
  add hl,de    ; 11
 +
  jr nc,sq3    ; 12 / 7
 +
  ld e,040h    ; 7
 +
  db 210        ; 10
 +
sq3:
 +
  sbc hl,de    ; 15
 +
  sra d        ; 8
 +
  ld a,e        ; 4
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  or 010h      ; 7
 +
  ld e,a        ; 4
 +
  add hl,de    ; 11
 +
  jr nc,sq2    ; 12 / 7
 +
  and 0DFh      ; 7
 +
  db 218        ; 10
 +
sq2:
 +
  sbc hl,de    ; 15
 +
  sra d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  or 04h        ; 7
 +
  ld e,a        ; 4
 +
  add hl,de    ; 11
 +
  jr nc,sq1    ; 12 / 7
 +
  and 0F7h      ; 7
 +
  db 218        ; 10
 +
sq1:
 +
  sbc hl,de    ; 15
 +
  sra d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  inc a        ; 4
 +
  ld e,a        ; 4
 +
  add hl,de    ; 11
 +
  jr nc,sq0    ; 12 / 7
 +
  and 0FDh      ; 7
 +
sq0:
 +
  sra d        ; 8
 +
  rra          ; 4
 +
  cpl          ; 4
 +
</nowiki>
 +
 +
==Speed Optimization 2==
 +
This version uses a slightly different approach to the same algorithm as the one above, but is smaller and faster (this one includes an RET at the end, contributing 10cc and 1 byte):
 +
 +
<nowiki>
 +
; fast 16-bit isqrt by Zeda Thomas
 +
;Feel free to use for whatever :)
 +
 +
sqrtHL:
 +
;Input: HL
 +
;Output: A is the integer square root of HL
 +
;Destroys: HL,DE (D is actually 0)
 +
;min: 343cc
 +
;max: 380cc
 +
;avg: 361.5cc
 +
;88 bytes
 +
  ld de,05040h  ; 10
 +
  ld a,h        ; 4
 +
  sub e        ; 4
 +
  jr nc,sq7    ;\
 +
  add a,e      ; | branch 1: 12cc
 +
  ld d,16      ; | branch 2: 18cc
 +
sq7:            ;/
 +
 +
; ----------
 +
 +
  cp d          ; 4
 +
  jr c,sq6      ;\
 +
  sub d        ; | branch 1: 12cc
 +
  set 5,d      ; | branch 2: 19cc
 +
sq6:            ;/
 +
 +
; ----------
 +
  res 4,d      ; 8
 +
  srl d        ; 8
 +
  set 2,d      ; 8
 +
  cp d          ; 4
 +
  jr c,sq5      ;\
 +
  sub d        ; | branch 1: 12cc
 +
  set 3,d      ; | branch 2: 19cc
 +
sq5:            ;/
 +
  srl d        ; 8
 +
 +
; ----------
 +
 +
  inc a        ; 4
 +
  sub d        ; 4
 +
  jr nc,sq4    ;\
 +
  dec d        ; | branch 1: 12cc
 +
  add a,d      ; | branch 2: 19cc
 +
  dec d        ; | <-- this resets the low bit of D, so `srl d` resets carry.
 +
sq4:            ;/
 +
  srl d        ; 8
 +
  ld h,a        ; 4
 +
 +
; ----------
 +
 +
  ld a,e        ; 4
 +
  sbc hl,de    ; 15
 +
  jr nc,sq3    ;\
 +
  add hl,de    ; | 12cc or 18cc
 +
sq3:            ;/
 +
  ccf          ; 4
 +
  rra          ; 4
 +
  srl d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  ld e,a        ; 4
 +
  sbc hl,de    ; 15
 +
  jr c,sq2      ;\
 +
  or 20h        ; | branch 1: 23cc
 +
  db 254        ; |  <-- start of `cp *` which is 7cc to skip the next byte.
 +
sq2:            ; | branch 2: 21cc
 +
  add hl,de    ;/
 +
 +
  xor 18h      ; 7
 +
  srl d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  ld e,a        ; 4
 +
  sbc hl,de    ; 15
 +
  jr c,sq1      ;\
 +
  or 8          ; | branch 1: 23cc
 +
  db 254        ; |  <-- start of `cp *` which is 7cc to skip the next byte.
 +
sq1:            ; | branch 2: 21cc
 +
  add hl,de    ;/
 +
 +
  xor 6        ; 7
 +
  srl d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  ld e,a        ; 4
 +
  sbc hl,de    ; 15
 +
 +
;This code would restore the square root
 +
;  jr nc,sq0      ;\
 +
;  add hl,de    ; | 12cc or 18cc
 +
; sq0:            ;/
 +
 +
  sbc a,255    ; 7
 +
  srl d        ; 8
 +
  rra          ; 4
 +
  ret          ; 10
 +
</nowiki>
 +
 +
==Speed Optimization, preserve remainder==
 +
This is essentially the same as the algorithm above, but with an optimized final step that preserves the remainder. The remainder is possibly useful for higher precision square roots.
 +
<nowiki>
 +
;written by Zeda
 +
sqrtHL:
 +
;returns A as the sqrt, HL as the remainder, D = 0
 +
;min: 352cc
 +
;max: 391cc
 +
;avg: 371.5cc
 +
 +
 +
  ld de,05040h  ; 10
 +
  ld a,h        ; 4
 +
  sub e        ; 4
 +
  jr nc,sq7    ;\
 +
  add a,e      ; | branch 1: 12cc
 +
  ld d,16      ; | branch 2: 18cc
 +
sq7:            ;/
 +
 +
; ----------
 +
 +
  cp d          ; 4
 +
  jr c,sq6      ;\
 +
  sub d        ; | branch 1: 12cc
 +
  set 5,d      ; | branch 2: 19cc
 +
sq6:            ;/
 +
 +
; ----------
 +
  res 4,d      ; 8
 +
  srl d        ; 8
 +
  set 2,d      ; 8
 +
  cp d          ; 4
 +
  jr c,sq5      ;\
 +
  sub d        ; | branch 1: 12cc
 +
  set 3,d      ; | branch 2: 19cc
 +
sq5:            ;/
 +
  srl d        ; 8
 +
 +
; ----------
 +
 +
  inc a        ; 4
 +
  sub d        ; 4
 +
  jr nc,sq4    ;\
 +
  dec d        ; | branch 1: 12cc
 +
  add a,d      ; | branch 2: 19cc
 +
  dec d        ; | <-- this resets the low bit of D, so `srl d` resets carry.
 +
sq4:            ;/
 +
  srl d        ; 8
 +
  ld h,a        ; 4
 +
 +
; ----------
 +
 +
  ld a,e        ; 4
 +
  sbc hl,de    ; 15
 +
  jr nc,sq3    ;\
 +
  add hl,de    ; | 12cc or 18cc
 +
sq3:            ;/
 +
  ccf          ; 4
 +
  rra          ; 4
 +
  srl d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  ld e,a        ; 4
 +
  sbc hl,de    ; 15
 +
  jr c,sq2      ;\
 +
  or 20h        ; | branch 1: 23cc
 +
  db 254        ; |  <-- start of `cp *` which is 7cc to skip the next byte.
 +
sq2:            ; | branch 2: 21cc
 +
  add hl,de    ;/
 +
 +
  xor 18h      ; 7
 +
  srl d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  ld e,a        ; 4
 +
  sbc hl,de    ; 15
 +
  jr c,sq1      ;\
 +
  or 8          ; | branch 1: 23cc
 +
  db 254        ; |  <-- start of `cp *` which is 7cc to skip the next byte.
 +
sq1:            ; | branch 2: 21cc
 +
  add hl,de    ;/
 +
 +
  xor 6        ; 7
 +
  srl d        ; 8
 +
  rra          ; 4
 +
 +
; ----------
 +
 +
  ld e,a        ; 4
 +
  sbc hl,de    ; 15
 +
  jr nc,+_      ;    \
 +
  add hl,de    ; 15  |
 +
  srl d        ; 8  |
 +
  rra          ; 4  | branch 1: 38cc
 +
  ret          ; 10  | branch 2: 40cc
 +
_:              ;    |
 +
  inc a        ; 4  |
 +
  srl d        ; 8  |
 +
  rra          ; 4  |
 +
  ret          ; 10 /
 +
</nowiki>
 +
 +
==32-bit Square Root (unrolled)==
 +
This routine uses a ''remainder preserving'' 16-bit integer square root routine as a subroutine.
 +
<nowiki>
 +
;NOTE: This uses undocumented instructions `sll` (a.k.a. `slia`) and `ld a,ixh` and `ld a,ixl`
 +
 +
sqrtHLIX:
 +
;Input: HLIX
 +
;Output: DE is the sqrt, AHL is the remainder
 +
;speed: 751+6{0,6}+{0,3+{0,18}}+{0,38}+sqrtHL
 +
;min: 1103
 +
;max: 1237
 +
;avg: 1165.5
 +
;166 bytes
 +
 +
  call sqrtHL  ;expects returns A as sqrt, HL as remainder, D = 0
 +
  add a,a
 +
  ld e,a
 +
  rl d
 +
 +
  ld a,ixh
 +
  sll e \ rl d
 +
  add a,a \ adc hl,hl
 +
  add a,a \ adc hl,hl
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  .db $FE    ;start of `cp *`
 +
_:
 +
  inc e
 +
 +
  sll e \ rl d
 +
  add a,a \ adc hl,hl
 +
  add a,a \ adc hl,hl
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  .db $FE    ;start of `cp *`
 +
_:
 +
  inc e
 +
 +
  sll e \ rl d
 +
  add a,a \ adc hl,hl
 +
  add a,a \ adc hl,hl
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  .db $FE    ;start of `cp *`
 +
_:
 +
  inc e
 +
 +
  sll e \ rl d
 +
  add a,a \ adc hl,hl
 +
  add a,a \ adc hl,hl
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  .db $FE    ;start of `cp *`
 +
_:
 +
  inc e
 +
 +
;Now we have four more iterations
 +
;The first two are no problem
 +
  ld a,ixl
 +
  sll e \ rl d
 +
  add a,a \ adc hl,hl
 +
  add a,a \ adc hl,hl
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  .db $FE    ;start of `cp *`
 +
_:
 +
  inc e
 +
 +
  sll e \ rl d
 +
  add a,a \ adc hl,hl
 +
  add a,a \ adc hl,hl
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  .db $FE    ;start of `cp *`
 +
_:
 +
  inc e
 +
 +
sqrt32_iter15:
 +
;On the next iteration, HL might temporarily overflow by 1 bit
 +
  sll e \ rl d      ;sla e \ rl d \ inc e
 +
  add a,a
 +
  adc hl,hl
 +
  add a,a
 +
  adc hl,hl      ;This might overflow!
 +
  jr c,sqrt32_iter15_br0
 +
;
 +
  sbc hl,de
 +
  jr nc,+_
 +
  add hl,de
 +
  dec e
 +
  jr sqrt32_iter16
 +
sqrt32_iter15_br0:
 +
  or a
 +
  sbc hl,de
 +
_:
 +
  inc e
 +
 +
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
 +
sqrt32_iter16:
 +
  add a,a
 +
  ld b,a        ;either 0x00 or 0x80
 +
  adc hl,hl
 +
  rla
 +
  adc hl,hl
 +
  rla
 +
;AHL - (DE+DE+1)
 +
  sbc hl,de \ sbc a,b
 +
  inc e
 +
  or a
 +
  sbc hl,de \ sbc a,b
 +
  ret p
 +
  add hl,de
 +
  adc a,b
 +
  dec e
 +
  add hl,de
 +
  adc a,b
 +
  ret
 +
</nowiki>
 
==Other Options==
 
==Other Options==
 
A binary search of a square table would yield much better best case scenarios and the worst case scenarios would be similar to the high school method. However this would also require 512 byte table making it significantly larger than the other routines.  Of course the table could also serve as a rapid squaring method.
 
A binary search of a square table would yield much better best case scenarios and the worst case scenarios would be similar to the high school method. However this would also require 512 byte table making it significantly larger than the other routines.  Of course the table could also serve as a rapid squaring method.
Line 220: Line 513:
 
* '''James Montelongo'''
 
* '''James Montelongo'''
 
* '''Milos "baze" Bazelides''' (or possibly one of the contributor of [http://baze.au.com/misc/z80bits.html z80bits])
 
* '''Milos "baze" Bazelides''' (or possibly one of the contributor of [http://baze.au.com/misc/z80bits.html z80bits])
 +
* '''John Metcalf''' (Speed Optimization [https://github.com/impomatic/z80snippets/blob/master/fastisqr.asm z80snippets])
 +
* '''Zeda Thomas''' ([https://github.com/Zeda/Z80-Optimized-Routines/blob/master/math/squareroot Z80 Optimized Routines])

Latest revision as of 16:00, 30 September 2019


Size Optimization

This version is size optimized, it compares every perfect square against HL until a square that is larger is found. Obviously slower, but does get the job done in only 12 bytes.

;-------------------------------
;Square Root
;Inputs:
;HL = number to be square rooted
;Outputs:
;A  = square root

sqrt:
   ld a,$ff
   ld de,1
sqrtloop:
   inc a
   dec e
   dec de
   add hl,de
   jr c,sqrtloop
   ret 

Small and Fast

This code was found on z80 bits and has the advantage of being both faster than all three versions above and smaller than the last two (it runs in under 720 T-states (under 640 if fully unrolled) and takes a mere 29 bytes). On the other hand it takes a somewhat unconventional input... It computes the square root of the 16bit number formed by la and places the result in d.

sqrt_la:
	ld	de, 0040h	; 40h appends "01" to D
	ld	h, d
	
	ld	b, 7
	
	; need to clear the carry beforehand
	or	a
	
_loop:
	sbc	hl, de
	jr	nc, $+3
	add	hl, de
	ccf
	rl	d
	rla
	adc	hl, hl
	rla
	adc	hl, hl
	
	djnz	_loop
	
	sbc	hl, de		; optimised last iteration
	ccf
	rl	d
	
	ret
 

Speed Optimization

This 92 byte version is an optimized unrolled loop taking at most 379 t-states. Each iteration is optimized for its location in the algorithm.

; fast 16 bit isqrt by John Metcalf
; 92 bytes, 344-379 cycles (average 362)
; v2 - saved 3 cycles with a tweak suggested by Russ McNulty

; call with hl = number to square root
; returns    a = square root
; corrupts hl, de

; ----------

  ld a,h        ; 4
  ld de,0B0C0h  ; 10
  add a,e       ; 4
  jr c,sq7      ; 12 / 7
  ld a,h        ; 4
  ld d,0F0h     ; 7
sq7:

; ----------

  add a,d       ; 4
  jr nc,sq6     ; 12 / 7
  res 5,d       ; 8
  db 254        ; 7
sq6:
  sub d         ; 4
  sra d         ; 8

; ----------

  set 2,d       ; 8
  add a,d       ; 4
  jr nc,sq5     ; 12 / 7
  res 3,d       ; 8
  db 254        ; 7
sq5:
  sub d         ; 4
  sra d         ; 8

; ----------

  inc d         ; 4
  add a,d       ; 4
  jr nc,sq4     ; 12 / 7
  res 1,d       ; 8
  db 254        ; 7
sq4:
  sub d         ; 4
  sra d         ; 8
  ld h,a        ; 4

; ----------

  add hl,de     ; 11
  jr nc,sq3     ; 12 / 7
  ld e,040h     ; 7
  db 210        ; 10
sq3:
  sbc hl,de     ; 15
  sra d         ; 8
  ld a,e        ; 4
  rra           ; 4

; ----------

  or 010h       ; 7
  ld e,a        ; 4
  add hl,de     ; 11
  jr nc,sq2     ; 12 / 7
  and 0DFh      ; 7
  db 218        ; 10
sq2:
  sbc hl,de     ; 15
  sra d         ; 8
  rra           ; 4

; ----------

  or 04h        ; 7
  ld e,a        ; 4
  add hl,de     ; 11
  jr nc,sq1     ; 12 / 7
  and 0F7h      ; 7
  db 218        ; 10
sq1:
  sbc hl,de     ; 15
  sra d         ; 8
  rra           ; 4

; ----------

  inc a         ; 4
  ld e,a        ; 4
  add hl,de     ; 11
  jr nc,sq0     ; 12 / 7
  and 0FDh      ; 7
sq0:
  sra d         ; 8
  rra           ; 4
  cpl           ; 4

Speed Optimization 2

This version uses a slightly different approach to the same algorithm as the one above, but is smaller and faster (this one includes an RET at the end, contributing 10cc and 1 byte):

; fast 16-bit isqrt by Zeda Thomas
;Feel free to use for whatever :)

sqrtHL:
;Input: HL
;Output: A is the integer square root of HL
;Destroys: HL,DE (D is actually 0)
;min: 343cc
;max: 380cc
;avg: 361.5cc
;88 bytes
  ld de,05040h  ; 10
  ld a,h        ; 4
  sub e         ; 4
  jr nc,sq7     ;\
  add a,e       ; | branch 1: 12cc
  ld d,16       ; | branch 2: 18cc
sq7:            ;/

; ----------

  cp d          ; 4
  jr c,sq6      ;\
  sub d         ; | branch 1: 12cc
  set 5,d       ; | branch 2: 19cc
sq6:            ;/

; ----------
  res 4,d       ; 8
  srl d         ; 8
  set 2,d       ; 8
  cp d          ; 4
  jr c,sq5      ;\
  sub d         ; | branch 1: 12cc
  set 3,d       ; | branch 2: 19cc
sq5:            ;/
  srl d         ; 8

; ----------

  inc a         ; 4
  sub d         ; 4
  jr nc,sq4     ;\
  dec d         ; | branch 1: 12cc
  add a,d       ; | branch 2: 19cc
  dec d         ; | <-- this resets the low bit of D, so `srl d` resets carry.
sq4:            ;/
  srl d         ; 8
  ld h,a        ; 4

; ----------

  ld a,e        ; 4
  sbc hl,de     ; 15
  jr nc,sq3     ;\
  add hl,de     ; | 12cc or 18cc
sq3:            ;/
  ccf           ; 4
  rra           ; 4
  srl d         ; 8
  rra           ; 4

; ----------

  ld e,a        ; 4
  sbc hl,de     ; 15
  jr c,sq2      ;\
  or 20h        ; | branch 1: 23cc
  db 254        ; |   <-- start of `cp *` which is 7cc to skip the next byte.
sq2:            ; | branch 2: 21cc
  add hl,de     ;/

  xor 18h       ; 7
  srl d         ; 8
  rra           ; 4

; ----------

  ld e,a        ; 4
  sbc hl,de     ; 15
  jr c,sq1      ;\
  or 8          ; | branch 1: 23cc
  db 254        ; |   <-- start of `cp *` which is 7cc to skip the next byte.
sq1:            ; | branch 2: 21cc
  add hl,de     ;/

  xor 6         ; 7
  srl d         ; 8
  rra           ; 4

; ----------

  ld e,a        ; 4
  sbc hl,de     ; 15

;This code would restore the square root
;   jr nc,sq0      ;\
;   add hl,de     ; | 12cc or 18cc
; sq0:            ;/

  sbc a,255     ; 7
  srl d         ; 8
  rra           ; 4
  ret           ; 10
 

Speed Optimization, preserve remainder

This is essentially the same as the algorithm above, but with an optimized final step that preserves the remainder. The remainder is possibly useful for higher precision square roots.

;written by Zeda
sqrtHL:
;returns A as the sqrt, HL as the remainder, D = 0
;min: 352cc
;max: 391cc
;avg: 371.5cc


  ld de,05040h  ; 10
  ld a,h        ; 4
  sub e         ; 4
  jr nc,sq7     ;\
  add a,e       ; | branch 1: 12cc
  ld d,16       ; | branch 2: 18cc
sq7:            ;/

; ----------

  cp d          ; 4
  jr c,sq6      ;\
  sub d         ; | branch 1: 12cc
  set 5,d       ; | branch 2: 19cc
sq6:            ;/

; ----------
  res 4,d       ; 8
  srl d         ; 8
  set 2,d       ; 8
  cp d          ; 4
  jr c,sq5      ;\
  sub d         ; | branch 1: 12cc
  set 3,d       ; | branch 2: 19cc
sq5:            ;/
  srl d         ; 8

; ----------

  inc a         ; 4
  sub d         ; 4
  jr nc,sq4     ;\
  dec d         ; | branch 1: 12cc
  add a,d       ; | branch 2: 19cc
  dec d         ; | <-- this resets the low bit of D, so `srl d` resets carry.
sq4:            ;/
  srl d         ; 8
  ld h,a        ; 4

; ----------

  ld a,e        ; 4
  sbc hl,de     ; 15
  jr nc,sq3     ;\
  add hl,de     ; | 12cc or 18cc
sq3:            ;/
  ccf           ; 4
  rra           ; 4
  srl d         ; 8
  rra           ; 4

; ----------

  ld e,a        ; 4
  sbc hl,de     ; 15
  jr c,sq2      ;\
  or 20h        ; | branch 1: 23cc
  db 254        ; |   <-- start of `cp *` which is 7cc to skip the next byte.
sq2:            ; | branch 2: 21cc
  add hl,de     ;/

  xor 18h       ; 7
  srl d         ; 8
  rra           ; 4

; ----------

  ld e,a        ; 4
  sbc hl,de     ; 15
  jr c,sq1      ;\
  or 8          ; | branch 1: 23cc
  db 254        ; |   <-- start of `cp *` which is 7cc to skip the next byte.
sq1:            ; | branch 2: 21cc
  add hl,de     ;/

  xor 6         ; 7
  srl d         ; 8
  rra           ; 4

; ----------

  ld e,a        ; 4
  sbc hl,de     ; 15
  jr nc,+_      ;    \
  add hl,de     ; 15  |
  srl d         ; 8   |
  rra           ; 4   | branch 1: 38cc
  ret           ; 10  | branch 2: 40cc
_:              ;     |
  inc a         ; 4   |
  srl d         ; 8   |
  rra           ; 4   |
  ret           ; 10 /
 

32-bit Square Root (unrolled)

This routine uses a remainder preserving 16-bit integer square root routine as a subroutine.

;NOTE: This uses undocumented instructions `sll` (a.k.a. `slia`) and `ld a,ixh` and `ld a,ixl`

sqrtHLIX:
;Input: HLIX
;Output: DE is the sqrt, AHL is the remainder
;speed: 751+6{0,6}+{0,3+{0,18}}+{0,38}+sqrtHL
;min: 1103
;max: 1237
;avg: 1165.5
;166 bytes

  call sqrtHL   ;expects returns A as sqrt, HL as remainder, D = 0
  add a,a
  ld e,a
  rl d

  ld a,ixh
  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

;Now we have four more iterations
;The first two are no problem
  ld a,ixl
  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

sqrt32_iter15:
;On the next iteration, HL might temporarily overflow by 1 bit
  sll e \ rl d      ;sla e \ rl d \ inc e
  add a,a
  adc hl,hl
  add a,a
  adc hl,hl       ;This might overflow!
  jr c,sqrt32_iter15_br0
;
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  jr sqrt32_iter16
sqrt32_iter15_br0:
  or a
  sbc hl,de
_:
  inc e

;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
  add a,a
  ld b,a        ;either 0x00 or 0x80
  adc hl,hl
  rla
  adc hl,hl
  rla
;AHL - (DE+DE+1)
  sbc hl,de \ sbc a,b
  inc e
  or a
  sbc hl,de \ sbc a,b
  ret p
  add hl,de
  adc a,b
  dec e
  add hl,de
  adc a,b
  ret
 

Other Options

A binary search of a square table would yield much better best case scenarios and the worst case scenarios would be similar to the high school method. However this would also require 512 byte table making it significantly larger than the other routines. Of course the table could also serve as a rapid squaring method.

Credits and Contributions