Difference between revisions of "Z80 Routines:Optimized:PopCountA"

From WikiTI
Jump to: navigation, search
(Created page with "PopCountA PopCountA This is a faster/smaller (and also obfuscated) replacement for the normal popcount a, which r...")
 
(Add more optimized popcount)
 
(4 intermediate revisions by one other user not shown)
Line 6: Line 6:
 
  <nowiki>;input: b
 
  <nowiki>;input: b
 
;output: a
 
;output: a
;27 bytes, 108 clock cycles
+
;destroys: f, c
 +
;26 bytes, 104 clock cycles
 
TypicalPopCountA:
 
TypicalPopCountA:
ld c,a
 
 
xor a
 
xor a
ld b,a
+
ld c, a
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b
+
adc a, c
rrc c
+
rrc b
adc a,b</nowiki>
+
adc a, c</nowiki>
  
 
Better routine:
 
Better routine:
 
  <nowiki>;- Pop Count A
 
  <nowiki>;- Pop Count A
;input: byte in A
+
;input: byte in a
;output: number of set bits in A
+
;output: number of set bits in a
;destroys BC
+
;destroys f, bc
 
;22 bytes and 85 clock cycles
 
;22 bytes and 85 clock cycles
 
;author: jacobly
 
;author: jacobly
 
PopCountA:
 
PopCountA:
 +
ld c, a ; c=(A|B|C|D|E|F|G|H)
 +
and 10101010b ; a=(A|0|C|0|E|0|G|0)
 +
cpl ; a=(~A|1|~C|1|~E|1|~G|1)
 +
rrca ; a=(1|~A|1|~C|1|~E|1|~G), cf=1
 +
adc a, c ; a=(A+B|C+D|E+F|G+H)
 +
ld b, a ; b=(A+B|C+D|E+F|G+H)
 +
and 00110011b ; a=(00|C+D|00|G+H)
 +
ld c, a ; c=(00|C+D|00|G+H)
 +
xor b ; a=(A+B|00|E+F|00)
 +
rrca \ rrca ; a=(00|A+B|00|E+F)
 +
add a, c ; a=(A+B+C+D|E+F+G+H)
 +
ld c, a ; c=(A+B+C+D|E+F+G+H)
 +
rrca \ rrca \ rrca \ rrca ; a=(E+F+G+H|A+B+C+D)
 +
add a, c ; a=(A+B+C+D+E+F+G+H|A+B+C+D+E+F+G+H)
 +
and 00001111b ; a=A+B+C+D+E+F+G+H</nowiki>
 +
 +
Even better routine, making the following observations:
 +
 +
<nowiki>
 +
a = (a>>1) + (a>>1) + (a&1)
 +
a = (a>>1) + ((a>>2) + (a>>2) + ((a>>1)&1)) + (a&1)
 +
...
 +
a = (a>>1) + (a>>2) + ... + (a>>7) + (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1)
 +
a - (a>>1) - (a>>2) - ... - (a>>7) = (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1)</nowiki>
 +
 +
<nowiki>;- Pop Count A
 +
;input: byte in a
 +
;output: number of set bits in a
 +
;destroys f, c
 +
;21 bytes and 84 clock cycles
 +
;author: calc84maniac
 +
PopCountA:
 +
; a -= (byte>>7)
 
ld c, a
 
ld c, a
and 10101010b
 
cpl
 
rrca
 
adc a, c
 
ld b, a
 
and 00110011b
 
ld c, a
 
xor b
 
rrca
 
rrca
 
 
add a, c
 
add a, c
 +
sbc a, c
 +
; a -= (byte>>1)
 +
srl c
 +
sub c
 +
; a -= (byte>>2)
 +
srl c
 +
sub c
 +
; a -= (byte>>3)
 +
srl c
 +
sub c
 +
; a -= (byte>>4)
 +
srl c
 +
sub c
 +
; a -= (byte>>5)
 +
srl c
 +
sub c
 +
; a -= (byte>>6)
 +
srl c
 +
sub c</nowiki>
 +
 +
This algorithm can also be used to implement a size-optimized routine:
 +
<nowiki>;- Pop Count A
 +
;input: byte in a
 +
;output: number of set bits in a
 +
;destroys f, c
 +
;7 bytes and 26-194 clock cycles
 +
;author: calc84maniac
 +
PopCountA:
 
ld c, a
 
ld c, a
rrca
+
.db $FE ;CP imm8, skips subtracting c on the first iteration
rrca
+
.loop:
rrca
+
sub c
rrca
+
srl c
add a, c
+
jr nz, .loop</nowiki>
and 00001111b</nowiki>
+

Latest revision as of 10:37, 26 April 2024

This is a faster/smaller (and also obfuscated) replacement for the normal popcount a, which returns the number of set bits in a.

Typical routine:

;input: b
;output: a
;destroys: f, c
;26 bytes, 104 clock cycles
TypicalPopCountA:
	xor	a
	ld	c, a
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c
	rrc	b
	adc	a, c

Better routine:

;- Pop Count A
;input:	byte in a
;output: number of set bits in a
;destroys f, bc
;22 bytes and 85 clock cycles
;author: jacobly
PopCountA:
	ld	c, a			; c=(A|B|C|D|E|F|G|H)
	and	10101010b		; a=(A|0|C|0|E|0|G|0)
	cpl				; a=(~A|1|~C|1|~E|1|~G|1)
	rrca				; a=(1|~A|1|~C|1|~E|1|~G), cf=1
	adc	a, c			; a=(A+B|C+D|E+F|G+H)
	ld	b, a			; b=(A+B|C+D|E+F|G+H)
	and	00110011b		; a=(00|C+D|00|G+H)
	ld	c, a			; c=(00|C+D|00|G+H)
	xor	b			; a=(A+B|00|E+F|00)
	rrca \ rrca			; a=(00|A+B|00|E+F)
	add	a, c			; a=(A+B+C+D|E+F+G+H)
	ld	c, a			; c=(A+B+C+D|E+F+G+H)
	rrca \ rrca \ rrca \ rrca	; a=(E+F+G+H|A+B+C+D)
	add	a, c			; a=(A+B+C+D+E+F+G+H|A+B+C+D+E+F+G+H)
	and	00001111b		; a=A+B+C+D+E+F+G+H

Even better routine, making the following observations:

a = (a>>1) + (a>>1) + (a&1)
a = (a>>1) + ((a>>2) + (a>>2) + ((a>>1)&1)) + (a&1)
...
a = (a>>1) + (a>>2) + ... + (a>>7) + (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1)
a - (a>>1) - (a>>2) - ... - (a>>7) = (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1)
;- Pop Count A
;input:	byte in a
;output: number of set bits in a
;destroys f, c
;21 bytes and 84 clock cycles
;author: calc84maniac
PopCountA:
	; a -= (byte>>7)
	ld	c, a
	add	a, c
	sbc	a, c
	; a -= (byte>>1)
	srl	c
	sub	c
	; a -= (byte>>2)
	srl	c
	sub	c
	; a -= (byte>>3)
	srl	c
	sub	c
	; a -= (byte>>4)
	srl	c
	sub	c
	; a -= (byte>>5)
	srl	c
	sub	c
	; a -= (byte>>6)
	srl	c
	sub	c

This algorithm can also be used to implement a size-optimized routine:

;- Pop Count A
;input:	byte in a
;output: number of set bits in a
;destroys f, c
;7 bytes and 26-194 clock cycles
;author: calc84maniac
PopCountA:
	ld	c, a
	.db	$FE	;CP imm8, skips subtracting c on the first iteration
.loop:
	sub	c
	srl	c
	jr	nz, .loop