Difference between revisions of "Z80 Routines:Optimized:PopCountA"
From WikiTI
(Created page with "PopCountA PopCountA This is a faster/smaller (and also obfuscated) replacement for the normal popcount a, which r...") |
Calc84maniac (Talk | contribs) (Add more optimized popcount) |
||
(4 intermediate revisions by one other user not shown) | |||
Line 6: | Line 6: | ||
<nowiki>;input: b | <nowiki>;input: b | ||
;output: a | ;output: a | ||
− | ; | + | ;destroys: f, c |
+ | ;26 bytes, 104 clock cycles | ||
TypicalPopCountA: | TypicalPopCountA: | ||
− | |||
xor a | xor a | ||
− | ld | + | ld c, a |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c |
− | rrc | + | rrc b |
− | adc a, | + | adc a, c</nowiki> |
Better routine: | Better routine: | ||
<nowiki>;- Pop Count A | <nowiki>;- Pop Count A | ||
− | ;input: byte in | + | ;input: byte in a |
− | ;output: number of set bits in | + | ;output: number of set bits in a |
− | ;destroys | + | ;destroys f, bc |
;22 bytes and 85 clock cycles | ;22 bytes and 85 clock cycles | ||
;author: jacobly | ;author: jacobly | ||
PopCountA: | PopCountA: | ||
+ | ld c, a ; c=(A|B|C|D|E|F|G|H) | ||
+ | and 10101010b ; a=(A|0|C|0|E|0|G|0) | ||
+ | cpl ; a=(~A|1|~C|1|~E|1|~G|1) | ||
+ | rrca ; a=(1|~A|1|~C|1|~E|1|~G), cf=1 | ||
+ | adc a, c ; a=(A+B|C+D|E+F|G+H) | ||
+ | ld b, a ; b=(A+B|C+D|E+F|G+H) | ||
+ | and 00110011b ; a=(00|C+D|00|G+H) | ||
+ | ld c, a ; c=(00|C+D|00|G+H) | ||
+ | xor b ; a=(A+B|00|E+F|00) | ||
+ | rrca \ rrca ; a=(00|A+B|00|E+F) | ||
+ | add a, c ; a=(A+B+C+D|E+F+G+H) | ||
+ | ld c, a ; c=(A+B+C+D|E+F+G+H) | ||
+ | rrca \ rrca \ rrca \ rrca ; a=(E+F+G+H|A+B+C+D) | ||
+ | add a, c ; a=(A+B+C+D+E+F+G+H|A+B+C+D+E+F+G+H) | ||
+ | and 00001111b ; a=A+B+C+D+E+F+G+H</nowiki> | ||
+ | |||
+ | Even better routine, making the following observations: | ||
+ | |||
+ | <nowiki> | ||
+ | a = (a>>1) + (a>>1) + (a&1) | ||
+ | a = (a>>1) + ((a>>2) + (a>>2) + ((a>>1)&1)) + (a&1) | ||
+ | ... | ||
+ | a = (a>>1) + (a>>2) + ... + (a>>7) + (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1) | ||
+ | a - (a>>1) - (a>>2) - ... - (a>>7) = (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1)</nowiki> | ||
+ | |||
+ | <nowiki>;- Pop Count A | ||
+ | ;input: byte in a | ||
+ | ;output: number of set bits in a | ||
+ | ;destroys f, c | ||
+ | ;21 bytes and 84 clock cycles | ||
+ | ;author: calc84maniac | ||
+ | PopCountA: | ||
+ | ; a -= (byte>>7) | ||
ld c, a | ld c, a | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
add a, c | add a, c | ||
+ | sbc a, c | ||
+ | ; a -= (byte>>1) | ||
+ | srl c | ||
+ | sub c | ||
+ | ; a -= (byte>>2) | ||
+ | srl c | ||
+ | sub c | ||
+ | ; a -= (byte>>3) | ||
+ | srl c | ||
+ | sub c | ||
+ | ; a -= (byte>>4) | ||
+ | srl c | ||
+ | sub c | ||
+ | ; a -= (byte>>5) | ||
+ | srl c | ||
+ | sub c | ||
+ | ; a -= (byte>>6) | ||
+ | srl c | ||
+ | sub c</nowiki> | ||
+ | |||
+ | This algorithm can also be used to implement a size-optimized routine: | ||
+ | <nowiki>;- Pop Count A | ||
+ | ;input: byte in a | ||
+ | ;output: number of set bits in a | ||
+ | ;destroys f, c | ||
+ | ;7 bytes and 26-194 clock cycles | ||
+ | ;author: calc84maniac | ||
+ | PopCountA: | ||
ld c, a | ld c, a | ||
− | + | .db $FE ;CP imm8, skips subtracting c on the first iteration | |
− | + | .loop: | |
− | + | sub c | |
− | + | srl c | |
− | + | jr nz, .loop</nowiki> | |
− | + |
Latest revision as of 10:37, 26 April 2024
This is a faster/smaller (and also obfuscated) replacement for the normal popcount a, which returns the number of set bits in a.
Typical routine:
;input: b ;output: a ;destroys: f, c ;26 bytes, 104 clock cycles TypicalPopCountA: xor a ld c, a rrc b adc a, c rrc b adc a, c rrc b adc a, c rrc b adc a, c rrc b adc a, c rrc b adc a, c rrc b adc a, c rrc b adc a, c
Better routine:
;- Pop Count A ;input: byte in a ;output: number of set bits in a ;destroys f, bc ;22 bytes and 85 clock cycles ;author: jacobly PopCountA: ld c, a ; c=(A|B|C|D|E|F|G|H) and 10101010b ; a=(A|0|C|0|E|0|G|0) cpl ; a=(~A|1|~C|1|~E|1|~G|1) rrca ; a=(1|~A|1|~C|1|~E|1|~G), cf=1 adc a, c ; a=(A+B|C+D|E+F|G+H) ld b, a ; b=(A+B|C+D|E+F|G+H) and 00110011b ; a=(00|C+D|00|G+H) ld c, a ; c=(00|C+D|00|G+H) xor b ; a=(A+B|00|E+F|00) rrca \ rrca ; a=(00|A+B|00|E+F) add a, c ; a=(A+B+C+D|E+F+G+H) ld c, a ; c=(A+B+C+D|E+F+G+H) rrca \ rrca \ rrca \ rrca ; a=(E+F+G+H|A+B+C+D) add a, c ; a=(A+B+C+D+E+F+G+H|A+B+C+D+E+F+G+H) and 00001111b ; a=A+B+C+D+E+F+G+H
Even better routine, making the following observations:
a = (a>>1) + (a>>1) + (a&1) a = (a>>1) + ((a>>2) + (a>>2) + ((a>>1)&1)) + (a&1) ... a = (a>>1) + (a>>2) + ... + (a>>7) + (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1) a - (a>>1) - (a>>2) - ... - (a>>7) = (a>>7) + ((a>>6)&1) + ... + ((a>>1)&1) + (a&1)
;- Pop Count A ;input: byte in a ;output: number of set bits in a ;destroys f, c ;21 bytes and 84 clock cycles ;author: calc84maniac PopCountA: ; a -= (byte>>7) ld c, a add a, c sbc a, c ; a -= (byte>>1) srl c sub c ; a -= (byte>>2) srl c sub c ; a -= (byte>>3) srl c sub c ; a -= (byte>>4) srl c sub c ; a -= (byte>>5) srl c sub c ; a -= (byte>>6) srl c sub c
This algorithm can also be used to implement a size-optimized routine:
;- Pop Count A ;input: byte in a ;output: number of set bits in a ;destroys f, c ;7 bytes and 26-194 clock cycles ;author: calc84maniac PopCountA: ld c, a .db $FE ;CP imm8, skips subtracting c on the first iteration .loop: sub c srl c jr nz, .loop