Skip to content

Commit

Permalink
rework Digest.mem to be passed as side array
Browse files Browse the repository at this point in the history
```
> benchstat /tmp/{old,new,extra}.results
goos: linux
goarch: amd64
pkg: github.com/cespare/xxhash/v2
cpu: AMD Ryzen 9 7950X 16-Core Processor
                 │ no avx512        │ avx512                               │ avx512+extra                        │
                 │      sec/op      │    sec/op     vs base                │   sec/op     vs base                │
DigestBytes/4B          9.358n ± 2%    9.377n ± 2%        ~ (p=0.481 n=10)   5.873n ± 5%  -37.24% (p=0.000 n=10)
DigestBytes/16B        11.485n ± 3%   11.485n ± 1%        ~ (p=0.469 n=10)   7.292n ± 5%  -36.51% (p=0.000 n=10)
DigestBytes/100B        15.97n ± 2%    20.41n ± 3%  +27.80% (p=0.000 n=10)   20.31n ± 3%  +27.18% (p=0.000 n=10)
DigestBytes/4KB         212.4n ± 2%    167.7n ± 3%  -21.09% (p=0.000 n=10)   163.5n ± 3%  -23.02% (p=0.000 n=10)
DigestBytes/10MB        493.2µ ± 3%    380.3µ ± 2%  -22.90% (p=0.000 n=10)   375.1µ ± 2%  -23.94% (p=0.000 n=10)
geomean                 178.2n         169.6n        -4.87%                  139.8n       -21.57%

                 │ no avx512        │ avx512                               │ avx512+extra                         │
                 │       B/s        │     B/s       vs base                │     B/s       vs base                │
DigestBytes/4B         407.6Mi ± 2%   406.8Mi ± 2%        ~ (p=0.481 n=10)   649.6Mi ± 5%  +59.35% (p=0.000 n=10)
DigestBytes/16B        1.298Gi ± 3%   1.298Gi ± 1%        ~ (p=0.529 n=10)   2.044Gi ± 5%  +57.49% (p=0.000 n=10)
DigestBytes/100B       5.831Gi ± 2%   4.563Gi ± 3%  -21.75% (p=0.000 n=10)   4.586Gi ± 3%  -21.35% (p=0.000 n=10)
DigestBytes/4KB        17.54Gi ± 2%   22.22Gi ± 3%  +26.72% (p=0.000 n=10)   22.78Gi ± 3%  +29.90% (p=0.000 n=10)
DigestBytes/10MB       18.88Gi ± 2%   24.49Gi ± 2%  +29.69% (p=0.000 n=10)   24.83Gi ± 2%  +31.48% (p=0.000 n=10)
geomean                3.979Gi        4.183Gi        +5.12%                  5.074Gi       +27.51%
```
  • Loading branch information
Jorropo committed May 8, 2024
1 parent 27324bc commit 53f0391
Show file tree
Hide file tree
Showing 8 changed files with 152 additions and 110 deletions.
54 changes: 31 additions & 23 deletions gen/avx512.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,32 +28,29 @@ func mergeRound(h /*inout*/, v /*in-destroy*/, p1, p2, p4 reg.GPVirtual) {
ADDQ(p4, h)
}

// round emits the instructions for one vectorized xxhash round: it mixes the
// 32 bytes found at p into state, treating both as four 64-bit lanes.
//
// state is updated in place. p, yprime1 and yprime2 are only read; the callers
// in this file pass prime1 and prime2 broadcast to every lane as yprime1 and
// yprime2. p is not advanced here, the caller is responsible for that.
func round(state /*inout*/, p, yprime1, yprime2 reg.Register) {
temp := YMM()
// Load the 32-byte input block from memory at p (unaligned load).
VMOVDQU(Mem{Base: p}, temp)
// temp = temp * yprime2, per 64-bit lane.
VPMULLQ(temp, yprime2, temp)
// state = state + temp, per 64-bit lane.
VPADDQ(temp, state, state)
// state = rotate-left(state, 31), per 64-bit lane.
VPROLQ(Imm(31), state, state)
// state = state * yprime1, per 64-bit lane.
VPMULLQ(state, yprime1, state)
}

// blockLoop handles 32 bytes at a time in one YMM register.
// It assumes n is 32 bytes or more.
// state represents v1, v2, v3, v4 as 4 × uint64.
func blockLoop(state /*inout*/, p /*inout*/, n, p1, p2, processed /*out-optional*/ reg.Register) {
func blockLoop(state /*inout*/, p /*inout*/, n, yprime1, yprime2 reg.Register) {
endp := GP64()
MOVL(U32(31), endp.As32())
ANDNQ(n, endp, endp)
if processed != nil {
MOVQ(endp, processed)
}
ADDQ(p, endp)

yprime1 := YMM()
VPBROADCASTQ(p1, yprime1)
yprime2 := YMM()
VPBROADCASTQ(p2, yprime2)

Label("loop_32")
{
// main block loop
temp := YMM()
VMOVDQU(Mem{Base: p}, temp)
round(state, p, yprime1, yprime2)
ADDQ(Imm(32), p)
VPMULLQ(temp, yprime2, temp)
VPADDQ(temp, state, state)
VPROLQ(Imm(31), state, state)
VPMULLQ(state, yprime1, state)

CMPQ(p, endp)
JNE(LabelRef("loop_32"))
Expand All @@ -68,7 +65,7 @@ func sum64() {
DATA(16, U64(0))
DATA(24, U64(-prime1))

TEXT("sum64avx512", NOSPLIT|NOFRAME, "func(b []byte) uint64")
TEXT("sum64Avx512", NOSPLIT|NOFRAME, "func(b []byte) uint64")
p := Load(Param("b").Base(), GP64())
n := Load(Param("b").Len(), GP64())

Expand All @@ -95,7 +92,11 @@ func sum64() {
state := YMM()
VMOVDQU(initStateAvx512, state)

blockLoop(state, p, n, p1, p2, nil)
yprime1, yprime2 := YMM(), YMM()
VPBROADCASTQ(p1, yprime1)
VPBROADCASTQ(p2, yprime2)

blockLoop(state, p, n, yprime1, yprime2)

// This interleaves two things: extracting v1, v2, v3, v4 from state and computing h.
v1, v2, v3, v4, temp := GP64(), GP64(), GP64(), GP64(), GP64()
Expand Down Expand Up @@ -212,24 +213,31 @@ func sum64() {
}

func writeBlocks() {
TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, b []byte) int")
TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, extra *[32]byte, b []byte)")
d := Load(Param("d"), GP64())
extra := Load(Param("extra"), GP64())
p := Load(Param("b").Base(), GP64())
n := Load(Param("b").Len(), GP64())

state := YMM()
VMOVDQU(Mem{Base: d, Disp: 0}, state)

p1, p2 := GP64(), GP64()
p1, p2, yprime1, yprime2 := GP64(), GP64(), YMM(), YMM()
MOVQ(Imm(prime1), p1)
VPBROADCASTQ(p1, yprime1)
MOVQ(Imm(prime2), p2)
VPBROADCASTQ(p2, yprime2)

TESTQ(extra, extra)
JZ(LabelRef("skip_extra"))
{
round(state, extra, yprime1, yprime2)
}
Label("skip_extra")

processed := GP64()
blockLoop(state, p, n, p1, p2, processed)
blockLoop(state, p, n, yprime1, yprime2)
VMOVDQU(state, Mem{Base: d, Disp: 0})
VZEROUPPER()

Store(processed, ReturnIndex(0))
RET()
}

Expand Down
47 changes: 22 additions & 25 deletions xxhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ type Digest struct {
s [4]uint64
total uint64
mem [32]byte
n int // how much of mem is used
n uint8 // how much of mem is used
}

// New creates a new Digest with a zero seed.
Expand Down Expand Up @@ -73,35 +73,32 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)

memleft := d.mem[d.n&(len(d.mem)-1):]

if d.n+n < 32 {
// This new data doesn't even fill the current block.
copy(memleft, b)
d.n += n
return
}

if d.n > 0 {
// Finish off the partial block.
c := copy(memleft, b)
d.s[0] = round(d.s[0], u64(d.mem[0:8]))
d.s[1] = round(d.s[1], u64(d.mem[8:16]))
d.s[2] = round(d.s[2], u64(d.mem[16:24]))
d.s[3] = round(d.s[3], u64(d.mem[24:32]))
b = b[c:]
var extra *[32]byte
if d.n != 0 {
// there is data already in mem, append to it.
added := copy(d.mem[d.n:], b)
b = b[added:]
d.n += uint8(added)
if uint(d.n) < uint(len(d.mem)) {
// not enough data to hash.
return
}
extra = &d.mem
d.n = 0
}

if len(b) >= 32 {
// One or more full blocks left.
nw := writeBlocks(d, b)
b = b[nw:]
writeBlocks(d, extra, b)
b = b[uint(len(b))&^31:]
} else if extra != nil {
// b holds less than one full block, but there is a buffered extra block.
// writeBlocks must never be called with len(b) < 32, so pass extra as b.
writeBlocks(d, nil, extra[:])
}

// Store any remaining partial block.
copy(d.mem[:], b)
d.n = len(b)
d.n = uint8(copy(d.mem[:], b))

return
}
Expand Down Expand Up @@ -139,7 +136,7 @@ func (d *Digest) Sum64() uint64 {

h += d.total

b := d.mem[:d.n&(len(d.mem)-1)]
b := d.mem[:d.n&uint8(len(d.mem)-1)]
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
Expand Down Expand Up @@ -179,7 +176,7 @@ func (d *Digest) MarshalBinary() ([]byte, error) {
b = appendUint64(b, d.s[3])
b = appendUint64(b, d.total)
b = append(b, d.mem[:d.n]...)
b = b[:len(b)+len(d.mem)-d.n]
b = b[:len(b)+len(d.mem)-int(d.n)]
return b, nil
}

Expand All @@ -198,7 +195,7 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
b, d.s[3] = consumeUint64(b)
b, d.total = consumeUint64(b)
copy(d.mem[:], b)
d.n = int(d.total % uint64(len(d.mem)))
d.n = uint8(d.total % uint64(len(d.mem)))
return nil
}

Expand Down
49 changes: 28 additions & 21 deletions xxhash_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -40,25 +40,29 @@
IMULQ prime1, acc \
ADDQ prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
// round32 performs one 32-byte round on v1, v2, v3, and v4, loading the input from p.
#define round32() \
MOVQ +0(p), x \
round(v1, x) \
MOVQ +8(p), x \
round(v2, x) \
MOVQ +16(p), x \
round(v3, x) \
MOVQ +24(p), x \
round(v4, x) \
ADDQ $32, p \
CMPQ p, end \
round(v4, x)

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
round32() \
ADDQ $32, p \
CMPQ p, end \
JLE loop

// func sum64Scalar(b []byte) uint64
TEXT ·sum64scallar(SB), NOSPLIT|NOFRAME, $0-32
TEXT ·sum64Scalar(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
Expand Down Expand Up @@ -173,25 +177,32 @@ finalize:
MOVQ h, ret+24(FP)
RET

// func writeBlocksScallar(d *Digest, b []byte) int
TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
// func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
TEXT ·writeBlocksScalar(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2

// Load slice.
MOVQ b_base+8(FP), p
MOVQ b_len+16(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end

// Load vN from d.
MOVQ s+0(FP), d
MOVQ 0(d), v1
MOVQ 8(d), v2
MOVQ 16(d), v3
MOVQ 24(d), v4

// Handle extra
MOVQ extra+8(FP), p
TESTQ p, p
JZ noExtra
round32()
noExtra:

// Load slice.
MOVQ b_base+16(FP), p
MOVQ b_len+24(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end

// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
blockLoop()
Expand All @@ -202,8 +213,4 @@ TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
MOVQ v3, 16(d)
MOVQ v4, 24(d)

// The number of bytes written is p minus the old base pointer.
SUBQ b_base+8(FP), p
MOVQ p, ret+32(FP)

RET
6 changes: 2 additions & 4 deletions xxhash_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ finalize:
MOVD h, ret+24(FP)
RET

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// func writeBlocksArm64(d *Digest, b []byte)
TEXT ·writeBlocksArm64(SB), NOSPLIT|NOFRAME, $0-32
LDP ·primes+0(SB), (prime1, prime2)

// Load state. Assume v[1-4] are stored contiguously.
Expand All @@ -178,6 +178,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)

BIC $31, n
MOVD n, ret+32(FP)
RET
20 changes: 11 additions & 9 deletions xxhash_asm_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,28 @@ var useAvx512 = cpuid.CPU.Supports(cpuid.AVX, cpuid.AVX2, cpuid.AVX512DQ, cpuid.
// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
func Sum64(b []byte) uint64 {
if useAvx512 {
return sum64avx512(b)
return sum64Avx512(b)
}
return sum64scallar(b)
return sum64Scalar(b)
}

//go:noescape
func sum64scallar(b []byte) uint64
func sum64Scalar(b []byte) uint64

//go:noescape
func sum64avx512(b []byte) uint64
func sum64Avx512(b []byte) uint64

func writeBlocks(d *Digest, b []byte) int {
// extra is a first block before b, it may be nil then skip it.
func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
if useAvx512 {
return writeBlocksAvx512(&d.s, b)
writeBlocksAvx512(&d.s, extra, b)
return
}
return writeBlocksScallar(d, b)
writeBlocksScalar(d, nil, b)
}

//go:noescape
func writeBlocksAvx512(d *[4]uint64, b []byte) int
func writeBlocksAvx512(d *[4]uint64, extra *[32]byte, b []byte)

//go:noescape
func writeBlocksScallar(d *Digest, b []byte) int
func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
14 changes: 13 additions & 1 deletion xxhash_asm_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,17 @@ var useAvx512 = false
//go:noescape
func Sum64(b []byte) uint64

// writeBlocks mixes the optional buffered block extra, followed by b, into d.
// A nil extra is skipped.
func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
	if extra == nil {
		writeBlocksArm64(d, b)
		return
	}
	// FIXME: handle extra inside the assembly routine instead of issuing a
	// separate call for it; this was left undone for lack of familiarity
	// with the arm64 Plan 9 assembly syntax. Hopefully this is at least on
	// par with the speed of the software implementation it replaces.
	writeBlocksArm64(d, extra[:])
	writeBlocksArm64(d, b)
}

//go:noescape
func writeBlocks(d *Digest, b []byte) int
func writeBlocksArm64(d *Digest, b []byte)
Loading

0 comments on commit 53f0391

Please sign in to comment.