Skip to content

Commit

Permalink
add avx512 routines for writeBlocks (thus Digest)
Browse files Browse the repository at this point in the history
```
goos: linux
goarch: amd64
pkg: github.com/cespare/xxhash/v2
cpu: AMD Ryzen 9 7950X 16-Core Processor
                 │ /tmp/old.results │          /tmp/new.results           │
                 │      sec/op      │   sec/op     vs base                │
DigestBytes/4B          9.358n ± 2%   9.377n ± 2%        ~ (p=0.481 n=10)
DigestBytes/16B         11.48n ± 3%   11.48n ± 1%        ~ (p=0.469 n=10)
DigestBytes/100B        15.97n ± 2%   20.41n ± 3%  +27.80% (p=0.000 n=10)
DigestBytes/4KB         212.4n ± 2%   167.7n ± 3%  -21.09% (p=0.000 n=10)
DigestBytes/10MB        493.2µ ± 3%   380.3µ ± 2%  -22.90% (p=0.000 n=10)
geomean                 178.2n        169.6n        -4.87%

                 │ /tmp/old.results │           /tmp/new.results           │
                 │       B/s        │     B/s       vs base                │
DigestBytes/4B         407.6Mi ± 2%   406.8Mi ± 2%        ~ (p=0.481 n=10)
DigestBytes/16B        1.298Gi ± 3%   1.298Gi ± 1%        ~ (p=0.529 n=10)
DigestBytes/100B       5.831Gi ± 2%   4.563Gi ± 3%  -21.75% (p=0.000 n=10)
DigestBytes/4KB        17.54Gi ± 2%   22.22Gi ± 3%  +26.72% (p=0.000 n=10)
DigestBytes/10MB       18.88Gi ± 2%   24.49Gi ± 2%  +29.69% (p=0.000 n=10)
geomean                3.979Gi        4.183Gi        +5.12%
```
  • Loading branch information
Jorropo committed May 8, 2024
1 parent a767147 commit 234a82e
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 58 deletions.
87 changes: 60 additions & 27 deletions gen/avx512.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,39 @@ func mergeRound(h /*inout*/, v /*in-destroy*/, p1, p2, p4 reg.GPVirtual) {
ADDQ(p4, h)
}

func main() {
// blockLoop emits the main block loop, which handles 32 bytes at a time in
// one YMM register.
//
// state represents v1, v2, v3, v4 as 4 × uint64 and is updated in place.
// p is the input pointer; it is advanced by 32 for every block consumed.
// n is the input length in bytes; only n rounded down to a multiple of 32 is
// consumed. p1 and p2 hold the 64-bit multipliers that are broadcast to every
// lane (writeBlocks loads prime1 and prime2 into them).
// If processed is non-nil, the number of bytes the loop consumes is stored in
// it.
//
// The loop body runs before the exit test and the test is an exact pointer
// comparison, so the emitted code is only correct when n >= 32.
func blockLoop(state /*inout*/, p /*inout*/, n, p1, p2, processed /*out-optional*/ reg.Register) {
// endp = n &^ 31: the byte count rounded down to whole 32-byte blocks.
endp := GP64()
MOVL(U32(31), endp.As32())
ANDNQ(n, endp, endp)
if processed != nil {
MOVQ(endp, processed)
}
// Turn the byte count into the end pointer used as the loop bound.
ADDQ(p, endp)

// Broadcast each multiplier into all four 64-bit lanes.
yprime1 := YMM()
VPBROADCASTQ(p1, yprime1)
yprime2 := YMM()
VPBROADCASTQ(p2, yprime2)

Label("loop_32")
{
// main block loop
// Per lane: state = rotl(state + input*p2, 31) * p1.
temp := YMM()
VMOVDQU(Mem{Base: p}, temp)
ADDQ(Imm(32), p)
VPMULLQ(temp, yprime2, temp)
VPADDQ(temp, state, state)
VPROLQ(Imm(31), state, state)
VPMULLQ(state, yprime1, state)

// Bottom-tested exit: keep looping until p reaches endp exactly.
CMPQ(p, endp)
JNE(LabelRef("loop_32"))
}
}

func sum64() {
initStateAvx512 := GLOBL("·initWideAvx512", NOSPLIT|NOPTR)
prime1 := prime1
DATA(0, U64(prime1+prime2))
Expand Down Expand Up @@ -60,35 +92,10 @@ func main() {
JBE(LabelRef("loop_8"))

{
// This loop handles 32 bytes at a time in one YMM register.
// state represent v1, v2, v3, v4 as 4 × uint64.
state := YMM()
VMOVDQU(initStateAvx512, state)

endp := GP64()
MOVL(U32(31), endp.As32())
ANDNQ(n, endp, endp)
ADDQ(p, endp)

yprime1 := YMM()
VPBROADCASTQ(p1, yprime1)
yprime2 := YMM()
VPBROADCASTQ(p2, yprime2)

Label("loop_32")
{
// main block loop
temp := YMM()
VMOVDQU(Mem{Base: p}, temp)
ADDQ(Imm(32), p)
VPMULLQ(temp, yprime2, temp)
VPADDQ(temp, state, state)
VPROLQ(Imm(31), state, state)
VPMULLQ(state, yprime1, state)

CMPQ(p, endp)
JNE(LabelRef("loop_32"))
}
blockLoop(state, p, n, p1, p2, nil)

// This interleaves two things: extracting v1,2,3,4 from state and computing h.
v1, v2, v3, v4, temp := GP64(), GP64(), GP64(), GP64(), GP64()
Expand Down Expand Up @@ -203,6 +210,32 @@ func main() {
MOVQ(U64(xxhash.Sum64([]byte{})), h)
Store(h, ReturnIndex(0))
RET()
}

// writeBlocks emits writeBlocksAvx512, which loads the four 64-bit
// accumulators stored at d, runs blockLoop over b, stores the updated
// accumulators back to d and returns the byte count reported by blockLoop.
func writeBlocks() {
TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, b []byte) int")
d := Load(Param("d"), GP64())
p := Load(Param("b").Base(), GP64())
n := Load(Param("b").Len(), GP64())

// Load all four accumulators from *d into a single YMM register.
state := YMM()
VMOVDQU(Mem{Base: d, Disp: 0}, state)

// Materialize the two multipliers that blockLoop broadcasts.
p1, p2 := GP64(), GP64()
MOVQ(Imm(prime1), p1)
MOVQ(Imm(prime2), p2)

// blockLoop stores the number of bytes it consumes in processed.
processed := GP64()
blockLoop(state, p, n, p1, p2, processed)
// Write the updated accumulators back to *d.
VMOVDQU(state, Mem{Base: d, Disp: 0})
// Zero the upper bits of the vector registers before returning.
VZEROUPPER()

Store(processed, ReturnIndex(0))
RET()
}

// main emits the sum64 routine followed by the writeBlocks routine, then
// calls Generate.
func main() {
sum64()
writeBlocks()
Generate()
}
41 changes: 19 additions & 22 deletions xxhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Note that a zero-valued Digest is not ready to receive writes.
// Call Reset or create a Digest using New before calling other methods.
type Digest struct {
v1 uint64
v2 uint64
v3 uint64
v4 uint64
s [4]uint64
total uint64
mem [32]byte
n int // how much of mem is used
Expand All @@ -57,10 +54,10 @@ func (d *Digest) Reset() {
// ResetWithSeed clears the Digest's state so that it can be reused.
// It uses the given seed to initialize the state.
func (d *Digest) ResetWithSeed(seed uint64) {
d.v1 = seed + prime1 + prime2
d.v2 = seed + prime2
d.v3 = seed
d.v4 = seed - prime1
d.s[0] = seed + prime1 + prime2
d.s[1] = seed + prime2
d.s[2] = seed
d.s[3] = seed - prime1
d.total = 0
d.n = 0
}
Expand Down Expand Up @@ -88,10 +85,10 @@ func (d *Digest) Write(b []byte) (n int, err error) {
if d.n > 0 {
// Finish off the partial block.
c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
d.s[0] = round(d.s[0], u64(d.mem[0:8]))
d.s[1] = round(d.s[1], u64(d.mem[8:16]))
d.s[2] = round(d.s[2], u64(d.mem[16:24]))
d.s[3] = round(d.s[3], u64(d.mem[24:32]))
b = b[c:]
d.n = 0
}
Expand Down Expand Up @@ -130,14 +127,14 @@ func (d *Digest) Sum64() uint64 {
var h uint64

if d.total >= 32 {
v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
v1, v2, v3, v4 := d.s[0], d.s[1], d.s[2], d.s[3]
h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
h = mergeRound(h, v1)
h = mergeRound(h, v2)
h = mergeRound(h, v3)
h = mergeRound(h, v4)
} else {
h = d.v3 + prime5
h = d.s[2] + prime5
}

h += d.total
Expand Down Expand Up @@ -176,10 +173,10 @@ const (
func (d *Digest) MarshalBinary() ([]byte, error) {
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
b = appendUint64(b, d.v1)
b = appendUint64(b, d.v2)
b = appendUint64(b, d.v3)
b = appendUint64(b, d.v4)
b = appendUint64(b, d.s[0])
b = appendUint64(b, d.s[1])
b = appendUint64(b, d.s[2])
b = appendUint64(b, d.s[3])
b = appendUint64(b, d.total)
b = append(b, d.mem[:d.n]...)
b = b[:len(b)+len(d.mem)-d.n]
Expand All @@ -195,10 +192,10 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
return errors.New("xxhash: invalid hash state size")
}
b = b[len(magic):]
b, d.v1 = consumeUint64(b)
b, d.v2 = consumeUint64(b)
b, d.v3 = consumeUint64(b)
b, d.v4 = consumeUint64(b)
b, d.s[0] = consumeUint64(b)
b, d.s[1] = consumeUint64(b)
b, d.s[2] = consumeUint64(b)
b, d.s[3] = consumeUint64(b)
b, d.total = consumeUint64(b)
copy(d.mem[:], b)
d.n = int(d.total % uint64(len(d.mem)))
Expand Down
6 changes: 3 additions & 3 deletions xxhash_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ loop: \
JLE loop

// func sum64(b []byte) uint64
TEXT ·sum64(SB), NOSPLIT|NOFRAME, $0-32
TEXT ·sum64scallar(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
Expand Down Expand Up @@ -173,8 +173,8 @@ finalize:
MOVQ h, ret+24(FP)
RET

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// func writeBlocksScallar(d *Digest, b []byte) int
TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
Expand Down
18 changes: 14 additions & 4 deletions xxhash_asm_amd64.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//go:build amd64 && !appengine && gc && !purego
// +build amd64,!appengine,gc,!purego

//go:generate go run ./gen -out xxhash_avx512_amd64.s
//go:generate go run -tags purego ./gen -out xxhash_avx512_amd64.s

package xxhash

Expand All @@ -14,14 +14,24 @@ func Sum64(b []byte) uint64 {
if useAvx512 {
return sum64avx512(b)
}
return sum64(b)
return sum64scallar(b)
}

//go:noescape
func sum64(b []byte) uint64
func sum64scallar(b []byte) uint64

//go:noescape
func sum64avx512(b []byte) uint64

// writeBlocks forwards to the block-processing routine selected by useAvx512
// and returns that routine's result. The AVX-512 routine receives only the
// state array of d; the scalar routine receives the whole Digest.
func writeBlocks(d *Digest, b []byte) int {
	if !useAvx512 {
		return writeBlocksScallar(d, b)
	}
	return writeBlocksAvx512(&d.s, b)
}

//go:noescape
func writeBlocksAvx512(d *[4]uint64, b []byte) int

//go:noescape
func writeBlocks(d *Digest, b []byte) int
func writeBlocksScallar(d *Digest, b []byte) int
30 changes: 30 additions & 0 deletions xxhash_avx512_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,33 @@ zero:
MOVQ $0xef46db3751d8e999, R9
MOVQ R9, ret+24(FP)
RET

// func writeBlocksAvx512(d *[4]uint64, b []byte) int
// Requires: AVX, AVX2, AVX512DQ, AVX512F, AVX512VL, BMI
//
// Folds the 32-byte blocks of b into the four 64-bit accumulators at d and
// returns len(b) rounded down to a multiple of 32. The loop is bottom-tested
// with an exact pointer comparison, so len(b) must be at least 32.
TEXT ·writeBlocksAvx512(SB), NOSPLIT|NOFRAME, $0-40
// AX = d, CX = &b[0], DX = len(b).
MOVQ d+0(FP), AX
MOVQ b_base+8(FP), CX
MOVQ b_len+16(FP), DX
// Y0 = the four accumulators loaded from *d.
VMOVDQU (AX), Y0
// BX and SI hold the two multipliers broadcast below.
MOVQ $0x9e3779b185ebca87, BX
MOVQ $0xc2b2ae3d27d4eb4f, SI
// DI = len(b) &^ 31; keep a copy in DX as the return value, then turn DI
// into the end pointer.
MOVL $0x0000001f, DI
ANDNQ DX, DI, DI
MOVQ DI, DX
ADDQ CX, DI
// Y1 = BX in every lane, Y2 = SI in every lane.
VPBROADCASTQ BX, Y1
VPBROADCASTQ SI, Y2

loop_32:
// Per lane: Y0 = rotl(Y0 + input*Y2, 31) * Y1.
VMOVDQU (CX), Y3
ADDQ $0x20, CX
VPMULLQ Y3, Y2, Y3
VPADDQ Y3, Y0, Y0
VPROLQ $0x1f, Y0, Y0
VPMULLQ Y0, Y1, Y0
// Continue until the input pointer reaches the end pointer exactly.
CMPQ CX, DI
JNE loop_32
// Store the accumulators back to *d and return the consumed byte count.
VMOVDQU Y0, (AX)
VZEROUPPER
MOVQ DX, ret+32(FP)
RET
4 changes: 2 additions & 2 deletions xxhash_other.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func Sum64(b []byte) uint64 {
}

func writeBlocks(d *Digest, b []byte) int {
v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
v1, v2, v3, v4 := d.s[0], d.s[1], d.s[2], d.s[3]
n := len(b)
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
Expand All @@ -73,6 +73,6 @@ func writeBlocks(d *Digest, b []byte) int {
v4 = round(v4, u64(b[24:32:len(b)]))
b = b[32:len(b):len(b)]
}
d.v1, d.v2, d.v3, d.v4 = v1, v2, v3, v4
d.s[0], d.s[1], d.s[2], d.s[3] = v1, v2, v3, v4
return n - len(b)
}

0 comments on commit 234a82e

Please sign in to comment.