diff --git a/gen/avx512.go b/gen/avx512.go
index 3689c33..68058a6 100644
--- a/gen/avx512.go
+++ b/gen/avx512.go
@@ -28,32 +28,29 @@ func mergeRound(h /*inout*/, v /*in-destroy*/, p1, p2, p4 reg.GPVirtual) {
 	ADDQ(p4, h)
 }
 
+func round(state /*inout*/, p, yprime1, yprime2 reg.Register) {
+	temp := YMM()
+	VMOVDQU(Mem{Base: p}, temp)
+	VPMULLQ(temp, yprime2, temp)
+	VPADDQ(temp, state, state)
+	VPROLQ(Imm(31), state, state)
+	VPMULLQ(state, yprime1, state)
+}
+
 // blockLoop handles 32 bytes at a time in one YMM register.
+// It assumes n is 32 bytes or more.
 // state represent v1, v2, v3, v4 as 4 × uint64.
-func blockLoop(state /*inout*/, p /*inout*/, n, p1, p2, processed /*out-optional*/ reg.Register) {
+func blockLoop(state /*inout*/, p /*inout*/, n, yprime1, yprime2 reg.Register) {
 	endp := GP64()
 	MOVL(U32(31), endp.As32())
 	ANDNQ(n, endp, endp)
-	if processed != nil {
-		MOVQ(endp, processed)
-	}
 	ADDQ(p, endp)
 
-	yprime1 := YMM()
-	VPBROADCASTQ(p1, yprime1)
-	yprime2 := YMM()
-	VPBROADCASTQ(p2, yprime2)
-
 	Label("loop_32")
 	{ // main block loop
-		temp := YMM()
-		VMOVDQU(Mem{Base: p}, temp)
+		round(state, p, yprime1, yprime2)
 		ADDQ(Imm(32), p)
-		VPMULLQ(temp, yprime2, temp)
-		VPADDQ(temp, state, state)
-		VPROLQ(Imm(31), state, state)
-		VPMULLQ(state, yprime1, state)
 
 		CMPQ(p, endp)
 		JNE(LabelRef("loop_32"))
@@ -68,7 +65,7 @@ func sum64() {
 	DATA(16, U64(0))
 	DATA(24, U64(-prime1))
 
-	TEXT("sum64avx512", NOSPLIT|NOFRAME, "func(b []byte) uint64")
+	TEXT("sum64Avx512", NOSPLIT|NOFRAME, "func(b []byte) uint64")
 	p := Load(Param("b").Base(), GP64())
 	n := Load(Param("b").Len(), GP64())
 
@@ -95,7 +92,11 @@ func sum64() {
 	state := YMM()
 	VMOVDQU(initStateAvx512, state)
 
-	blockLoop(state, p, n, p1, p2, nil)
+	yprime1, yprime2 := YMM(), YMM()
+	VPBROADCASTQ(p1, yprime1)
+	VPBROADCASTQ(p2, yprime2)
+
+	blockLoop(state, p, n, yprime1, yprime2)
 
 	// This interleave two things: extracting v1,2,3,4 from state and computing h.
 	v1, v2, v3, v4, temp := GP64(), GP64(), GP64(), GP64(), GP64()
@@ -212,24 +213,31 @@ func sum64() {
 }
 
 func writeBlocks() {
-	TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, b []byte) int")
+	TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, extra *[32]byte, b []byte)")
 	d := Load(Param("d"), GP64())
+	extra := Load(Param("extra"), GP64())
 	p := Load(Param("b").Base(), GP64())
 	n := Load(Param("b").Len(), GP64())
 
 	state := YMM()
 	VMOVDQU(Mem{Base: d, Disp: 0}, state)
 
-	p1, p2 := GP64(), GP64()
+	p1, p2, yprime1, yprime2 := GP64(), GP64(), YMM(), YMM()
 	MOVQ(Imm(prime1), p1)
+	VPBROADCASTQ(p1, yprime1)
 	MOVQ(Imm(prime2), p2)
+	VPBROADCASTQ(p2, yprime2)
+
+	TESTQ(extra, extra)
+	JZ(LabelRef("skip_extra"))
+	{
+		round(state, extra, yprime1, yprime2)
+	}
+	Label("skip_extra")
 
-	processed := GP64()
-	blockLoop(state, p, n, p1, p2, processed)
+	blockLoop(state, p, n, yprime1, yprime2)
 
 	VMOVDQU(state, Mem{Base: d, Disp: 0})
 	VZEROUPPER()
-
-	Store(processed, ReturnIndex(0))
 	RET()
 }
diff --git a/xxhash.go b/xxhash.go
index e3ed38f..dd188e9 100644
--- a/xxhash.go
+++ b/xxhash.go
@@ -30,7 +30,7 @@ type Digest struct {
 	s     [4]uint64
 	total uint64
 	mem   [32]byte
-	n     int // how much of mem is used
+	n     uint8 // how much of mem is used
 }
 
 // New creates a new Digest with a zero seed.
@@ -73,35 +73,32 @@ func (d *Digest) Write(b []byte) (n int, err error) {
 	n = len(b)
 	d.total += uint64(n)
 
-	memleft := d.mem[d.n&(len(d.mem)-1):]
-
-	if d.n+n < 32 {
-		// This new data doesn't even fill the current block.
-		copy(memleft, b)
-		d.n += n
-		return
-	}
-
-	if d.n > 0 {
-		// Finish off the partial block.
-		c := copy(memleft, b)
-		d.s[0] = round(d.s[0], u64(d.mem[0:8]))
-		d.s[1] = round(d.s[1], u64(d.mem[8:16]))
-		d.s[2] = round(d.s[2], u64(d.mem[16:24]))
-		d.s[3] = round(d.s[3], u64(d.mem[24:32]))
-		b = b[c:]
+	var extra *[32]byte
+	if d.n != 0 {
+		// There is already data in mem; append to it.
+		added := copy(d.mem[d.n:], b)
+		b = b[added:]
+		d.n += uint8(added)
+		if uint(d.n) < uint(len(d.mem)) {
+			// Still not a full block; nothing to hash yet.
+			return
+		}
+		extra = &d.mem
 		d.n = 0
 	}
 
 	if len(b) >= 32 {
 		// One or more full blocks left.
-		nw := writeBlocks(d, b)
-		b = b[nw:]
+		writeBlocks(d, extra, b)
+		b = b[uint(len(b))&^31:]
+	} else if extra != nil {
+		// b doesn't hold a full block, but extra does.
+		// writeBlocks must never be called with len(b) < 32, so pass extra as b.
+		writeBlocks(d, nil, extra[:])
 	}
 
 	// Store any remaining partial block.
-	copy(d.mem[:], b)
-	d.n = len(b)
+	d.n = uint8(copy(d.mem[:], b))
 
 	return
 }
@@ -139,7 +136,7 @@ func (d *Digest) Sum64() uint64 {
 	h += d.total
 
-	b := d.mem[:d.n&(len(d.mem)-1)]
+	b := d.mem[:d.n&uint8(len(d.mem)-1)]
 	for ; len(b) >= 8; b = b[8:] {
 		k1 := round(0, u64(b[:8]))
 		h ^= k1
@@ -179,7 +176,7 @@ func (d *Digest) MarshalBinary() ([]byte, error) {
 	b = appendUint64(b, d.s[3])
 	b = appendUint64(b, d.total)
 	b = append(b, d.mem[:d.n]...)
-	b = b[:len(b)+len(d.mem)-d.n]
+	b = b[:len(b)+len(d.mem)-int(d.n)]
 
 	return b, nil
 }
@@ -198,7 +195,7 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
 	b, d.s[3] = consumeUint64(b)
 	b, d.total = consumeUint64(b)
 	copy(d.mem[:], b)
-	d.n = int(d.total % uint64(len(d.mem)))
+	d.n = uint8(d.total % uint64(len(d.mem)))
 
 	return nil
 }
diff --git a/xxhash_amd64.s b/xxhash_amd64.s
index 7a60a92..3f00db5 100644
--- a/xxhash_amd64.s
+++ b/xxhash_amd64.s
@@ -40,11 +40,8 @@
 	IMULQ prime1, acc \
 	ADDQ  prime4, acc
 
-// blockLoop processes as many 32-byte blocks as possible,
-// updating v1, v2, v3, and v4. It assumes that there is at least one block
-// to process.
-#define blockLoop() \
-loop: \
+// round32 performs one 32-byte round over v1, v2, v3, and v4, loading from p.
+#define round32() \
 	MOVQ +0(p), x \
 	round(v1, x) \
 	MOVQ +8(p), x \
@@ -52,13 +49,20 @@ round(v2, x) \
 	MOVQ +16(p), x \
 	round(v3, x) \
 	MOVQ +24(p), x \
-	round(v4, x) \
-	ADDQ $32, p \
-	CMPQ p, end \
+	round(v4, x)
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that there is at least one block
+// to process.
+#define blockLoop() \
+loop: \
+	round32() \
+	ADDQ $32, p \
+	CMPQ p, end \
 	JLE loop
 
 // func sum64(b []byte) uint64
-TEXT ·sum64scallar(SB), NOSPLIT|NOFRAME, $0-32
+TEXT ·sum64Scalar(SB), NOSPLIT|NOFRAME, $0-32
 	// Load fixed primes.
 	MOVQ ·primes+0(SB), prime1
 	MOVQ ·primes+8(SB), prime2
@@ -173,18 +177,12 @@ finalize:
 	MOVQ h, ret+24(FP)
 	RET
 
-// func writeBlocksScallar(d *Digest, b []byte) int
-TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
+// func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
+TEXT ·writeBlocksScalar(SB), NOSPLIT|NOFRAME, $0-40
 	// Load fixed primes needed for round.
 	MOVQ ·primes+0(SB), prime1
 	MOVQ ·primes+8(SB), prime2
 
-	// Load slice.
-	MOVQ b_base+8(FP), p
-	MOVQ b_len+16(FP), n
-	LEAQ (p)(n*1), end
-	SUBQ $32, end
-
 	// Load vN from d.
 	MOVQ s+0(FP), d
 	MOVQ 0(d), v1
@@ -192,6 +190,19 @@ TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ 8(d), v2
 	MOVQ 16(d), v3
 	MOVQ 24(d), v4
 
+	// Handle the extra block, if any.
+	MOVQ extra+8(FP), p
+	TESTQ p, p
+	JZ noExtra
+	round32()
+noExtra:
+
+	// Load slice.
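+	// (FP offsets for b shifted by 8 to make room for the extra pointer.)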
+	MOVQ b_base+16(FP), p
+	MOVQ b_len+24(FP), n
+	LEAQ (p)(n*1), end
+	SUBQ $32, end
+
 	// We don't need to check the loop condition here; this function is
 	// always called with at least one block of data to process.
 	blockLoop()
@@ -202,8 +213,4 @@ TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ v3, 16(d)
 	MOVQ v4, 24(d)
 
-	// The number of bytes written is p minus the old base pointer.
-	SUBQ b_base+8(FP), p
-	MOVQ p, ret+32(FP)
-
 	RET
diff --git a/xxhash_arm64.s b/xxhash_arm64.s
index 7e3145a..f03a3e5 100644
--- a/xxhash_arm64.s
+++ b/xxhash_arm64.s
@@ -161,8 +161,8 @@ finalize:
 	MOVD h, ret+24(FP)
 	RET
 
-// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+// func writeBlocksArm64(d *Digest, b []byte)
+TEXT ·writeBlocksArm64(SB), NOSPLIT|NOFRAME, $0-32
 	LDP ·primes+0(SB), (prime1, prime2)
 
 	// Load state. Assume v[1-4] are stored contiguously.
@@ -178,6 +178,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 	STP (v1, v2), 0(digest)
 	STP (v3, v4), 16(digest)
 
-	BIC  $31, n
-	MOVD n, ret+32(FP)
 	RET
diff --git a/xxhash_asm_amd64.go b/xxhash_asm_amd64.go
index 898b9b3..a0faa63 100644
--- a/xxhash_asm_amd64.go
+++ b/xxhash_asm_amd64.go
@@ -12,26 +12,28 @@ var useAvx512 = cpuid.CPU.Supports(cpuid.AVX, cpuid.AVX2, cpuid.AVX512DQ, cpuid.
 // Sum64 computes the 64-bit xxHash digest of b with a zero seed.
 func Sum64(b []byte) uint64 {
 	if useAvx512 {
-		return sum64avx512(b)
+		return sum64Avx512(b)
 	}
-	return sum64scallar(b)
+	return sum64Scalar(b)
 }
 
 //go:noescape
-func sum64scallar(b []byte) uint64
+func sum64Scalar(b []byte) uint64
 
 //go:noescape
-func sum64avx512(b []byte) uint64
+func sum64Avx512(b []byte) uint64
 
-func writeBlocks(d *Digest, b []byte) int {
+// extra is an optional first block hashed before b; if nil it is skipped.
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
 	if useAvx512 {
-		return writeBlocksAvx512(&d.s, b)
+		writeBlocksAvx512(&d.s, extra, b)
+		return
 	}
-	return writeBlocksScallar(d, b)
+	writeBlocksScalar(d, extra, b)
 }
 
 //go:noescape
-func writeBlocksAvx512(d *[4]uint64, b []byte) int
+func writeBlocksAvx512(d *[4]uint64, extra *[32]byte, b []byte)
 
 //go:noescape
-func writeBlocksScallar(d *Digest, b []byte) int
+func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
diff --git a/xxhash_asm_arm64.go b/xxhash_asm_arm64.go
index ede1693..06e89a0 100644
--- a/xxhash_asm_arm64.go
+++ b/xxhash_asm_arm64.go
@@ -10,5 +10,17 @@ var useAvx512 = false
 //go:noescape
 func Sum64(b []byte) uint64
 
+// extra is an optional first block hashed before b; if nil it is skipped.
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
+	if extra != nil {
+		// FIXME: handle this logic in ASM; *someone* was lazy and didn't
+		// care to learn the arm64 Plan 9 syntax.
+		// At least this is hopefully on par with how fast the pure-Go
+		// implementation used to be.
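+		// (extra is exactly one full 32-byte block: a single pass through the block loop.)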
+		writeBlocksArm64(d, extra[:])
+	}
+	writeBlocksArm64(d, b)
+}
+
 //go:noescape
-func writeBlocks(d *Digest, b []byte) int
+func writeBlocksArm64(d *Digest, b []byte)
diff --git a/xxhash_avx512_amd64.s b/xxhash_avx512_amd64.s
index 7e0665a..c779a24 100644
--- a/xxhash_avx512_amd64.s
+++ b/xxhash_avx512_amd64.s
@@ -8,9 +8,9 @@ DATA ·initWideAvx512<>+16(SB)/8, $0x0000000000000000
 DATA ·initWideAvx512<>+24(SB)/8, $0x61c8864e7a143579
 GLOBL ·initWideAvx512<>(SB), NOSPLIT|NOPTR, $32
 
-// func sum64avx512(b []byte) uint64
+// func sum64Avx512(b []byte) uint64
 // Requires: AVX, AVX2, AVX512DQ, AVX512F, AVX512VL, BMI
-TEXT ·sum64avx512(SB), NOSPLIT|NOFRAME, $0-32
+TEXT ·sum64Avx512(SB), NOSPLIT|NOFRAME, $0-32
 	MOVQ b_base+0(FP), AX
 	MOVQ b_len+8(FP), CX
 	MOVQ $0x9e3779b185ebca87, DX
@@ -28,19 +28,19 @@ TEXT ·sum64Avx512(SB), NOSPLIT|NOFRAME, $0-32
 	CMPQ CX, $0x1f
 	JBE loop_8
 	VMOVDQU ·initWideAvx512<>+0(SB), Y0
+	VPBROADCASTQ DX, Y1
+	VPBROADCASTQ BX, Y2
 	MOVL $0x0000001f, R9
 	ANDNQ CX, R9, R9
 	ADDQ AX, R9
-	VPBROADCASTQ DX, Y1
-	VPBROADCASTQ BX, Y2
 
 loop_32:
 	VMOVDQU (AX), Y3
-	ADDQ $0x20, AX
 	VPMULLQ Y3, Y2, Y3
 	VPADDQ Y3, Y0, Y0
 	VPROLQ $0x1f, Y0, Y0
 	VPMULLQ Y0, Y1, Y0
+	ADDQ $0x20, AX
 	CMPQ AX, R9
 	JNE loop_32
 	VMOVQ X0, R10
@@ -151,32 +151,40 @@ zero:
 	MOVQ R9, ret+24(FP)
 	RET
 
-// func writeBlocksAvx512(d *[4]uint64, b []byte) int
+// func writeBlocksAvx512(d *[4]uint64, extra *[32]byte, b []byte)
 // Requires: AVX, AVX2, AVX512DQ, AVX512F, AVX512VL, BMI
 TEXT ·writeBlocksAvx512(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ d+0(FP), AX
-	MOVQ b_base+8(FP), CX
-	MOVQ b_len+16(FP), DX
+	MOVQ extra+8(FP), CX
+	MOVQ b_base+16(FP), DX
+	MOVQ b_len+24(FP), BX
 	VMOVDQU (AX), Y0
-	MOVQ $0x9e3779b185ebca87, BX
+	MOVQ $0x9e3779b185ebca87, SI
+	VPBROADCASTQ SI, Y1
 	MOVQ $0xc2b2ae3d27d4eb4f, SI
-	MOVL $0x0000001f, DI
-	ANDNQ DX, DI, DI
-	MOVQ DI, DX
-	ADDQ CX, DI
-	VPBROADCASTQ BX, Y1
 	VPBROADCASTQ SI, Y2
+	TESTQ CX, CX
+	JZ skip_extra
+	VMOVDQU (CX), Y3
+	VPMULLQ Y3, Y2, Y3
+	VPADDQ Y3, Y0, Y0
+	VPROLQ $0x1f, Y0, Y0
+	VPMULLQ Y0, Y1, Y0
+
+skip_extra:
+	MOVL $0x0000001f, CX
+	ANDNQ BX, CX, CX
+	ADDQ DX, CX
 
loop_32:
-	VMOVDQU (CX), Y3
-	ADDQ $0x20, CX
+	VMOVDQU (DX), Y3
 	VPMULLQ Y3, Y2, Y3
 	VPADDQ Y3, Y0, Y0
 	VPROLQ $0x1f, Y0, Y0
 	VPMULLQ Y0, Y1, Y0
-	CMPQ CX, DI
+	ADDQ $0x20, DX
+	CMPQ DX, CX
 	JNE loop_32
 	VMOVDQU Y0, (AX)
 	VZEROUPPER
-	MOVQ DX, ret+32(FP)
 	RET
diff --git a/xxhash_other.go b/xxhash_other.go
index ca3407f..b4a536f 100644
--- a/xxhash_other.go
+++ b/xxhash_other.go
@@ -63,16 +63,26 @@ func Sum64(b []byte) uint64 {
 	return h
 }
 
-func writeBlocks(d *Digest, b []byte) int {
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
 	v1, v2, v3, v4 := d.s[0], d.s[1], d.s[2], d.s[3]
-	n := len(b)
-	for len(b) >= 32 {
-		v1 = round(v1, u64(b[0:8:len(b)]))
-		v2 = round(v2, u64(b[8:16:len(b)]))
-		v3 = round(v3, u64(b[16:24:len(b)]))
-		v4 = round(v4, u64(b[24:32:len(b)]))
-		b = b[32:len(b):len(b)]
+	var s []byte // hash extra first if present, then b
+	if extra != nil {
+		s = extra[:]
+	} else {
+		s = b
+	}
+	for len(s) >= 32 { // outer loop runs at most twice: once over extra, once over b
+		for len(s) >= 32 {
+			v1 = round(v1, u64(s[0:8:len(s)]))
+			v2 = round(v2, u64(s[8:16:len(s)]))
+			v3 = round(v3, u64(s[16:24:len(s)]))
+			v4 = round(v4, u64(s[24:32:len(s)]))
+			s = s[32:len(s):len(s)]
+		}
+		if extra != nil {
+			s = b
+			extra = nil
+		}
 	}
 	d.s[0], d.s[1], d.s[2], d.s[3] = v1, v2, v3, v4
-	return n - len(b)
 }