diff --git a/gen/avx512.go b/gen/avx512.go
index 3689c33..68058a6 100644
--- a/gen/avx512.go
+++ b/gen/avx512.go
@@ -28,32 +28,29 @@ func mergeRound(h /*inout*/, v /*in-destroy*/, p1, p2, p4 reg.GPVirtual) {
 	ADDQ(p4, h)
 }
 
+func round(state /*inout*/, p, yprime1, yprime2 reg.Register) {
+	temp := YMM()
+	VMOVDQU(Mem{Base: p}, temp)
+	VPMULLQ(temp, yprime2, temp)
+	VPADDQ(temp, state, state)
+	VPROLQ(Imm(31), state, state)
+	VPMULLQ(state, yprime1, state)
+}
+
 // blockLoop handles 32 bytes at a time in one YMM register.
+// It assumes n is 32 bytes or more.
 // state represent v1, v2, v3, v4 as 4 × uint64.
-func blockLoop(state /*inout*/, p /*inout*/, n, p1, p2, processed /*out-optional*/ reg.Register) {
+func blockLoop(state /*inout*/, p /*inout*/, n, yprime1, yprime2 reg.Register) {
 	endp := GP64()
 	MOVL(U32(31), endp.As32())
 	ANDNQ(n, endp, endp)
-	if processed != nil {
-		MOVQ(endp, processed)
-	}
 	ADDQ(p, endp)
 
-	yprime1 := YMM()
-	VPBROADCASTQ(p1, yprime1)
-	yprime2 := YMM()
-	VPBROADCASTQ(p2, yprime2)
-
 	Label("loop_32")
 	{ // main block loop
-		temp := YMM()
-		VMOVDQU(Mem{Base: p}, temp)
+		round(state, p, yprime1, yprime2)
 		ADDQ(Imm(32), p)
-		VPMULLQ(temp, yprime2, temp)
-		VPADDQ(temp, state, state)
-		VPROLQ(Imm(31), state, state)
-		VPMULLQ(state, yprime1, state)
 
 		CMPQ(p, endp)
 		JNE(LabelRef("loop_32"))
@@ -68,7 +65,7 @@ func sum64() {
 	DATA(16, U64(0))
 	DATA(24, U64(-prime1))
 
-	TEXT("sum64avx512", NOSPLIT|NOFRAME, "func(b []byte) uint64")
+	TEXT("sum64Avx512", NOSPLIT|NOFRAME, "func(b []byte) uint64")
 	p := Load(Param("b").Base(), GP64())
 	n := Load(Param("b").Len(), GP64())
 
@@ -95,7 +92,11 @@ func sum64() {
 	state := YMM()
 	VMOVDQU(initStateAvx512, state)
 
-	blockLoop(state, p, n, p1, p2, nil)
+	yprime1, yprime2 := YMM(), YMM()
+	VPBROADCASTQ(p1, yprime1)
+	VPBROADCASTQ(p2, yprime2)
+
+	blockLoop(state, p, n, yprime1, yprime2)
 
 	// This interleave two things: extracting v1,2,3,4 from state and computing h.
 	v1, v2, v3, v4, temp := GP64(), GP64(), GP64(), GP64(), GP64()
@@ -212,24 +213,31 @@ func sum64() {
 }
 
 func writeBlocks() {
-	TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, b []byte) int")
+	TEXT("writeBlocksAvx512", NOSPLIT|NOFRAME, "func(d *[4]uint64, extra *[32]byte, b []byte)")
 	d := Load(Param("d"), GP64())
+	extra := Load(Param("extra"), GP64())
 	p := Load(Param("b").Base(), GP64())
 	n := Load(Param("b").Len(), GP64())
 
 	state := YMM()
 	VMOVDQU(Mem{Base: d, Disp: 0}, state)
 
-	p1, p2 := GP64(), GP64()
+	p1, p2, yprime1, yprime2 := GP64(), GP64(), YMM(), YMM()
 	MOVQ(Imm(prime1), p1)
+	VPBROADCASTQ(p1, yprime1)
 	MOVQ(Imm(prime2), p2)
+	VPBROADCASTQ(p2, yprime2)
+
+	TESTQ(extra, extra)
+	JZ(LabelRef("skip_extra"))
+	{
+		round(state, extra, yprime1, yprime2)
+	}
+	Label("skip_extra")
 
-	processed := GP64()
-	blockLoop(state, p, n, p1, p2, processed)
+	blockLoop(state, p, n, yprime1, yprime2)
 
 	VMOVDQU(state, Mem{Base: d, Disp: 0})
 	VZEROUPPER()
-
-	Store(processed, ReturnIndex(0))
 	RET()
 }
diff --git a/xxhash.go b/xxhash.go
index e3ed38f..dd188e9 100644
--- a/xxhash.go
+++ b/xxhash.go
@@ -30,7 +30,7 @@ type Digest struct {
 	s     [4]uint64
 	total uint64
 	mem   [32]byte
-	n     int // how much of mem is used
+	n     uint8 // how much of mem is used
 }
 
 // New creates a new Digest with a zero seed.
@@ -73,35 +73,32 @@ func (d *Digest) Write(b []byte) (n int, err error) {
 	n = len(b)
 	d.total += uint64(n)
 
-	memleft := d.mem[d.n&(len(d.mem)-1):]
-
-	if d.n+n < 32 {
-		// This new data doesn't even fill the current block.
-		copy(memleft, b)
-		d.n += n
-		return
-	}
-
-	if d.n > 0 {
-		// Finish off the partial block.
-		c := copy(memleft, b)
-		d.s[0] = round(d.s[0], u64(d.mem[0:8]))
-		d.s[1] = round(d.s[1], u64(d.mem[8:16]))
-		d.s[2] = round(d.s[2], u64(d.mem[16:24]))
-		d.s[3] = round(d.s[3], u64(d.mem[24:32]))
-		b = b[c:]
+	var extra *[32]byte
+	if d.n != 0 {
+		// There is already data in mem; append to it.
+		added := copy(d.mem[d.n:], b)
+		b = b[added:]
+		d.n += uint8(added)
+		if uint(d.n) < uint(len(d.mem)) {
+			// Still not a full block; nothing to hash yet.
+			return
+		}
+		extra = &d.mem
 		d.n = 0
 	}
 
 	if len(b) >= 32 {
 		// One or more full blocks left.
-		nw := writeBlocks(d, b)
-		b = b[nw:]
+		writeBlocks(d, extra, b)
+		b = b[uint(len(b))&^31:]
+	} else if extra != nil {
+		// b doesn't hold a full block, but extra does.
+		// writeBlocks must never be called with len(b) < 32, so pass extra as b.
+		writeBlocks(d, nil, extra[:])
 	}
 
 	// Store any remaining partial block.
-	copy(d.mem[:], b)
-	d.n = len(b)
+	d.n = uint8(copy(d.mem[:], b))
 
 	return
 }
@@ -139,7 +136,7 @@ func (d *Digest) Sum64() uint64 {
 	h += d.total
 
-	b := d.mem[:d.n&(len(d.mem)-1)]
+	b := d.mem[:d.n&uint8(len(d.mem)-1)]
 	for ; len(b) >= 8; b = b[8:] {
 		k1 := round(0, u64(b[:8]))
 		h ^= k1
@@ -179,7 +176,7 @@ func (d *Digest) MarshalBinary() ([]byte, error) {
 	b = appendUint64(b, d.s[3])
 	b = appendUint64(b, d.total)
 	b = append(b, d.mem[:d.n]...)
-	b = b[:len(b)+len(d.mem)-d.n]
+	b = b[:len(b)+len(d.mem)-int(d.n)]
 
 	return b, nil
 }
@@ -198,7 +195,7 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
 	b, d.s[3] = consumeUint64(b)
 	b, d.total = consumeUint64(b)
 	copy(d.mem[:], b)
-	d.n = int(d.total % uint64(len(d.mem)))
+	d.n = uint8(d.total % uint64(len(d.mem)))
 
 	return nil
 }
diff --git a/xxhash_amd64.s b/xxhash_amd64.s
index 7a60a92..3f00db5 100644
--- a/xxhash_amd64.s
+++ b/xxhash_amd64.s
@@ -40,11 +40,8 @@
 	IMULQ prime1, acc \
 	ADDQ  prime4, acc
 
-// blockLoop processes as many 32-byte blocks as possible,
-// updating v1, v2, v3, and v4. It assumes that there is at least one block
-// to process.
-#define blockLoop() \
-loop: \
+// round32 performs one 32-byte round over v1, v2, v3, and v4, loading from p.
+#define round32() \
 	MOVQ +0(p), x \
 	round(v1, x) \
 	MOVQ +8(p), x \
@@ -52,13 +49,20 @@ round(v2, x) \
 	MOVQ +16(p), x \
 	round(v3, x) \
 	MOVQ +24(p), x \
-	round(v4, x) \
-	ADDQ $32, p \
-	CMPQ p, end \
+	round(v4, x)
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that there is at least one block
+// to process.
+#define blockLoop() \
+loop: \
+	round32() \
+	ADDQ $32, p \
+	CMPQ p, end \
 	JLE loop
 
 // func sum64(b []byte) uint64
-TEXT ·sum64scallar(SB), NOSPLIT|NOFRAME, $0-32
+TEXT ·sum64Scalar(SB), NOSPLIT|NOFRAME, $0-32
 	// Load fixed primes.
 	MOVQ ·primes+0(SB), prime1
 	MOVQ ·primes+8(SB), prime2
@@ -173,18 +177,12 @@ finalize:
 	MOVQ h, ret+24(FP)
 	RET
 
-// func writeBlocksScallar(d *Digest, b []byte) int
-TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
+// func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
+TEXT ·writeBlocksScalar(SB), NOSPLIT|NOFRAME, $0-40
 	// Load fixed primes needed for round.
 	MOVQ ·primes+0(SB), prime1
 	MOVQ ·primes+8(SB), prime2
 
-	// Load slice.
-	MOVQ b_base+8(FP), p
-	MOVQ b_len+16(FP), n
-	LEAQ (p)(n*1), end
-	SUBQ $32, end
-
 	// Load vN from d.
 	MOVQ s+0(FP), d
 	MOVQ 0(d), v1
@@ -192,6 +190,19 @@ TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ 8(d), v2
 	MOVQ 16(d), v3
 	MOVQ 24(d), v4
 
+	// Handle the extra block, if any.
+	MOVQ extra+8(FP), p
+	TESTQ p, p
+	JZ noExtra
+	round32()
+noExtra:
+
+	// Load slice.
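+	// (FP offsets for b shifted by 8 to make room for the extra pointer.)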
+	MOVQ b_base+16(FP), p
+	MOVQ b_len+24(FP), n
+	LEAQ (p)(n*1), end
+	SUBQ $32, end
+
 	// We don't need to check the loop condition here; this function is
 	// always called with at least one block of data to process.
 	blockLoop()
@@ -202,8 +213,4 @@ TEXT ·writeBlocksScallar(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ v3, 16(d)
 	MOVQ v4, 24(d)
 
-	// The number of bytes written is p minus the old base pointer.
-	SUBQ b_base+8(FP), p
-	MOVQ p, ret+32(FP)
-
 	RET
diff --git a/xxhash_arm64.s b/xxhash_arm64.s
index 7e3145a..f03a3e5 100644
--- a/xxhash_arm64.s
+++ b/xxhash_arm64.s
@@ -161,8 +161,8 @@ finalize:
 	MOVD h, ret+24(FP)
 	RET
 
-// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+// func writeBlocksArm64(d *Digest, b []byte)
+TEXT ·writeBlocksArm64(SB), NOSPLIT|NOFRAME, $0-32
 	LDP ·primes+0(SB), (prime1, prime2)
 
 	// Load state. Assume v[1-4] are stored contiguously.
@@ -178,6 +178,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 	STP (v1, v2), 0(digest)
 	STP (v3, v4), 16(digest)
 
-	BIC  $31, n
-	MOVD n, ret+32(FP)
 	RET
diff --git a/xxhash_asm_amd64.go b/xxhash_asm_amd64.go
index 898b9b3..a0faa63 100644
--- a/xxhash_asm_amd64.go
+++ b/xxhash_asm_amd64.go
@@ -12,26 +12,28 @@ var useAvx512 = cpuid.CPU.Supports(cpuid.AVX, cpuid.AVX2, cpuid.AVX512DQ, cpuid.
 // Sum64 computes the 64-bit xxHash digest of b with a zero seed.
 func Sum64(b []byte) uint64 {
 	if useAvx512 {
-		return sum64avx512(b)
+		return sum64Avx512(b)
 	}
-	return sum64scallar(b)
+	return sum64Scalar(b)
 }
 
 //go:noescape
-func sum64scallar(b []byte) uint64
+func sum64Scalar(b []byte) uint64
 
 //go:noescape
-func sum64avx512(b []byte) uint64
+func sum64Avx512(b []byte) uint64
 
-func writeBlocks(d *Digest, b []byte) int {
+// extra is an optional first block hashed before b; if nil it is skipped.
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
 	if useAvx512 {
-		return writeBlocksAvx512(&d.s, b)
+		writeBlocksAvx512(&d.s, extra, b)
+		return
 	}
-	return writeBlocksScallar(d, b)
+	writeBlocksScalar(d, extra, b)
 }
 
 //go:noescape
-func writeBlocksAvx512(d *[4]uint64, b []byte) int
+func writeBlocksAvx512(d *[4]uint64, extra *[32]byte, b []byte)
 
 //go:noescape
-func writeBlocksScallar(d *Digest, b []byte) int
+func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
diff --git a/xxhash_asm_arm64.go b/xxhash_asm_arm64.go
index ede1693..06e89a0 100644
--- a/xxhash_asm_arm64.go
+++ b/xxhash_asm_arm64.go
@@ -10,5 +10,17 @@ var useAvx512 = false
 //go:noescape
 func Sum64(b []byte) uint64
 
+// extra is an optional first block hashed before b; if nil it is skipped.
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
+	if extra != nil {
+		// FIXME: handle this logic in ASM; *someone* was lazy and didn't
+		// care to learn the arm64 Plan 9 syntax.
+		// At least this is hopefully on par with how fast the pure-Go
+		// implementation used to be.
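+		// (extra is exactly one full 32-byte block: a single pass through the block loop.)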
+		writeBlocksArm64(d, extra[:])
+	}
+	writeBlocksArm64(d, b)
+}
+
 //go:noescape
-func writeBlocks(d *Digest, b []byte) int
+func writeBlocksArm64(d *Digest, b []byte)
diff --git a/xxhash_avx512_amd64.s b/xxhash_avx512_amd64.s
index 7e0665a..c779a24 100644
--- a/xxhash_avx512_amd64.s
+++ b/xxhash_avx512_amd64.s
@@ -8,9 +8,9 @@ DATA ·initWideAvx512<>+16(SB)/8, $0x0000000000000000
 DATA ·initWideAvx512<>+24(SB)/8, $0x61c8864e7a143579
 GLOBL ·initWideAvx512<>(SB), NOSPLIT|NOPTR, $32
 
-// func sum64avx512(b []byte) uint64
+// func sum64Avx512(b []byte) uint64
 // Requires: AVX, AVX2, AVX512DQ, AVX512F, AVX512VL, BMI
-TEXT ·sum64avx512(SB), NOSPLIT|NOFRAME, $0-32
+TEXT ·sum64Avx512(SB), NOSPLIT|NOFRAME, $0-32
 	MOVQ b_base+0(FP), AX
 	MOVQ b_len+8(FP), CX
 	MOVQ $0x9e3779b185ebca87, DX
@@ -28,19 +28,19 @@ TEXT ·sum64Avx512(SB), NOSPLIT|NOFRAME, $0-32
 	CMPQ CX, $0x1f
 	JBE loop_8
 	VMOVDQU ·initWideAvx512<>+0(SB), Y0
+	VPBROADCASTQ DX, Y1
+	VPBROADCASTQ BX, Y2
 	MOVL $0x0000001f, R9
 	ANDNQ CX, R9, R9
 	ADDQ AX, R9
-	VPBROADCASTQ DX, Y1
-	VPBROADCASTQ BX, Y2
 
 loop_32:
 	VMOVDQU (AX), Y3
-	ADDQ $0x20, AX
 	VPMULLQ Y3, Y2, Y3
 	VPADDQ Y3, Y0, Y0
 	VPROLQ $0x1f, Y0, Y0
 	VPMULLQ Y0, Y1, Y0
+	ADDQ $0x20, AX
 	CMPQ AX, R9
 	JNE loop_32
 	VMOVQ X0, R10
@@ -151,32 +151,40 @@ zero:
 	MOVQ R9, ret+24(FP)
 	RET
 
-// func writeBlocksAvx512(d *[4]uint64, b []byte) int
+// func writeBlocksAvx512(d *[4]uint64, extra *[32]byte, b []byte)
 // Requires: AVX, AVX2, AVX512DQ, AVX512F, AVX512VL, BMI
 TEXT ·writeBlocksAvx512(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ d+0(FP), AX
-	MOVQ b_base+8(FP), CX
-	MOVQ b_len+16(FP), DX
+	MOVQ extra+8(FP), CX
+	MOVQ b_base+16(FP), DX
+	MOVQ b_len+24(FP), BX
 	VMOVDQU (AX), Y0
-	MOVQ $0x9e3779b185ebca87, BX
+	MOVQ $0x9e3779b185ebca87, SI
+	VPBROADCASTQ SI, Y1
 	MOVQ $0xc2b2ae3d27d4eb4f, SI
-	MOVL $0x0000001f, DI
-	ANDNQ DX, DI, DI
-	MOVQ DI, DX
-	ADDQ CX, DI
-	VPBROADCASTQ BX, Y1
 	VPBROADCASTQ SI, Y2
+	TESTQ CX, CX
+	JZ skip_extra
+	VMOVDQU (CX), Y3
+	VPMULLQ Y3, Y2, Y3
+	VPADDQ Y3, Y0, Y0
+	VPROLQ $0x1f, Y0, Y0
+	VPMULLQ Y0, Y1, Y0
+
+skip_extra:
+	MOVL $0x0000001f, CX
+	ANDNQ BX, CX, CX
+	ADDQ DX, CX
 
loop_32:
-	VMOVDQU (CX), Y3
-	ADDQ $0x20, CX
+	VMOVDQU (DX), Y3
 	VPMULLQ Y3, Y2, Y3
 	VPADDQ Y3, Y0, Y0
 	VPROLQ $0x1f, Y0, Y0
 	VPMULLQ Y0, Y1, Y0
-	CMPQ CX, DI
+	ADDQ $0x20, DX
+	CMPQ DX, CX
 	JNE loop_32
 	VMOVDQU Y0, (AX)
 	VZEROUPPER
-	MOVQ DX, ret+32(FP)
 	RET
diff --git a/xxhash_other.go b/xxhash_other.go
index ca3407f..b4a536f 100644
--- a/xxhash_other.go
+++ b/xxhash_other.go
@@ -63,16 +63,26 @@ func Sum64(b []byte) uint64 {
 	return h
 }
 
-func writeBlocks(d *Digest, b []byte) int {
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
 	v1, v2, v3, v4 := d.s[0], d.s[1], d.s[2], d.s[3]
-	n := len(b)
-	for len(b) >= 32 {
-		v1 = round(v1, u64(b[0:8:len(b)]))
-		v2 = round(v2, u64(b[8:16:len(b)]))
-		v3 = round(v3, u64(b[16:24:len(b)]))
-		v4 = round(v4, u64(b[24:32:len(b)]))
-		b = b[32:len(b):len(b)]
+	var s []byte // hash extra first if present, then b
+	if extra != nil {
+		s = extra[:]
+	} else {
+		s = b
+	}
+	for len(s) >= 32 { // outer loop runs at most twice: once over extra, once over b
+		for len(s) >= 32 {
+			v1 = round(v1, u64(s[0:8:len(s)]))
+			v2 = round(v2, u64(s[8:16:len(s)]))
+			v3 = round(v3, u64(s[16:24:len(s)]))
+			v4 = round(v4, u64(s[24:32:len(s)]))
+			s = s[32:len(s):len(s)]
+		}
+		if extra != nil {
+			s = b
+			extra = nil
+		}
 	}
 	d.s[0], d.s[1], d.s[2], d.s[3] = v1, v2, v3, v4
-	return n - len(b)
 }