-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
varint: experiments in bit tricks for varint decoding
- Loading branch information
Showing
4 changed files
with
531 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
This directory contains experiments with varint decoding using | ||
hand-coded assembly. | ||
|
||
The simple assembly loop is 15–30% faster than the Go loop. The loop | ||
is somewhat clever, but in principle the compiler could probably | ||
produce this code. | ||
|
||
Most of the code experiments with BMI2 instructions. This requires | ||
Haswell or newer, which the benchmark will detect. This approach is | ||
constant time up to 8 byte varints (56 bit values). It's 50% faster | ||
than the Go code for 8 byte varints, but 80% slower for one byte | ||
varints. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,295 @@ | ||
// Copyright 2016 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
#include "textflag.h" | ||
|
||
// hasBMI2 is a one-byte, pointer-free flag. The PEXT-based decoders in
// this file load it and fall back to their byte-at-a-time path when it
// is zero. Nothing in this file stores to it.
// NOTE(review): presumably Go code sets it from queryBMI2 - confirm.
GLOBL ·hasBMI2(SB), NOPTR, $1
|
||
// queryBMI2 stores 1 in its one-byte result when CPUID reports both BMI1
// and BMI2, and 0 otherwise. (The Go prototype is declared elsewhere; the
// $0-1 frame and ret+0(FP) imply a single one-byte result.)
//
// BMI1 is required as well as BMI2 because the fast paths gated by the
// resulting flag also execute BMI1 instructions (BEXTR, BLSMSK).
//
// Clobbers AX, BX, CX, DX (all written by CPUID) and the flags.
TEXT ·queryBMI2(SB),NOSPLIT,$0-1
	// CPUID leaf 0 returns the highest supported basic leaf in EAX.
	// Querying leaf 7 on a CPU that lacks it can return unrelated
	// data, so check that the leaf exists first.
	XORL	AX, AX
	CPUID
	CMPL	AX, $7
	JCS	unsupported	// Unsigned: max leaf < 7

	// Leaf 7, sub-leaf 0: structured extended feature flags.
	MOVL	$7, AX
	XORL	CX, CX
	CPUID
	// EBX bit 3 indicates BMI1, bit 8 indicates BMI2. Require both.
	ANDL	$0x108, BX
	CMPL	BX, $0x108
	SETEQ	ret+0(FP)
	RET

unsupported:
	MOVB	$0, ret+0(FP)
	RET
|
||
// decodeVarintAsmLoop decodes one varint from the front of buf, one
// byte per iteration.
//
// Frame layout as used below: buf base at 0(FP), buf len at 8(FP),
// decoded value x at 24(FP), bytes consumed n at 32(FP).
//
// Each byte contributes its low 7 bits, least-significant group first;
// a byte with bit 7 clear ends the varint. If no such byte is found
// within the first min(len, 10) bytes, both results are set to zero.
//
// Registers: BX = buf base, AX = min(len, 10), SI = bytes consumed,
// CX = current bit shift, DX = accumulated value, DI = current byte.
TEXT ·decodeVarintAsmLoop(SB),NOSPLIT,$0-40
	MOVQ	buf_len+8(FP), AX
	MOVQ	buf_base+0(FP), BX

	// Clamp the number of bytes we are willing to read to 10.
	MOVL	$10, CX
	CMPQ	AX, CX
	CMOVLGT	CX, AX

	XORL	DX, DX			// value = 0
	XORL	CX, CX			// shift = 0
	XORL	SI, SI			// consumed = 0

next:
	CMPL	SI, AX
	JEQ	fail			// Ran out of bytes (or hit the 10-byte cap)

	MOVBLZX	(BX)(SI*1), DI		// DI = buf[consumed]
	INCL	SI
	// BTRL copies bit 7 into the carry flag and clears it in DI, so
	// one instruction both tests the continuation bit and strips it.
	BTRL	$7, DI
	JNC	done			// Continuation bit clear: final byte
	SHLQ	CL, DI
	ORQ	DI, DX			// value |= payload << shift
	ADDL	$7, CX
	JMP	next

done:
	// Fold in the final byte's payload and return (value, consumed).
	SHLQ	CL, DI
	ORQ	DI, DX
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

fail:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET
|
||
// decodeVarintAsmBMI2 uses the BMI2 PEXT instruction to extract 7
// bits from each byte in one instruction.
//
// Frame layout as used below: buf base at 0(FP), buf len at 8(FP),
// decoded value x at 24(FP), bytes consumed n at 32(FP).
//
// The fast path is taken only when hasBMI2 is non-zero, at least 8
// bytes are available, and the varint ends within those 8 bytes.
// Everything else goes through the byte-at-a-time slow path, which
// sets both results to zero if no terminating byte (bit 7 clear) is
// found within the first min(len, 10) bytes.
TEXT ·decodeVarintAsmBMI2(SB),NOSPLIT,$0-40
	MOVQ	buf_base+0(FP), BX
	MOVQ	buf_len+8(FP), CX

	// Take the slow path if there's no BMI2 or there are fewer
	// than 8 bytes available.
	MOVBLZX	·hasBMI2(SB), AX
	TESTB	AL, AL
	JEQ	slowpath
	CMPQ	CX, $8
	JLT	slowpath

	// Load 8 bytes from buf.
	MOVQ	(BX), AX

	// Extract the continuation bits (bit 7 of each byte) into BX.
	// This goes through an SSE register rather than an MMX one:
	// writing an MMX register marks the whole x87 register stack as
	// in use, and nothing here would issue EMMS to release it.
	// MOVQ zeroes the upper 8 bytes of X0, so mask bits 8-15 are 0
	// and the inverted mask below is the same as with the MMX form.
	MOVQ	AX, X0
	PMOVMSKB	X0, BX
	// Compute byte length - 1 of varint into BX: the index of the
	// first byte whose continuation bit is clear.
	NOTL	BX
	BSFL	BX, BX
	// If it's more than 8 bytes, take the slow path.
	CMPL	BX, $8
	JGE	slowpath
	// Extract the relevant bytes from the input.
	INCL	BX			// BX = byte length (1..8)
	MOVQ	BX, CX
	SHLQ	$(3+8), CX		// CX[15:8] = (byte len * 8); CX[7:0] = 0
	BEXTRQ	CX, AX, AX		// Requires BMI1
	// Extract the low 7 bits from each byte of the input.
	MOVQ	$0x7f7f7f7f7f7f7f7f, DI
	PEXTQ	DI, AX, DX		// Requires BMI2
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	BX, n+32(FP)
	RET

slowpath:
	// Consume buffer one byte at a time.
	// TODO: Could merge with some of the above registers better.
	MOVQ	buf_base+0(FP), BX	// Pointer
	MOVQ	buf_len+8(FP), AX	// Length
	MOVQ	$10, CX
	CMPQ	AX, CX
	CMOVQGT	CX, AX			// Length is at most 10
	XORQ	SI, SI			// Index
	XORQ	CX, CX			// Shift
	XORQ	DX, DX			// Value

loop:
	CMPQ	SI, AX
	JEQ	bad			// Reached end of buffer or >10 bytes

	MOVBLZX	(SI)(BX*1), DI		// Load next byte
	INCQ	SI
	BTRL	$7, DI			// Is bit 7 set? Clear bit 7.
	JNC	last			// If not set, this is the final byte
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	ADDQ	$7, CX			// shift += 7
	JMP	loop

last:
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

bad:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET
|
||
// The two functions below, decodeVarintAsm1 and decodeVarintAsm2, also
// use PEXT, but use different tricks to extract the length and set up
// the mask. They turned out to be slower than decodeVarintAsmBMI2
// above, but are historically interesting.
|
||
// extract<> is a read-only table of eight 8-byte PEXT masks. Entry i
// has 0x7f in each of its low i+1 bytes and zero above, so it selects
// the 7 payload bits of the first i+1 bytes of a 64-bit load.
DATA extract<>+0(SB)/8,  $0x000000000000007f
DATA extract<>+8(SB)/8,  $0x0000000000007f7f
DATA extract<>+16(SB)/8, $0x00000000007f7f7f
DATA extract<>+24(SB)/8, $0x000000007f7f7f7f
DATA extract<>+32(SB)/8, $0x0000007f7f7f7f7f
DATA extract<>+40(SB)/8, $0x00007f7f7f7f7f7f
DATA extract<>+48(SB)/8, $0x007f7f7f7f7f7f7f
DATA extract<>+56(SB)/8, $0x7f7f7f7f7f7f7f7f
GLOBL extract<>(SB), (RODATA|NOPTR), $64
|
||
// decodeVarintAsm1 decodes one varint from the front of buf using PEXT
// with a mask looked up in the extract<> table by varint length.
//
// Frame layout as used below: buf base at 0(FP), buf len at 8(FP),
// decoded value x at 24(FP), bytes consumed n at 32(FP).
//
// The fast path is taken only when hasBMI2 is non-zero, at least 8
// bytes are available, and the varint ends within those 8 bytes.
// Everything else goes through the byte-at-a-time slow path, which
// sets both results to zero if no terminating byte (bit 7 clear) is
// found within the first min(len, 10) bytes.
TEXT ·decodeVarintAsm1(SB),NOSPLIT,$0-40
	// Take the slow path if there's no BMI2 or there are fewer
	// than 8 bytes available.
	MOVBLZX	·hasBMI2(SB), AX
	TESTB	AL, AL
	JEQ	slowpath
	MOVQ	buf_len+8(FP), AX
	CMPQ	AX, $8
	JLT	slowpath

	// Load 8 bytes from buf.
	MOVQ	buf_base+0(FP), AX
	MOVQ	(AX), AX

	// Extract the continuation bits (bit 7 of each byte) into BX.
	// This goes through an SSE register rather than an MMX one:
	// writing an MMX register marks the whole x87 register stack as
	// in use, and nothing here would issue EMMS to release it.
	// MOVQ zeroes the upper 8 bytes of X0, so mask bits 8-15 are 0
	// and the inverted mask below is the same as with the MMX form.
	MOVQ	AX, X0
	PMOVMSKB	X0, BX
	// Compute byte length - 1 of varint into BX: the index of the
	// first byte whose continuation bit is clear.
	NOTL	BX
	BSFL	BX, BX
	// If it's more than 8 bytes, take the slow path.
	CMPL	BX, $8
	JGE	slowpath
	// Extract the value into DX using a mask lookup table
	// indexed by byte length - 1.
	LEAQ	extract<>(SB), CX
	MOVQ	(CX)(BX*8), DX
	PEXTQ	DX, AX, DX		// Requires BMI2
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	INCL	BX			// BX = byte length (1..8)
	MOVQ	BX, n+32(FP)
	RET

slowpath:
	// Consume buffer one byte at a time.
	// TODO: Could merge with some of the above registers better.
	MOVQ	buf_base+0(FP), BX	// Pointer
	MOVQ	buf_len+8(FP), AX	// Length
	MOVQ	$10, CX
	CMPQ	AX, CX
	CMOVQGT	CX, AX			// Length is at most 10
	XORQ	SI, SI			// Index
	XORQ	CX, CX			// Shift
	XORQ	DX, DX			// Value

loop:
	CMPQ	SI, AX
	JEQ	bad			// Reached end of buffer or >10 bytes

	MOVBLZX	(SI)(BX*1), DI		// Load next byte
	INCQ	SI
	BTRL	$7, DI			// Is bit 7 set? Clear bit 7.
	JNC	last			// If not set, this is the final byte
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	ADDQ	$7, CX			// shift += 7
	JMP	loop

last:
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

bad:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET
|
||
// decodeVarintAsm2 decodes one varint from the front of buf. Its fast
// path finds the terminating byte with a 64-bit bit scan, masks off
// everything after it with BLSMSK, and compacts the payload with PEXT.
//
// Frame layout as used below: buf base at 0(FP), buf len at 8(FP),
// decoded value x at 24(FP), bytes consumed n at 32(FP).
//
// The fast path is taken only when hasBMI2 is non-zero, at least 8
// bytes are available, and one of those 8 bytes has bit 7 clear.
// Otherwise the byte-at-a-time path runs; it sets both results to zero
// if no terminating byte is found within the first min(len, 10) bytes.
//
// The fast path writes R14 (via the hand-encoded BLSMSK).
TEXT ·decodeVarintAsm2(SB),NOSPLIT,$0-40
	MOVQ	buf_len+8(FP), CX
	MOVQ	buf_base+0(FP), BX

	// Fast path needs the feature flag and a full 8-byte load.
	MOVBLZX	·hasBMI2(SB), AX
	TESTB	AL, AL
	JEQ	bytewise
	CMPQ	CX, $8
	JLT	bytewise

	// AX = first 8 bytes of buf.
	MOVQ	(BX), AX

	// DX = a word with bit 7 set in exactly those bytes whose
	// continuation bit is clear: force the 7 payload bits of every
	// byte to 1, then invert.
	MOVQ	$0x7f7f7f7f7f7f7f7f, DI
	MOVQ	AX, DX
	ORQ	DI, DX
	NOTQ	DX
	// CX = bit index of the first terminating byte's bit 7
	// (7, 15, 23, ...). BSFQ sets ZF when DX is zero, i.e. when all
	// 8 bytes have their continuation bit set.
	BSFQ	DX, CX
	JEQ	bytewise

	// Byte length from the bit index: 7=>1, 15=>2, etc.
	INCQ	CX
	SHRQ	$3, CX

	// R14 = all bits up to and including the lowest set bit of DX.
	// Hand-encoded form of: BLSMSKQ DX, R14 (requires BMI1).
	BYTE $0xc4; BYTE $0xe2; BYTE $0x88; BYTE $0xf3; BYTE $0xd2
	// Drop every byte after the terminating one, then gather the
	// low 7 bits of each remaining byte into DX.
	ANDQ	R14, AX
	PEXTQ	DI, AX, DX		// Requires BMI2

	MOVQ	CX, n+32(FP)
	MOVQ	DX, x+24(FP)
	RET

bytewise:
	// Fallback: BX = buf base, AX = min(len, 10), SI = bytes
	// consumed, CX = bit shift, DX = value, DI = current byte.
	// TODO: Could merge with some of the above registers better.
	MOVQ	buf_len+8(FP), AX
	MOVQ	buf_base+0(FP), BX
	MOVQ	$10, CX
	CMPQ	AX, CX
	CMOVQGT	CX, AX
	XORQ	DX, DX
	XORQ	CX, CX
	XORQ	SI, SI

next:
	CMPQ	SI, AX
	JEQ	fail			// Ran out of bytes (or hit the 10-byte cap)

	MOVBLZX	(BX)(SI*1), DI		// DI = buf[consumed]
	INCQ	SI
	// BTRL copies bit 7 into the carry flag and clears it in DI.
	BTRL	$7, DI
	JNC	done			// Continuation bit clear: final byte
	SHLQ	CL, DI
	ORQ	DI, DX			// value |= payload << shift
	ADDQ	$7, CX
	JMP	next

done:
	// Fold in the final byte's payload and return (value, consumed).
	SHLQ	CL, DI
	ORQ	DI, DX
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

fail:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET
Oops, something went wrong.