Skip to content

Commit

Permalink
sha3: Add SIMD implementation on ARMv8
Browse files Browse the repository at this point in the history
The ARMv8 SHA3 extension adds four SIMD instructions (EOR3, RAX1, XAR
and BCAX) that accelerate SHA-3 operations. This change adds a SIMD
implementation of the Keccak permutation for ARMv8 that uses them.

Compared with the pure Go implementation (the one in keccakf.go),
the timings are as follows:

benchmark                            old ns/op        new ns/op      delta
BenchmarkPermutationFunction-8     227.0 ns/op      153.6 ns/op    -32.33%
BenchmarkSha3_512_MTU-8             4954 ns/op       3296 ns/op    -33.47%
BenchmarkSha3_384_MTU-8             3586 ns/op       2441 ns/op    -31.93%
BenchmarkSha3_256_MTU-8             2909 ns/op       1982 ns/op    -31.87%
BenchmarkSha3_224_MTU-8             2779 ns/op       1905 ns/op    -31.45%
BenchmarkShake128_MTU-8             2326 ns/op       1588 ns/op    -31.73%
BenchmarkShake256_MTU-8             2485 ns/op       1670 ns/op    -32.80%
BenchmarkShake256_16x-8            37052 ns/op      26715 ns/op    -27.90%
BenchmarkShake256_1MiB-8         1911863 ns/op    1293014 ns/op    -32.37%
BenchmarkSha3_512_1MiB-8         3496335 ns/op    2317853 ns/op    -33.71%

benchmark                             old MB/s        new MB/s     speedup
BenchmarkPermutationFunction-8     881.22 MB/s    1302.48 MB/s       1.48x
BenchmarkSha3_512_MTU-8            272.50 MB/s     409.64 MB/s       1.50x
BenchmarkSha3_384_MTU-8            376.47 MB/s     553.06 MB/s       1.47x
BenchmarkSha3_256_MTU-8            464.11 MB/s     681.27 MB/s       1.47x
BenchmarkSha3_224_MTU-8            485.75 MB/s     708.83 MB/s       1.46x
BenchmarkShake128_MTU-8            580.32 MB/s     849.97 MB/s       1.46x
BenchmarkShake256_MTU-8            543.34 MB/s     808.53 MB/s       1.49x
BenchmarkShake256_16x-8            442.19 MB/s     613.29 MB/s       1.39x
BenchmarkShake256_1MiB-8           548.46 MB/s     810.95 MB/s       1.48x
BenchmarkSha3_512_1MiB-8           299.91 MB/s     452.39 MB/s       1.51x
  • Loading branch information
howjmay committed Sep 4, 2022
1 parent 5ff15b2 commit acc175a
Show file tree
Hide file tree
Showing 4 changed files with 200 additions and 4 deletions.
8 changes: 4 additions & 4 deletions sha3/keccakf.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !amd64 || purego || !gc
// +build !amd64 purego !gc
//go:build !amd64 || purego || !gc || !arm64
// +build !amd64 purego !gc !arm64

package sha3

Expand Down Expand Up @@ -35,9 +35,9 @@ var rc = [24]uint64{
0x8000000080008008,
}

// keccakF1600 applies the Keccak permutation to a 1600b-wide
// keccakF1600Generic applies the Keccak permutation to a 1600b-wide
// state represented as a slice of 25 uint64s.
func keccakF1600(a *[25]uint64) {
func keccakF1600Generic(a *[25]uint64) {
// Implementation translated from Keccak-inplace.c
// in the keccak reference code.
var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64
Expand Down
19 changes: 19 additions & 0 deletions sha3/keccakf_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build arm64
// +build arm64

package sha3

// This function is implemented in keccakf_arm64.s.
// For ARMv8 machines GOARM=n/a, and GOARCH=arm64
// see https://go.dev/wiki/GoArm
//go:noescape
func keccakF1600NEON(a *[25]uint64)

func keccakF1600(a *[25]uint64) {
// FIXME: use "golang.org/x/sys/cpu" to check if the running machine has SHA3 feature.
keccakF1600NEON(a)
}
165 changes: 165 additions & 0 deletions sha3/keccakf_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build arm64
// +build arm64

#include "textflag.h"

// func keccakF1600NEON(a *[25]uint64)
//
// keccakF1600NEON runs the 24 rounds of the Keccak permutation in place on
// the 25-lane state pointed to by a, using the EOR3, RAX1, XAR and BCAX
// SIMD instructions.
//
// Register usage:
//   R0       pointer to the state (post-incremented while loading/storing)
//   R1       pointer to the next round constant in round_consts<>
//   R2       remaining round count
//   V0-V24   state lanes a[0]..a[24], one 64-bit lane per register
//   V25-V31  temporaries (column parities, theta offsets, relocated lanes)
//
// No stack space is used, so the frame size is 0.
TEXT ·keccakF1600NEON(SB), NOSPLIT, $0-8
	MOVD	a+0(FP), R0
	MOVD	$round_consts<>(SB), R1
	MOVD	$24, R2 // counter for loop

	// Load the 25 lanes: twelve post-incremented pairs plus the last lane.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Rewind R0 by the 12*16 bytes consumed above so it points at a[0] again
	// for the final stores.
	SUB	$192, R0, R0

loop:
	// theta
	// Column parities: V25..V29 = XOR of the five lanes in columns 0..4.
	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16

	// Theta offsets, each a parity XORed with another parity rotated left
	// by one. Afterwards the offset applied to column 0 is in V29, column 1
	// in V30, column 2 in V31, column 3 in V27 and column 4 in V28.
	VRAX1	V27.D2, V25.D2, V30.D2
	VRAX1	V28.D2, V26.D2, V31.D2
	VRAX1	V29.D2, V27.D2, V27.D2
	VRAX1	V25.D2, V28.D2, V28.D2
	VRAX1	V26.D2, V29.D2, V29.D2

	// theta and rho and Pi
	// Each VXAR XORs a lane with its column's theta offset and rotates the
	// result right by the immediate (64 minus the rho left-rotation), writing
	// it to the register of its pi destination. The order follows the pi
	// cycle so a register is only overwritten after its old value was read;
	// the lanes whose destination is still live are parked in V25-V31.
	VXAR	$63, V30.D2, V1.D2, V25.D2

	VXAR	$20, V30.D2, V6.D2, V1.D2
	VXAR	$44, V28.D2, V9.D2, V6.D2
	VXAR	$3, V31.D2, V22.D2, V9.D2
	VXAR	$25, V28.D2, V14.D2, V22.D2
	VXAR	$46, V29.D2, V20.D2, V14.D2

	VXAR	$2, V31.D2, V2.D2, V26.D2

	VXAR	$21, V31.D2, V12.D2, V2.D2
	VXAR	$39, V27.D2, V13.D2, V12.D2
	VXAR	$56, V28.D2, V19.D2, V13.D2
	VXAR	$8, V27.D2, V23.D2, V19.D2
	VXAR	$23, V29.D2, V15.D2, V23.D2

	VXAR	$37, V28.D2, V4.D2, V15.D2

	VXAR	$50, V28.D2, V24.D2, V28.D2
	VXAR	$62, V30.D2, V21.D2, V24.D2
	VXAR	$9, V27.D2, V8.D2, V8.D2
	VXAR	$19, V30.D2, V16.D2, V4.D2
	VXAR	$28, V29.D2, V5.D2, V16.D2

	VXAR	$36, V27.D2, V3.D2, V5.D2

	// Lane 0 has no rotation and does not move, so a plain XOR suffices.
	VEOR	V29.B16, V0.B16, V0.B16

	VXAR	$43, V27.D2, V18.D2, V27.D2
	VXAR	$49, V31.D2, V17.D2, V3.D2
	VXAR	$54, V30.D2, V11.D2, V30.D2
	VXAR	$58, V31.D2, V7.D2, V31.D2
	VXAR	$61, V29.D2, V10.D2, V29.D2

	// chi and iota
	// Rows are processed from the last (lanes 20-24) to the first (lanes 0-4).
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free from here on: load this round's constant into it and
	// advance R1 to the next one.
	VLD1R.P	8(R1), [V26.D2]

	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16

	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)
	VEOR	V26.B16, V0.B16, V0.B16 // iota

	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	SUB	$1, R2, R2
	CBNZ	R2, loop

	// Store the permuted lanes back to a.
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)

	RET

// round_consts<> holds the 24 iota round constants, one 64-bit word per
// round, in round order. The symbol is file-local (<>): the DATA directives,
// the GLOBL declaration and the reference in keccakF1600NEON must all name
// the same symbol.
DATA round_consts<>+0x00(SB)/8, $0x0000000000000001
DATA round_consts<>+0x08(SB)/8, $0x0000000000008082
DATA round_consts<>+0x10(SB)/8, $0x800000000000808a
DATA round_consts<>+0x18(SB)/8, $0x8000000080008000
DATA round_consts<>+0x20(SB)/8, $0x000000000000808b
DATA round_consts<>+0x28(SB)/8, $0x0000000080000001
DATA round_consts<>+0x30(SB)/8, $0x8000000080008081
DATA round_consts<>+0x38(SB)/8, $0x8000000000008009
DATA round_consts<>+0x40(SB)/8, $0x000000000000008a
DATA round_consts<>+0x48(SB)/8, $0x0000000000000088
DATA round_consts<>+0x50(SB)/8, $0x0000000080008009
DATA round_consts<>+0x58(SB)/8, $0x000000008000000a
DATA round_consts<>+0x60(SB)/8, $0x000000008000808b
DATA round_consts<>+0x68(SB)/8, $0x800000000000008b
DATA round_consts<>+0x70(SB)/8, $0x8000000000008089
DATA round_consts<>+0x78(SB)/8, $0x8000000000008003
DATA round_consts<>+0x80(SB)/8, $0x8000000000008002
DATA round_consts<>+0x88(SB)/8, $0x8000000000000080
DATA round_consts<>+0x90(SB)/8, $0x000000000000800a
DATA round_consts<>+0x98(SB)/8, $0x800000008000000a
DATA round_consts<>+0xA0(SB)/8, $0x8000000080008081
DATA round_consts<>+0xA8(SB)/8, $0x8000000000008080
DATA round_consts<>+0xB0(SB)/8, $0x0000000080000001
DATA round_consts<>+0xB8(SB)/8, $0x8000000080008008
GLOBL round_consts<>(SB), (NOPTR+RODATA), $192
12 changes: 12 additions & 0 deletions sha3/keccakf_noasm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego || !arm64
// +build !arm64,!s390x,!ppc64le !gc purego !arm64

package sha3

func keccakF1600(a *[25]uint64) {
keccakF1600Generic(a)
}

0 comments on commit acc175a

Please sign in to comment.