From 4383406a09e2ea76a78779dee5ef3d71ab1c64b5 Mon Sep 17 00:00:00 2001
From: HowJmay
Date: Wed, 12 May 2021 00:08:36 +0800
Subject: [PATCH] sha3: Add SIMD implementation on ARMv8

The ARMv8 SHA3 extension provides four SIMD instructions (EOR3, RAX1,
XAR and BCAX) that accelerate SHA-3. Add an implementation of the Keccak
permutation for arm64 that uses them.
---
 sha3/keccakf.go       |   4 +-
 sha3/keccakf_arm64.go |  39 +++++
 sha3/keccakf_arm64.s  | 174 ++++++++++++++++++
 sha3/keccakf_noasm.go |  18 +++
 4 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100644 sha3/keccakf_arm64.go
 create mode 100644 sha3/keccakf_arm64.s
 create mode 100644 sha3/keccakf_noasm.go

diff --git a/sha3/keccakf.go b/sha3/keccakf.go
index 0f4ae8bacf..eea5f5b754 100644
--- a/sha3/keccakf.go
+++ b/sha3/keccakf.go
@@ -35,9 +35,9 @@ var rc = [24]uint64{
 	0x8000000080008008,
 }
 
-// keccakF1600 applies the Keccak permutation to a 1600b-wide
+// keccakF1600Generic applies the Keccak permutation to a 1600b-wide
 // state represented as a slice of 25 uint64s.
-func keccakF1600(a *[25]uint64) {
+func keccakF1600Generic(a *[25]uint64) {
 	// Implementation translated from Keccak-inplace.c
 	// in the keccak reference code.
 	var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64
diff --git a/sha3/keccakf_arm64.go b/sha3/keccakf_arm64.go
new file mode 100644
index 0000000000..d2ade3fc5a
--- /dev/null
+++ b/sha3/keccakf_arm64.go
@@ -0,0 +1,39 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build arm64 && gc && !purego
+// +build arm64,gc,!purego
+
+package sha3
+
+import _ "unsafe" // for go:linkname
+
+// goarm is linked to the runtime's goarm variable.
+//
+// NOTE(review): goarm presumably reflects the GOARM level of 32-bit arm
+// builds, so on arm64 the check in keccakF1600 may never be true, and it
+// says nothing about whether the CPU implements the SHA3 instructions
+// used by keccakF1600armv8. Gate on a real feature bit (for example
+// cpu.ARM64.HasSHA3 from golang.org/x/sys/cpu) before relying on this.
+//
+//go:linkname goarm runtime.goarm
+var goarm uint8
+
+// keccakF1600armv8 applies the Keccak permutation to a in place using
+// the EOR3, RAX1, XAR and BCAX instructions. It is implemented in
+// keccakf_arm64.s.
+//
+//go:noescape
+func keccakF1600armv8(a *[25]uint64)
+
+// keccakF1600 applies the Keccak permutation to the 1600b-wide state a,
+// using the SIMD implementation when goarm reports version 8 or later
+// and the generic implementation otherwise.
+func keccakF1600(a *[25]uint64) {
+	if goarm >= 8 {
+		keccakF1600armv8(a)
+		return
+	}
+	keccakF1600Generic(a)
+}
diff --git a/sha3/keccakf_arm64.s b/sha3/keccakf_arm64.s
new file mode 100644
index 0000000000..bc719fa7cb
--- /dev/null
+++ b/sha3/keccakf_arm64.s
@@ -0,0 +1,174 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build arm64 && gc && !purego
+// +build arm64,gc,!purego
+
+#include "textflag.h"
+
+// func keccakF1600armv8(a *[25]uint64)
+//
+// Runs 24 rounds of the Keccak permutation on the state at a, in place.
+// Lane i of the state is held in Vi (V0-V24); V25-V31 are temporaries.
+// R0 is the state pointer, R1 walks round_consts and R2 counts the
+// remaining rounds.
+TEXT ·keccakF1600armv8(SB), NOSPLIT, $0-8
+	MOVD	a+0(FP), R0
+	MOVD	$round_consts<>(SB), R1
+	MOVD	$24, R2 // counter for loop
+
+	// Load the 25 lanes: twelve pairs followed by the last lane.
+	VLD1.P	16(R0), [V0.D1, V1.D1]
+	VLD1.P	16(R0), [V2.D1, V3.D1]
+	VLD1.P	16(R0), [V4.D1, V5.D1]
+	VLD1.P	16(R0), [V6.D1, V7.D1]
+	VLD1.P	16(R0), [V8.D1, V9.D1]
+	VLD1.P	16(R0), [V10.D1, V11.D1]
+	VLD1.P	16(R0), [V12.D1, V13.D1]
+	VLD1.P	16(R0), [V14.D1, V15.D1]
+	VLD1.P	16(R0), [V16.D1, V17.D1]
+	VLD1.P	16(R0), [V18.D1, V19.D1]
+	VLD1.P	16(R0), [V20.D1, V21.D1]
+	VLD1.P	16(R0), [V22.D1, V23.D1]
+	VLD1	(R0), [V24.D1]
+
+	// The post-incrementing loads advanced R0 by 12*16 bytes; rewind it
+	// so the stores after the loop start at the beginning of the state.
+	SUB	$192, R0, R0
+
+loop:
+	// theta
+	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
+	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
+	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
+	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
+	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
+	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
+	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
+	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
+	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
+	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16
+
+	VRAX1	V27.D2, V25.D2, V30.D2
+	VRAX1	V28.D2, V26.D2, V31.D2
+	VRAX1	V29.D2, V27.D2, V27.D2
+	VRAX1	V25.D2, V28.D2, V28.D2
+	VRAX1	V26.D2, V29.D2, V29.D2
+
+	// theta + rho + Pi
+	VXAR	$64-1, V30.D2, V1.D2, V25.D2
+
+	VXAR	$64-44, V30.D2, V6.D2, V1.D2
+	VXAR	$64-20, V28.D2, V9.D2, V6.D2
+	VXAR	$64-61, V31.D2, V22.D2, V9.D2
+	VXAR	$64-39, V28.D2, V14.D2, V22.D2
+	VXAR	$64-18, V29.D2, V20.D2, V14.D2
+
+	VXAR	$64-62, V31.D2, V2.D2, V26.D2
+
+	VXAR	$64-43, V31.D2, V12.D2, V2.D2
+	VXAR	$64-25, V27.D2, V13.D2, V12.D2
+	VXAR	$64-8, V28.D2, V19.D2, V13.D2
+	VXAR	$64-56, V27.D2, V23.D2, V19.D2
+	VXAR	$64-41, V29.D2, V15.D2, V23.D2
+
+	VXAR	$64-27, V28.D2, V4.D2, V15.D2
+
+	VXAR	$64-14, V28.D2, V24.D2, V28.D2
+	VXAR	$64-2, V30.D2, V21.D2, V24.D2
+	VXAR	$64-55, V27.D2, V8.D2, V8.D2
+	VXAR	$64-45, V30.D2, V16.D2, V4.D2
+	VXAR	$64-36, V29.D2, V5.D2, V16.D2
+
+	VXAR	$64-28, V27.D2, V3.D2, V5.D2
+
+	VEOR	V29.B16, V0.B16, V0.B16
+
+	VXAR	$64-21, V27.D2, V18.D2, V27.D2
+	VXAR	$64-15, V31.D2, V17.D2, V3.D2
+	VXAR	$64-10, V30.D2, V11.D2, V30.D2
+	VXAR	$64-6, V31.D2, V7.D2, V31.D2
+	VXAR	$64-3, V29.D2, V10.D2, V29.D2
+
+	// chi + iota
+	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
+	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
+	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
+	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
+	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16
+
+	VLD1R.P	8(R1), [V26.D2] // next round constant, replicated; advances R1
+
+	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
+	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
+	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
+	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
+	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16
+
+	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
+	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
+	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
+	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
+	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16
+
+	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
+	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
+	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
+	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
+	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16
+
+	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
+	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
+	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // iota (for chi part)
+	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
+	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16
+
+	VEOR	V26.B16, V0.B16, V0.B16 // iota
+
+	SUBS	$1, R2, R2
+	BNE	loop
+
+	// Write the 25 lanes back in the same layout they were loaded in.
+	VST1.P	[V0.D1, V1.D1], 16(R0)
+	VST1.P	[V2.D1, V3.D1], 16(R0)
+	VST1.P	[V4.D1, V5.D1], 16(R0)
+	VST1.P	[V6.D1, V7.D1], 16(R0)
+	VST1.P	[V8.D1, V9.D1], 16(R0)
+	VST1.P	[V10.D1, V11.D1], 16(R0)
+	VST1.P	[V12.D1, V13.D1], 16(R0)
+	VST1.P	[V14.D1, V15.D1], 16(R0)
+	VST1.P	[V16.D1, V17.D1], 16(R0)
+	VST1.P	[V18.D1, V19.D1], 16(R0)
+	VST1.P	[V20.D1, V21.D1], 16(R0)
+	VST1.P	[V22.D1, V23.D1], 16(R0)
+	VST1	[V24.D1], (R0)
+
+	RET
+
+// round_consts holds the 24 round constants, one per loop iteration.
+DATA round_consts<>+0x00(SB)/8, $0x0000000000000001
+DATA round_consts<>+0x08(SB)/8, $0x0000000000008082
+DATA round_consts<>+0x10(SB)/8, $0x800000000000808a
+DATA round_consts<>+0x18(SB)/8, $0x8000000080008000
+DATA round_consts<>+0x20(SB)/8, $0x000000000000808b
+DATA round_consts<>+0x28(SB)/8, $0x0000000080000001
+DATA round_consts<>+0x30(SB)/8, $0x8000000080008081
+DATA round_consts<>+0x38(SB)/8, $0x8000000000008009
+DATA round_consts<>+0x40(SB)/8, $0x000000000000008a
+DATA round_consts<>+0x48(SB)/8, $0x0000000000000088
+DATA round_consts<>+0x50(SB)/8, $0x0000000080008009
+DATA round_consts<>+0x58(SB)/8, $0x000000008000000a
+DATA round_consts<>+0x60(SB)/8, $0x000000008000808b
+DATA round_consts<>+0x68(SB)/8, $0x800000000000008b
+DATA round_consts<>+0x70(SB)/8, $0x8000000000008089
+DATA round_consts<>+0x78(SB)/8, $0x8000000000008003
+DATA round_consts<>+0x80(SB)/8, $0x8000000000008002
+DATA round_consts<>+0x88(SB)/8, $0x8000000000000080
+DATA round_consts<>+0x90(SB)/8, $0x000000000000800a
+DATA round_consts<>+0x98(SB)/8, $0x800000008000000a
+DATA round_consts<>+0xA0(SB)/8, $0x8000000080008081
+DATA round_consts<>+0xA8(SB)/8, $0x8000000000008080
+DATA round_consts<>+0xB0(SB)/8, $0x0000000080000001
+DATA round_consts<>+0xB8(SB)/8, $0x8000000080008008
+GLOBL round_consts<>(SB), RODATA|NOPTR, $192
diff --git a/sha3/keccakf_noasm.go b/sha3/keccakf_noasm.go
new file mode 100644
index 0000000000..68264291db
--- /dev/null
+++ b/sha3/keccakf_noasm.go
@@ -0,0 +1,18 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !arm64) || !gc || purego
+// +build !amd64,!arm64 !gc purego
+
+package sha3
+
+// keccakF1600 applies the Keccak permutation to the state a using the
+// generic implementation. This file is the fallback for configurations
+// without an assembly keccakF1600.
+//
+// NOTE(review): amd64 is excluded on the assumption that the package's
+// existing amd64 assembly still declares keccakF1600 — confirm.
+func keccakF1600(a *[25]uint64) {
+	keccakF1600Generic(a)
+}