Skip to content

Commit

Permalink
crypto/sha512: provide optimised assembly for riscv64
Browse files Browse the repository at this point in the history
Provide an optimised assembly implementation of sha512 for riscv64.
This results in significant performance gains.

On a StarFive VisionFive 2:

                    │   sha512a   │               sha512b               │
                    │   sec/op    │   sec/op     vs base                │
Hash8Bytes/New-4      7.998µ ± 0%   6.962µ ± 0%  -12.96% (p=0.000 n=10)
Hash8Bytes/Sum384-4   8.113µ ± 0%   6.651µ ± 0%  -18.02% (p=0.000 n=10)
Hash8Bytes/Sum512-4   8.269µ ± 0%   6.748µ ± 0%  -18.39% (p=0.000 n=10)
Hash1K/New-4          57.38µ ± 0%   36.92µ ± 0%  -35.66% (p=0.000 n=10)
Hash1K/Sum384-4       57.47µ ± 0%   36.57µ ± 0%  -36.37% (p=0.000 n=10)
Hash1K/Sum512-4       57.61µ ± 0%   36.75µ ± 0%  -36.21% (p=0.000 n=10)
Hash8K/New-4          402.5µ ± 0%   245.4µ ± 0%  -39.02% (p=0.000 n=10)
Hash8K/Sum384-4       402.5µ ± 0%   245.1µ ± 0%  -39.12% (p=0.000 n=10)
Hash8K/Sum512-4       402.7µ ± 0%   245.3µ ± 0%  -39.09% (p=0.000 n=10)

                    │   sha512a    │                sha512b                │
                    │     B/s      │      B/s       vs base                │
Hash8Bytes/New-4      976.6Ki ± 0%   1123.0Ki ± 0%  +15.00% (p=0.000 n=10)
Hash8Bytes/Sum384-4   966.8Ki ± 0%   1171.9Ki ± 0%  +21.21% (p=0.000 n=10)
Hash8Bytes/Sum512-4   947.3Ki ± 0%   1162.1Ki ± 1%  +22.68% (p=0.000 n=10)
Hash1K/New-4          17.01Mi ± 0%    26.45Mi ± 0%  +55.47% (p=0.000 n=10)
Hash1K/Sum384-4       16.99Mi ± 0%    26.70Mi ± 0%  +57.13% (p=0.000 n=10)
Hash1K/Sum512-4       16.95Mi ± 0%    26.57Mi ± 0%  +56.74% (p=0.000 n=10)
Hash8K/New-4          19.41Mi ± 0%    31.83Mi ± 0%  +63.99% (p=0.000 n=10)
Hash8K/Sum384-4       19.41Mi ± 0%    31.88Mi ± 0%  +64.28% (p=0.000 n=10)
Hash8K/Sum512-4       19.40Mi ± 0%    31.85Mi ± 0%  +64.21% (p=0.000 n=10)

Change-Id: I92629a106b75b0526e9f2a8fe3cc4a6f7fc63c8c
Reviewed-on: https://go-review.googlesource.com/c/go/+/518631
Auto-Submit: Dmitri Shuralyov <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Run-TryBot: Joel Sing <[email protected]>
Reviewed-by: M Zhuo <[email protected]>
Reviewed-by: Wang Yaduo <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Mark Ryan <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
  • Loading branch information
4a6f656c authored and gopherbot committed Jan 22, 2024
1 parent a807295 commit 370f1a8
Show file tree
Hide file tree
Showing 3 changed files with 293 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/crypto/sha512/sha512block_decl.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build s390x || ppc64le || ppc64
//go:build ppc64le || ppc64 || riscv64 || s390x

package sha512

Expand Down
2 changes: 1 addition & 1 deletion src/crypto/sha512/sha512block_generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
//go:build !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x

package sha512

Expand Down
291 changes: 291 additions & 0 deletions src/crypto/sha512/sha512block_riscv64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
// T2 = BIGSIGMA0(a) + Maj(a,b,c)
// h = g
// g = f
// f = e
// e = d + T1
// d = c
// c = b
// b = a
// a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

#define ROR(s, r, d, t1, t2) \
SLL $(64-s), r, t1; \
SRL $(s), r, t2; \
OR t1, t2, d

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
MOVBU ((index*8)+0)(X29), X5; \
MOVBU ((index*8)+1)(X29), X6; \
MOVBU ((index*8)+2)(X29), X7; \
MOVBU ((index*8)+3)(X29), X8; \
SLL $56, X5; \
SLL $48, X6; \
OR X5, X6, X5; \
SLL $40, X7; \
OR X5, X7, X5; \
SLL $32, X8; \
OR X5, X8, X5; \
MOVBU ((index*8)+4)(X29), X9; \
MOVBU ((index*8)+5)(X29), X6; \
MOVBU ((index*8)+6)(X29), X7; \
MOVBU ((index*8)+7)(X29), X8; \
SLL $24, X9; \
OR X5, X9, X5; \
SLL $16, X6; \
OR X5, X6, X5; \
SLL $8, X7; \
OR X5, X7, X5; \
OR X5, X8, X5; \
MOV X5, (index*8)(X19)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
MOV (((index-2)&0xf)*8)(X19), X5; \
MOV (((index-15)&0xf)*8)(X19), X6; \
MOV (((index-7)&0xf)*8)(X19), X9; \
MOV (((index-16)&0xf)*8)(X19), X21; \
ROR(19, X5, X7, X23, X24); \
ROR(61, X5, X8, X23, X24); \
SRL $6, X5; \
XOR X7, X5; \
XOR X8, X5; \
ADD X9, X5; \
ROR(1, X6, X7, X23, X24); \
ROR(8, X6, X8, X23, X24); \
SRL $7, X6; \
XOR X7, X6; \
XOR X8, X6; \
ADD X6, X5; \
ADD X21, X5; \
MOV X5, ((index&0xf)*8)(X19)

// Calculate T1 in X5.
// h is also used as an accumulator. Wt is passed in X5.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(index, e, f, g, h) \
MOV (index*8)(X18), X8; \
ADD X5, h; \
ROR(14, e, X6, X23, X24); \
ADD X8, h; \
ROR(18, e, X7, X23, X24); \
XOR X7, X6; \
ROR(41, e, X8, X23, X24); \
XOR X8, X6; \
ADD X6, h; \
AND e, f, X5; \
NOT e, X7; \
AND g, X7; \
XOR X7, X5; \
ADD h, X5

// Calculate T2 in X6.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
ROR(28, a, X6, X23, X24); \
ROR(34, a, X7, X23, X24); \
XOR X7, X6; \
ROR(39, a, X8, X23, X24); \
XOR X8, X6; \
AND a, b, X7; \
AND a, c, X8; \
XOR X8, X7; \
AND b, c, X9; \
XOR X9, X7; \
ADD X7, X6

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, a, b, c, d, e, f, g, h) \
SHA512T1(index, e, f, g, h); \
SHA512T2(a, b, c); \
MOV X6, h; \
ADD X5, d; \
ADD X5, h

#define SHA512ROUND0(index, a, b, c, d, e, f, g, h) \
MSGSCHEDULE0(index); \
SHA512ROUND(index, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, a, b, c, d, e, f, g, h) \
MSGSCHEDULE1(index); \
SHA512ROUND(index, a, b, c, d, e, f, g, h)

// func block(dig *digest, p []byte)
TEXT ·block(SB),0,$128-32
MOV p_base+8(FP), X29
MOV p_len+16(FP), X30
SRL $7, X30
SLL $7, X30

ADD X29, X30, X28
BEQ X28, X29, end

MOV ·_K(SB), X18 // const table
ADD $8, X2, X19 // message schedule

MOV dig+0(FP), X20
MOV (0*8)(X20), X10 // a = H0
MOV (1*8)(X20), X11 // b = H1
MOV (2*8)(X20), X12 // c = H2
MOV (3*8)(X20), X13 // d = H3
MOV (4*8)(X20), X14 // e = H4
MOV (5*8)(X20), X15 // f = H5
MOV (6*8)(X20), X16 // g = H6
MOV (7*8)(X20), X17 // h = H7

loop:
SHA512ROUND0(0, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND0(1, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND0(2, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND0(3, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND0(4, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND0(5, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND0(6, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND0(7, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND0(8, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND0(9, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND0(10, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND0(11, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND0(12, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND0(13, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND0(14, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND0(15, X11, X12, X13, X14, X15, X16, X17, X10)

SHA512ROUND1(16, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(17, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(18, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(19, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(20, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(21, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(22, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(23, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(24, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(25, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(26, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(27, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(28, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(29, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(30, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(31, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(32, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(33, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(34, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(35, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(36, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(37, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(38, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(39, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(40, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(41, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(42, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(43, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(44, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(45, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(46, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(47, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(48, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(49, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(50, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(51, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(52, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(53, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(54, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(55, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(56, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(57, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(58, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(59, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(60, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(61, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(62, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(63, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(64, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(65, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(66, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(67, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(68, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(69, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(70, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(71, X11, X12, X13, X14, X15, X16, X17, X10)
SHA512ROUND1(72, X10, X11, X12, X13, X14, X15, X16, X17)
SHA512ROUND1(73, X17, X10, X11, X12, X13, X14, X15, X16)
SHA512ROUND1(74, X16, X17, X10, X11, X12, X13, X14, X15)
SHA512ROUND1(75, X15, X16, X17, X10, X11, X12, X13, X14)
SHA512ROUND1(76, X14, X15, X16, X17, X10, X11, X12, X13)
SHA512ROUND1(77, X13, X14, X15, X16, X17, X10, X11, X12)
SHA512ROUND1(78, X12, X13, X14, X15, X16, X17, X10, X11)
SHA512ROUND1(79, X11, X12, X13, X14, X15, X16, X17, X10)

MOV (0*8)(X20), X5
MOV (1*8)(X20), X6
MOV (2*8)(X20), X7
MOV (3*8)(X20), X8
ADD X5, X10 // H0 = a + H0
ADD X6, X11 // H1 = b + H1
ADD X7, X12 // H2 = c + H2
ADD X8, X13 // H3 = d + H3
MOV X10, (0*8)(X20)
MOV X11, (1*8)(X20)
MOV X12, (2*8)(X20)
MOV X13, (3*8)(X20)
MOV (4*8)(X20), X5
MOV (5*8)(X20), X6
MOV (6*8)(X20), X7
MOV (7*8)(X20), X8
ADD X5, X14 // H4 = e + H4
ADD X6, X15 // H5 = f + H5
ADD X7, X16 // H6 = g + H6
ADD X8, X17 // H7 = h + H7
MOV X14, (4*8)(X20)
MOV X15, (5*8)(X20)
MOV X16, (6*8)(X20)
MOV X17, (7*8)(X20)

ADD $128, X29
BNE X28, X29, loop

end:
RET

0 comments on commit 370f1a8

Please sign in to comment.