From 63e083c539dd73b02a4bf068b1be1a9780186a49 Mon Sep 17 00:00:00 2001
From: 24bit-xjkp <2283572185@qq.com>
Date: Sun, 12 Jan 2025 19:25:07 +0800
Subject: [PATCH 1/2] feat(xxhash3): Support the LASX instruction set and refactor the LSX implementation

1. Use __lsx_vmul_d directly instead of using two 32-bit multiplies to
   emulate a 64-bit multiply.
2. Add LASX support.
---
 xxhash.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 77 insertions(+), 5 deletions(-)
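
Reviewer note, not part of the patch: the LSX refactor leans on the identity that a plain 64-bit multiply by a 32-bit constant equals the removed two-multiply emulation modulo 2^64, because XXH_PRIME32_1 (0x9E3779B1) fits in 32 bits. Below is a minimal scalar sketch of that identity; mul_emulated and mul_direct are hypothetical helper names, while the intrinsics named in the comments are the ones the hunk touches.

#include <stdint.h>
#include <stdio.h>

/* One 64-bit lane of the old LSX sequence: split the lane, do two 32-bit
 * widening multiplies (__lsx_vmulwev_d_wu), then recombine with a shift
 * and an add (__lsx_vslli_d + __lsx_vadd_d). */
static uint64_t mul_emulated(uint64_t lane, uint32_t prime)
{
    uint64_t const prod_lo = (lane & 0xFFFFFFFFu) * prime;
    uint64_t const prod_hi = (lane >> 32) * prime;
    return prod_lo + (prod_hi << 32);
}

/* One 64-bit lane of the new sequence: __lsx_vmul_d keeps the low 64 bits
 * of the product, which is exactly the value assembled above. */
static uint64_t mul_direct(uint64_t lane, uint32_t prime)
{
    return lane * prime;
}

int main(void)
{
    uint64_t const lane  = 0x0123456789ABCDEFull;
    uint32_t const prime = 0x9E3779B1u; /* XXH_PRIME32_1 */
    printf("emulated: 0x%016llx\n", (unsigned long long)mul_emulated(lane, prime));
    printf("direct:   0x%016llx\n", (unsigned long long)mul_direct(lane, prime));
    return 0;
}

Both helpers print the same value for any lane, so dropping the three-instruction emulation cannot change the hash.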
diff --git a/xxhash.h b/xxhash.h
index 78fc2e8d..e70ef233 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1125,6 +1125,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
 #  define XXH_VSX  5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
 #  define XXH_SVE  6 /*!< SVE for some ARMv8-A and ARMv9-A */
 #  define XXH_LSX  7 /*!< LSX (128-bit SIMD) for LoongArch64 */
+#  define XXH_LASX  8 /*!< LASX (256-bit SIMD) for LoongArch64 */
 
 
 /*-**********************************************************************
@@ -3855,6 +3856,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #  include <immintrin.h>
 # elif defined(__SSE2__)
 #  include <emmintrin.h>
+# elif defined(__loongarch_asx)
+#  include <lasxintrin.h>
+#  include <lsxintrin.h>
 # elif defined(__loongarch_sx)
 #  include <lsxintrin.h>
 # endif
@@ -3991,6 +3995,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
    || (defined(__s390x__) && defined(__VEC__)) \
      && defined(__GNUC__) /* TODO: IBM XL */
 #    define XXH_VECTOR XXH_VSX
+#  elif defined(__loongarch_asx)
+#    define XXH_VECTOR XXH_LASX
 #  elif defined(__loongarch_sx)
 #    define XXH_VECTOR XXH_LSX
 #  else
@@ -4030,6 +4036,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #   define XXH_ACC_ALIGN 64
 # elif XXH_VECTOR == XXH_SVE   /* sve */
 #   define XXH_ACC_ALIGN 64
+# elif XXH_VECTOR == XXH_LASX  /* lasx */
+#   define XXH_ACC_ALIGN 64
 # elif XXH_VECTOR == XXH_LSX   /* lsx */
 #   define XXH_ACC_ALIGN 64
 # endif
@@ -5712,7 +5720,7 @@ XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
     {
         __m128i* const xacc = (__m128i*) acc;
         const __m128i* const xsecret = (const __m128i *) secret;
-        const __m128i prime32 = __lsx_vreplgr2vr_w((int)XXH_PRIME32_1);
+        const __m128i prime32 = __lsx_vreplgr2vr_d(XXH_PRIME32_1);
 
         for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
             /* xacc[i] ^= (xacc[i] >> 47) */
@@ -5724,10 +5732,68 @@ XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
             __m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
 
             /* xacc[i] *= XXH_PRIME32_1; */
-            __m128i const data_key_hi = __lsx_vsrli_d(data_key, 32);
-            __m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32);
-            __m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32);
-            xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32));
+            xacc[i] = __lsx_vmul_d(data_key, prime32);
+        }
+    }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_LASX)
+#define _LASX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_lasx( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {
+        __m256i* const xacc = (__m256i *) acc;
+        const __m256i* const xinput = (const __m256i *) input;
+        const __m256i* const xsecret = (const __m256i *) secret;
+
+        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
+            /* data_vec = xinput[i]; */
+            __m256i const data_vec = __lasx_xvld(xinput + i, 0);
+            /* key_vec = xsecret[i]; */
+            __m256i const key_vec = __lasx_xvld(xsecret + i, 0);
+            /* data_key = data_vec ^ key_vec; */
+            __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32);
+            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product = __lasx_xvmulwev_d_wu(data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap = __lasx_xvshuf4i_w(data_vec, _LASX_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum = __lasx_xvadd_d(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = __lasx_xvadd_d(product, sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lasx)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {
+        __m256i* const xacc = (__m256i*) acc;
+        const __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = __lasx_xvreplgr2vr_d(XXH_PRIME32_1);
+
+        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec = xacc[i];
+            __m256i const shifted = __lasx_xvsrli_d(acc_vec, 47);
+            __m256i const data_vec = __lasx_xvxor_v(acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m256i const key_vec = __lasx_xvld(xsecret + i, 0);
+            __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            xacc[i] = __lasx_xvmul_d(data_key, prime32);
         }
     }
 }
@@ -5964,6 +6030,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
+#elif (XXH_VECTOR == XXH_LASX)
+#define XXH3_accumulate_512 XXH3_accumulate_512_lasx
+#define XXH3_accumulate     XXH3_accumulate_lasx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_lasx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
 #elif (XXH_VECTOR == XXH_LSX)
 #define XXH3_accumulate_512 XXH3_accumulate_512_lsx
 #define XXH3_accumulate     XXH3_accumulate_lsx

From 7d6bd4ea09ade21410d48ca42d50cf7836a47110 Mon Sep 17 00:00:00 2001
From: 24bit-xjkp <2283572185@qq.com>
Date: Mon, 13 Jan 2025 11:30:39 +0800
Subject: [PATCH 2/2] feat(xsum): Display which mode is compiled in the welcome message for LoongArch

1. Display the mode in use, as below:
   "loongarch64 + lasx" -> LoongArch64 platform with the LoongArch Advanced SIMD Extension
   "loongarch64 + lsx"  -> LoongArch64 platform with the LoongArch SIMD Extension
   "loongarch64"        -> LoongArch64 platform, scalar implementation
2. Align the comment on the XXH_LASX define in xxhash.h
---
 cli/xsum_arch.h | 8 +++++++-
 xxhash.h        | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)
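
Reviewer note, not part of the patch: a self-contained sketch of the cascade the cli/xsum_arch.h hunk below introduces. LASX-capable compilers also define __loongarch_sx (LASX extends LSX), so __loongarch_asx must be tested first. Assuming a GCC-style LoongArch64 toolchain, building with -mlasx, -mlsx, or neither selects one of the three strings; the banner wording in main() is illustrative, not xsum's actual output.

#include <stdio.h>

/* Mirror of the detection logic added to cli/xsum_arch.h. */
#if defined(__loongarch_lp64)
#  if defined(__loongarch_asx)
#    define XSUM_ARCH "loongarch64 + lasx"
#  elif defined(__loongarch_sx)
#    define XSUM_ARCH "loongarch64 + lsx"
#  else
#    define XSUM_ARCH "loongarch64"
#  endif
#else
#  define XSUM_ARCH "unknown"
#endif

int main(void)
{
    printf("compiled for: %s\n", XSUM_ARCH);
    return 0;
}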
diff --git a/cli/xsum_arch.h b/cli/xsum_arch.h
index aae7eaa8..79de3c7c 100644
--- a/cli/xsum_arch.h
+++ b/cli/xsum_arch.h
@@ -163,7 +163,13 @@
 #  define XSUM_ARCH "wasm/asmjs"
 # endif
 #elif defined(__loongarch_lp64)
-#  define XSUM_ARCH "loongarch"
+# if defined(__loongarch_asx)
+#  define XSUM_ARCH "loongarch64 + lasx"
+# elif defined(__loongarch_sx)
+#  define XSUM_ARCH "loongarch64 + lsx"
+# else
+#  define XSUM_ARCH "loongarch64"
+# endif
 #else
 #  define XSUM_ARCH "unknown"
 #endif
diff --git a/xxhash.h b/xxhash.h
index e70ef233..812dcb01 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -1125,7 +1125,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
 #  define XXH_VSX  5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
 #  define XXH_SVE  6 /*!< SVE for some ARMv8-A and ARMv9-A */
 #  define XXH_LSX  7 /*!< LSX (128-bit SIMD) for LoongArch64 */
-#  define XXH_LASX  8 /*!< LASX (256-bit SIMD) for LoongArch64 */
+#  define XXH_LASX 8 /*!< LASX (256-bit SIMD) for LoongArch64 */
 
 
 /*-**********************************************************************