From 41c25e3827c68a39b9e20c1625a0b96e49955445 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Sat, 9 Dec 2023 13:02:11 -0800 Subject: [PATCH] ht_dec.c: Improve MSVC arm64 popcount performance (#1479) Use NEON instructions for ARM64 (implementation based on microsoft/STL#2127). Godbolt output here: https://godbolt.org/z/q7GPTqT14 --- src/lib/openjp2/ht_dec.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/lib/openjp2/ht_dec.c b/src/lib/openjp2/ht_dec.c index 9ea061f3..a554b24a 100644 --- a/src/lib/openjp2/ht_dec.c +++ b/src/lib/openjp2/ht_dec.c @@ -55,6 +55,16 @@ #define OPJ_COMPILER_GNUC #endif +#if defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64) \ + && !defined(_M_ARM64EC) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \ + && !defined(__INTEL_COMPILER) && !defined(__clang__) +#define MSVC_NEON_INTRINSICS +#endif + +#ifdef MSVC_NEON_INTRINSICS +#include <arm64_neon.h> +#endif + //************************************************************************/ /** @brief Displays the error message for disabling the decoding of SPP and * MRP passes @@ -71,6 +81,9 @@ OPJ_UINT32 population_count(OPJ_UINT32 val) { #if defined(OPJ_COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_AMD64)) return (OPJ_UINT32)__popcnt(val); +#elif defined(OPJ_COMPILER_MSVC) && defined(MSVC_NEON_INTRINSICS) + const __n64 temp = neon_cnt(__uint64ToN64_v(val)); + return neon_addv8(temp).n8_i8[0]; #elif (defined OPJ_COMPILER_GNUC) return (OPJ_UINT32)__builtin_popcount(val); #else -- 2.30.2