From 846ca29569ffc281c45ff5597e91c0a0377eb56c Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 11 Apr 2024 18:16:51 -0700 Subject: [PATCH] Speed up pldexp_generic. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Measured effect in pexp: SSE4.1: name old cpu/op new cpu/op delta BM_eigen_exp_float/1 1.88ns ± 1% 1.68ns ± 1% -10.88% (p=0.000 n=54+47) BM_eigen_exp_float/8 28.9ns ± 1% 28.5ns ± 0% -1.37% (p=0.000 n=51+47) BM_eigen_exp_float/64 145ns ± 1% 139ns ± 0% -4.09% (p=0.000 n=49+43) BM_eigen_exp_float/512 1.11µs ± 1% 1.06µs ± 0% -4.42% (p=0.000 n=42+46) BM_eigen_exp_float/4k 8.80µs ± 0% 8.40µs ± 0% -4.54% (p=0.000 n=42+42) BM_eigen_exp_float/32k 70.2µs ± 0% 67.6µs ± 3% -3.74% (p=0.000 n=46+59) BM_eigen_exp_float/256k 561µs ± 0% 537µs ± 1% -4.27% (p=0.000 n=45+45) BM_eigen_exp_float/1M 2.24ms ± 0% 2.15ms ± 1% -4.15% (p=0.000 n=39+43) AVX2: name old cpu/op new cpu/op delta BM_eigen_exp_float/1 1.70ns ± 6% 1.70ns ± 5% ~ (p=0.488 n=60+60) BM_eigen_exp_float/8 30.9ns ± 0% 30.9ns ± 0% ~ (p=0.352 n=49+50) BM_eigen_exp_float/64 84.1ns ± 4% 81.0ns ± 4% -3.71% (p=0.000 n=59+58) BM_eigen_exp_float/512 520ns ± 4% 489ns ± 3% -5.96% (p=0.000 n=57+58) BM_eigen_exp_float/4k 3.99µs ± 4% 3.77µs ± 4% -5.45% (p=0.000 n=48+46) BM_eigen_exp_float/32k 31.8µs ± 5% 29.9µs ± 5% -5.87% (p=0.000 n=50+53) BM_eigen_exp_float/256k 253µs ± 4% 239µs ± 4% -5.65% (p=0.000 n=50+53) BM_eigen_exp_float/1M 1.01ms ± 4% 0.95ms ± 4% -6.04% (p=0.000 n=60+56) AVX512: name old cpu/op new cpu/op delta BM_eigen_exp_float/1 2.64ns ± 1% 2.65ns ± 2% ~ (p=0.061 n=51+54) BM_eigen_exp_float/8 33.9ns ± 2% 33.9ns ± 2% ~ (p=0.546 n=49+46) BM_eigen_exp_float/64 88.5ns ± 3% 88.7ns ± 4% ~ (p=0.703 n=57+59) BM_eigen_exp_float/512 275ns ± 3% 274ns ± 3% -0.60% (p=0.009 n=52+54) BM_eigen_exp_float/4k 1.77µs ± 3% 1.76µs ± 3% -0.62% (p=0.006 n=59+59) BM_eigen_exp_float/32k 13.7µs ± 3% 13.7µs ± 4% ~ (p=0.153 n=58+60) BM_eigen_exp_float/256k 119µs ± 5% 118µs ± 4% ~ (p=0.453 
n=60+58) BM_eigen_exp_float/1M 475µs ± 6% 475µs ± 5% ~ (p=0.723 n=60+60) --- Eigen/src/Core/GenericPacketMath.h | 6 +++--- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 58a197f91..eab717feb 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -1293,13 +1293,13 @@ EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, const Pa /** \internal \returns -(a * b) + c (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) { - return padd(pnegate(pmul(a, b)), c); + return psub(c, pmul(a, b)); } -/** \internal \returns -(a * b) - c (coeff-wise) */ +/** \internal \returns -((a * b) + c) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) { - return psub(pnegate(pmul(a, b)), c); + return pnegate(pmadd(a, b, c)); } /** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. 
\a to must be 16 bytes aligned diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 78dbf207d..c973efdda 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -129,8 +129,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, con const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b - Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) - b = psub(psub(psub(e, b), b), b); // e - 3b + Packet out = pmul(pmul(a, c), pmul(c, c)); // a * 2^(3b) + b = pnmadd(pset1(3), b, e); // e - 3b c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) out = pmul(out, c); return out; -- GitLab